# Day 1

### Importing the data 

Link to data - https://www.kaggle.com/datasets/alfathterry/bbc-full-text-document-classification?resource=download

In [6]:
import pandas as pd 
import numpy as np 

In [7]:
# Load the BBC news articles from a CSV expected in the notebook's working directory
# (download link above). Later cells rely on its 'data' (article text) and 'labels' columns.
df = pd.read_csv('bbc_data.csv')

In [8]:
df

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment
...,...,...
2220,Warning over Windows Word files Writing a Mic...,tech
2221,Fast lifts rise into record books Two high-sp...,tech
2222,Nintendo adds media playing to DS Nintendo is...,tech
2223,Fast moving phone viruses appear Security fir...,tech


In [9]:
df['labels'].unique()

array(['entertainment', 'business', 'sport', 'politics', 'tech'],
      dtype=object)

In [10]:
df['labels'].nunique()

5

In [11]:
df['data'].nunique()

2126

In [12]:
# Preview the frame with exact duplicate (data, labels) rows dropped.
# NOTE: the result is only displayed; it is not assigned back, so `df` itself still
# contains any duplicated articles in every later cell (nunique() above reports
# 2126 distinct texts).
df[['data','labels']].drop_duplicates()

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment
...,...,...
2220,Warning over Windows Word files Writing a Mic...,tech
2221,Fast lifts rise into record books Two high-sp...,tech
2222,Nintendo adds media playing to DS Nintendo is...,tech
2223,Fast moving phone viruses appear Security fir...,tech


In [13]:
df.isna().sum()

data      0
labels    0
dtype: int64

In [14]:
df['labels'].value_counts()

labels
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [15]:
l = [1,2.,1,8,9]

In [16]:
dir(l)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [17]:
# list.remove(value) needs the value to delete; calling it with no argument raises
# TypeError. The error is caught and printed so the demonstration stays visible
# without aborting a "Restart & Run All".
try:
    l.remove()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: list.remove() takes exactly one argument (0 given)

In [None]:
dir(df)

# Day 2

## Prepping the data 

### Converting the text column to lower case

In [24]:
# Lower-case every article so that the lower-case stop-word list used later matches
# regardless of the original casing. This overwrites df['data'] in place, so the
# original casing is no longer available after this cell runs.
df['data'] = df['data'].str.lower()

In [25]:
df

Unnamed: 0,data,labels
0,musicians to tackle us red tape musicians grou...,entertainment
1,u2s desire to be number one u2 who have won th...,entertainment
2,rocker doherty in on stage fight rock singer p...,entertainment
3,snicket tops us box office chart the film adap...,entertainment
4,oceans twelve raids box office oceans twelve t...,entertainment
...,...,...
2220,warning over windows word files writing a micr...,tech
2221,fast lifts rise into record books two high spe...,tech
2222,nintendo adds media playing to ds nintendo is ...,tech
2223,fast moving phone viruses appear security firm...,tech


### Install and load spaCy for further text processing

In [20]:
! pip install spacy 



In [27]:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB 65.6 kB/s eta 0:03:15
     --------------------------------------- 0.0/12.8 MB 151.3 kB/s eta 0:01:25
     --------------------------------------- 0.0/12.8 MB 151.3 kB/s eta 0:01:25
     --------------------------------------- 0.0/12.8 MB 151.3 kB/s eta 0:01:25
     --------------------------------------- 0.0/12.8 MB 151.3 kB/s eta 0:01:25
     --------------------------------------- 0.0/12.8 MB 151.3 k

In [30]:
import spacy
# Load the small English pipeline downloaded above; `nlp` is used by get_embeddings.
# NOTE(review): en_core_web_sm reportedly ships without static word vectors, so
# doc.vector would be derived from the pipeline's internal tensors (96 dimensions in
# this notebook's output) rather than pretrained word embeddings. Confirm against the
# spaCy model docs and consider en_core_web_md/lg if real word vectors are intended.
nlp = spacy.load('en_core_web_sm')

### Remove punctuation

In [31]:
# Replace the punctuation handled by this step (comma, full stop, hyphen, double quote)
# with a space in a single pass. `regex` is spelled out because the default of
# Series.str.replace differs across pandas versions; inside the character class the
# '.' is always a literal full stop, so the result no longer depends on the install.
df['data'] = df['data'].str.replace(r'[,.\-"]', ' ', regex=True)
# Collapse every run of spaces to a single space. Replacing the literal two-space
# string once leaves doubles behind wherever three or more spaces were adjacent
# (e.g. after ", " followed by another replaced character).
df['data'] = df['data'].str.replace(r' {2,}', ' ', regex=True)

In [32]:
df

Unnamed: 0,data,labels
0,musicians to tackle us red tape musicians grou...,entertainment
1,u2s desire to be number one u2 who have won th...,entertainment
2,rocker doherty in on stage fight rock singer p...,entertainment
3,snicket tops us box office chart the film adap...,entertainment
4,oceans twelve raids box office oceans twelve t...,entertainment
...,...,...
2220,warning over windows word files writing a micr...,tech
2221,fast lifts rise into record books two high spe...,tech
2222,nintendo adds media playing to ds nintendo is ...,tech
2223,fast moving phone viruses appear security firm...,tech


### Remove stop words 

In [34]:
# Lower-case English stop words that stop_word_removal drops from each article.
# Besides whole words, the list holds contraction forms with apostrophes ("don't")
# and their fragments ('don', 't', 's', 'll', ...).
# NOTE(review): the entries look like NLTK's English stop-word list; confirm the
# source before extending it.
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 
              'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but',
               'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing',
               'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn',
               "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his',
               'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me',
               'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'now', 'o', 'of',
               'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan',
               "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than', 'that',
               "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through',
               'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what',
               'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y',
               'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']


In [35]:
def stop_word_removal(text, stopwords=None):
    """Return ``text`` with stop words removed.

    Args:
        text: String to filter. It is tokenised on any whitespace, so repeated
            spaces, tabs and newlines never produce empty tokens or hide a
            stop word that sits next to them.
        stopwords: Optional iterable of words to drop. Defaults to the
            notebook-level ``stop_words`` list.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
        Matching is exact and case-sensitive.
    """
    # A set gives constant-time membership tests instead of scanning the list per token.
    blocked = set(stop_words if stopwords is None else stopwords)
    kept = [token for token in text.split() if token not in blocked]
    return ' '.join(kept)

In [36]:
# Apply stop_word_removal to each article and store the result in a new 'clean_data'
# column; the 'data' column is left as it is.
df['clean_data'] = df['data'].apply(stop_word_removal)

In [37]:
df

Unnamed: 0,data,labels,clean_data
0,musicians to tackle us red tape musicians grou...,entertainment,musicians tackle us red tape musicians groups ...
1,u2s desire to be number one u2 who have won th...,entertainment,u2s desire number one u2 three prestigious gra...
2,rocker doherty in on stage fight rock singer p...,entertainment,rocker doherty stage fight rock singer pete do...
3,snicket tops us box office chart the film adap...,entertainment,snicket tops us box office chart film adaptati...
4,oceans twelve raids box office oceans twelve t...,entertainment,oceans twelve raids box office oceans twelve c...
...,...,...,...
2220,warning over windows word files writing a micr...,tech,warning windows word files writing microsoft w...
2221,fast lifts rise into record books two high spe...,tech,fast lifts rise record books two high speed li...
2222,nintendo adds media playing to ds nintendo is ...,tech,nintendo adds media playing ds nintendo releas...
2223,fast moving phone viruses appear security firm...,tech,fast moving phone viruses appear security firm...


### Get the vector embeddings

In [38]:
def get_embeddings(text):
    """Return the spaCy document vector for ``text``.

    The text is run through the notebook-level ``nlp`` pipeline and the resulting
    Doc's ``.vector`` attribute is returned unchanged.
    """
    return nlp(text).vector


In [39]:
# Run get_embeddings once per row and keep one vector per article in a new
# 'embeddings' column.
# NOTE(review): each call runs the spaCy pipeline separately; nlp.pipe(...) can batch
# the texts if this cell becomes slow. Confirm the vectors match before switching.
df["embeddings"] = df["clean_data"].apply(get_embeddings)

In [40]:
df

Unnamed: 0,data,labels,clean_data,embeddings
0,musicians to tackle us red tape musicians grou...,entertainment,musicians tackle us red tape musicians groups ...,"[0.22289962, -0.18602012, -0.26593497, 0.00739..."
1,u2s desire to be number one u2 who have won th...,entertainment,u2s desire number one u2 three prestigious gra...,"[0.32812732, -0.26808932, -0.053633623, -0.053..."
2,rocker doherty in on stage fight rock singer p...,entertainment,rocker doherty stage fight rock singer pete do...,"[0.33871886, -0.25064713, -0.12436289, -0.0758..."
3,snicket tops us box office chart the film adap...,entertainment,snicket tops us box office chart film adaptati...,"[0.22639275, -0.21389152, 0.043418024, 0.06659..."
4,oceans twelve raids box office oceans twelve t...,entertainment,oceans twelve raids box office oceans twelve c...,"[0.16507347, -0.22740956, 0.053380612, 0.08057..."
...,...,...,...,...
2220,warning over windows word files writing a micr...,tech,warning windows word files writing microsoft w...,"[0.38094488, -0.10959984, -0.15653251, -0.0970..."
2221,fast lifts rise into record books two high spe...,tech,fast lifts rise record books two high speed li...,"[0.29451376, -0.11099929, -0.039442364, -0.083..."
2222,nintendo adds media playing to ds nintendo is ...,tech,nintendo adds media playing ds nintendo releas...,"[0.27577457, -0.32092175, -0.05307459, -0.0075..."
2223,fast moving phone viruses appear security firm...,tech,fast moving phone viruses appear security firm...,"[0.32760388, -0.27404237, -0.03670262, -0.0733..."


In [41]:
# Stack the per-article vectors row-wise into one 2-D array and show its shape,
# i.e. (number of articles, vector length).
np.vstack(df["embeddings"].values).shape

(2225, 96)