In [1]:
import numpy as np
import pandas as pd

In [2]:
imbd = pd.read_csv("./IMDB Dataset.csv/IMDB Dataset.csv")

In [3]:
imbd.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Text Preprocessing

In [4]:
# Problem 1

# Apply all the preprocessing techniques that you think are necessary

In [5]:
# html tags
# spell checker
# lower casing
# removing puntuations
# tokenize
# remove stop words
# stemming / lemmatize

In [6]:
sample = imbd["review"][1]

In [7]:
#!pip install -U textblob

In [8]:
import re
from textblob import TextBlob
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SREEMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:

class text_preprocessor:
    def __init__(self,text_data):
        self.text_data = text_data
        
    def remove_tags(self):
        pattern = re.compile("<.*?>")
        return pattern.sub(r"",self.text)
    
    def spell_checker(self):
        textblob = TextBlob(self.text)
        return str(textblob.correct())
    
    def lower_casing(self):
        return str.lower(self.text)
    
    def remove_punctuation(self):
        exclude = string.punctuation
        return self.text.translate(str.maketrans('','',exclude))
    
    def tokenize(self):
        self.text_lst = word_tokenize(self.text)
        
    def remove_stopwords(self):
        stop_words = stopwords.words("english")
        for word in self.text_lst:
            if word in stop_words:
                self.text_lst.remove(word)
    
    def stemming(self):
        new_text = []
        stemmer = PorterStemmer()
        for word in self.text_lst:
            stem_word = stemmer.stem(word)
            new_text.append(stem_word)
        self.text_lst = new_text
        
    def process(self):
        new_text_data = []
        start_time = time()
        for text in self.text_data:
            self.text = text
            self.text = self.lower_casing()
            self.text = self.remove_tags()
            #self.text = self.spell_checker()
            self.text = self.remove_punctuation()
            self.tokenize()
            self.remove_stopwords()
            self.stemming()
            result = " ".join(self.text_lst)
            new_text_data.append(result)
        end_time = time()
        print("Time Taken - ", end_time-start_time)
        return pd.Series(new_text_data,name=self.text_data.name)

In [11]:
from time import time

In [12]:
from sklearn.model_selection import train_test_split
imbd_rem,imbd_sample = train_test_split(imbd,test_size=0.1,random_state=42)

In [13]:
imbd_sample = imbd_sample.reset_index()
imbd_sample.drop("index",axis=1,inplace=True)

In [14]:
imbd_sample.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative


In [15]:
imbd_sample.shape

(5000, 2)

In [16]:
text_prep = text_preprocessor(imbd_sample["review"])

In [17]:
imbd_sample_pre = text_prep.process()

Time Taken -  22.761300563812256


In [18]:
imbd_sample["review"] = imbd_sample_pre

## Text Representation

In [19]:
# Problem 2

# Find out the number of words in the entire corpus and also the 
# total number of unique words(vocabulary) using just python

In [20]:
imbd_sample.head()

Unnamed: 0,review,sentiment
0,realli like summerslam due look arena curtain ...,positive
1,mani televis show appeal quit mani differ kind...,positive
2,film quickli get major chase scene ever increa...,negative
3,jane austen would definit approv thi onegwynet...,positive
4,expect somewhat high went see movi all thought...,negative


In [21]:
def corpus_word_count(text_col):
    total_word_count = 0
    for doc in text_col:
        text = word_tokenize(doc)
        total_word_count += len(text)
    return total_word_count

In [22]:
Total_word_count = corpus_word_count(imbd_sample["review"])

In [23]:
def vocabulary(text_col):
    vocabulary = dict()
    for doc in text_col:
        for word in word_tokenize(doc):
            try:
                vocabulary[word] = vocabulary[word] + 1
            except Exception as e:
                vocabulary[word] = 1
    return vocabulary

In [24]:
vocabulary = vocabulary(imbd_sample["review"])

In [25]:
vocabulary_num = len(list(vocabulary.keys()))
vocabulary_num

41709

In [26]:
# Problem 3

# Apply One Hot Encoding

### 1. Bag Of Words (BOW) - 

In [27]:
# Problem 4

# Apply bag words and find the vocabulary also find the times each word 
# has occured

In [28]:
X = imbd_sample["review"]
y = imbd_sample["sentiment"]

In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,
                                                 random_state=42)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
c_vector = CountVectorizer()

In [32]:
X_train_bow = c_vector.fit_transform(X_train).toarray()

In [34]:
X_train_bow.shape

(4000, 36288)

In [35]:
c_vector.vocabulary_

{'yet': 36004,
 'anoth': 1981,
 'gay': 13028,
 'film': 11713,
 'ruin': 27238,
 'asinin': 2474,
 'polit': 24681,
 'luigi': 19185,
 'final': 11878,
 'speech': 29887,
 'about': 824,
 'sent': 28332,
 'run': 27257,
 'of': 22626,
 'theatr': 31793,
 'it': 16719,
 'bumperstick': 4902,
 'epigram': 10551,
 'read': 26055,
 'comic': 6757,
 'book': 4251,
 'wa': 34583,
 'base': 3191,
 'for': 12256,
 'much': 21345,
 'entertain': 10477,
 'experi': 10992,
 'long': 18915,
 'west': 35126,
 'end': 10314,
 'charm': 5891,
 'recast': 26151,
 'margaret': 19665,
 'rutherford': 27295,
 'headmistress': 14632,
 'miss': 20733,
 'whitchurch': 35236,
 'thi': 31934,
 'financi': 11885,
 'success': 30836,
 'adapt': 1078,
 'made': 19337,
 '1950all': 252,
 'interior': 16463,
 'shot': 28792,
 'took': 32527,
 'place': 24423,
 'riversid': 26908,
 'studio': 30680,
 'hammersmith': 14297,
 'london': 18900,
 'exterior': 11063,
 'scene': 27714,
 'locat': 18850,
 'public': 25490,
 'girl': 13279,
 'school': 27812,
 'near': 21759,


### 2. N-Gram (Bag of N-grams) -

In [36]:
# Problem 5

# Apply bag of bi-gram and bag of tri-gram and write down your observation 
# about the dimensionality of the vocabulary

In [53]:
c_vector_bi_gram = CountVectorizer(ngram_range=(2,2))

In [38]:
X_train_bi_gram = c_vector_bi_gram.fit_transform(X_train).toarray()

In [None]:
X_train_bi_gram_sample = c_vector_bi_gram.fit_transform(X_train).toarray()

In [39]:
X_train_bi_gram.shape

(4000, 371483)

In [38]:
c_vector_bi_gram.vocabulary_

{'yet anoth': 369125,
 'anoth gay': 18972,
 'gay film': 127044,
 'film ruin': 115185,
 'ruin asinin': 269331,
 'asinin polit': 24615,
 'polit luigi': 245565,
 'luigi final': 192030,
 'final speech': 116394,
 'speech about': 296795,
 'about sent': 3637,
 'sent run': 280131,
 'run of': 269662,
 'of theatr': 225081,
 'theatr it': 321568,
 'it bumperstick': 167064,
 'bumperstick epigram': 46689,
 'epigram read': 99465,
 'read comic': 257212,
 'comic book': 63623,
 'book wa': 41772,
 'wa base': 349890,
 'base for': 30848,
 'for much': 120527,
 'much entertain': 212230,
 'entertain experi': 98799,
 'long run': 188760,
 'run west': 269742,
 'west end': 357291,
 'end charm': 96678,
 'charm film': 56083,
 'film recast': 115100,
 'recast margaret': 259530,
 'margaret rutherford': 196886,
 'rutherford headmistress': 269993,
 'headmistress miss': 143935,
 'miss whitchurch': 205317,
 'whitchurch thi': 359145,
 'thi financi': 325014,
 'financi success': 116489,
 'success adapt': 306155,
 'adapt made

In [40]:
c_vector_tri_gram = CountVectorizer(ngram_range=(3,3))

In [41]:
X_train_tri_gram = c_vector_tri_gram.fit_transform(X_train).toarray()

In [42]:
X_train_tri_gram.shape

(4000, 556459)

### 3. TF-IDF (Term Frequency - Inverse Document Frequency) -

In [43]:
# Problem 6

# Apply tf-idf and find out the idf scores of words, 
# also find out the vocabulary.

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
tf_idf = TfidfVectorizer()

In [50]:
X_train_tf_idf = tf_idf.fit_transform(X_train).toarray()

In [51]:
X_train_tf_idf.shape

(4000, 36288)

In [68]:
count = 0
for val in list(X_train_tf_idf[0]):
    count+=1
    if (val) == 0.0:
        continue
    else:
        
        print(val, " - ", count)

0.13258288792525524  -  825
0.12000762576606228  -  1982
0.32641339738723923  -  2475
0.16392024277122585  -  3192
0.15758624317620304  -  4252
0.3550239659197644  -  4903
0.17021595491358207  -  6758
0.1366396337234704  -  10478
0.3382878562039245  -  10552
0.15741389918745335  -  10993
0.06223428882203297  -  11714
0.135410471176066  -  11879
0.11200993248389757  -  12257
0.20597854338544347  -  13029
0.06190988169964504  -  16720
0.3382878562039245  -  19186
0.09438653166582243  -  21346
0.08374429346843329  -  22627
0.18534219887501563  -  24682
0.14184057975437256  -  26056
0.19977544627973418  -  27239
0.14195868090887456  -  27258
0.21975547373815646  -  28333
0.22299449366698773  -  29888
0.21822654615731227  -  31794
0.07841703124255264  -  34584
0.13844609325629187  -  36005
