# 1 Introduction to Natural Language Processing.

In [1]:
# pip install textblob

In [2]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
# import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
# from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
print(os.listdir("/Users/sreyashisaha/Desktop/Semester2/ML_LAB/Week10/"))
import warnings
warnings.filterwarnings('ignore')

['Question2.ipynb', 'Sheet10.pdf', 'Pegasos.ipynb', 'Question1.ipynb', 'creditcard.csv', 'iris.data', 'IMDB Dataset.csv', 'Q1try1.ipynb', 'smart_grid_stability_augmented.csv', 'Q3_Final.ipynb']


#### In this lab, you will be working with the IMBD movie review dataset to perform various natural language processing tasks. Using the provided dataset, you will need to:

##### Loading the dataset

In [4]:
#importing the training data
imdb_data=pd.read_csv('IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [5]:
#Summary of the dataset
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
#sentiment count
# Count the number of positive and negative classes in the dataset
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

### 1. Perform tokenization on the review text.

In [7]:
#Tokenization of text
tokenizer=ToktokTokenizer()
text = tokenizer.tokenize(imdb_data['review'])
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')
print("Some stop words are:\n")
stopword_list[0:10]

Some stop words are:



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### 2. Remove stop words from the tokenized text.

In [8]:
# #set stopwords to english
# stop=set(stopwords.words('english'))
# print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
    
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

### 3. Use regular expressions to clean the text, removing any HTML tags, emails, and other unnecessary information.

In [9]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply functions on review column
imdb_data['review']=imdb_data['review'].apply(denoise_text)

In [10]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(remove_special_characters)

In [10]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
imdb_data['review']=imdb_data['review'].apply(simple_stemmer)

In [12]:
#normalized train reviews
norm_train_reviews=imdb_data.review[:40000]
# norm_train_reviews[0]
# # convert dataframe to string
# norm_train_string=norm_train_reviews.to_string()
# Spelling correction using Textblob
# norm_train_spelling=TextBlob(norm_train_string)
# norm_train_spelling.correct()
# # Tokenization using Textblob
# norm_train_words=norm_train_spelling.words
# norm_train_words

In [13]:
#Normalized test reviews
norm_test_reviews=imdb_data.review[40000:]
# norm_test_reviews[45005]
##convert dataframe to string
# norm_test_string=norm_test_reviews.to_string()
#spelling correction using Textblob
# norm_test_spelling=TextBlob(norm_test_string)
# norm_test_spelling.correct()
#Tokenization using Textblob
# norm_test_words=norm_test_spelling.words
#norm_test_words

In [14]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names   

BOW_cv_train: (40000, 6600374)
BOW_cv_test: (10000, 6600374)


In [15]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6600374)
Tfidf_test: (10000, 6600374)


### 4. Convert the cleaned data into a TF-IDF and BOW representation from scratch.

In [None]:
import math

 BoW and TF-IDF are techniques that help us convert text sentences into numeric vectors.

# BoW model by hand

In [25]:
df = pd.read_csv('IMDB Dataset.csv')['review'][:1000]
df.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [51]:

import numpy
import re
def tokenize(sentences):
    words = []
    for sentence in sentences:
        w = word_extraction(sentence)
        words.extend(w)
        
    words = sorted(list(set(words)))
    return words

def word_extraction(sentence):
    ignore = ['a', "the", "is"]
    words = re.sub("[^\w]", " ",  sentence).split()
    cleaned_text = [w.lower() for w in words if w not in ignore]
    return cleaned_text    
    
def generate_bow(allsentences):    
    # vocab = tokenize(allsentences)
    # print("Word List for Document \n{0} \n".format(vocab));

    # for sentence in allsentences:
    words = word_extraction(allsentences)
        # print(words)
    bag_vector = numpy.zeros(len(words))
        # for w in words:
        #     for i,word in enumerate(vocab):
        #         if word == w: 
        #             bag_vector[i] += 1
    for w in words:
        for i,word in enumerate(words):
            if word == w: 
                bag_vector[i] += 1
                    
    print("\n{0}\n".format(numpy.array(bag_vector)))

In [52]:
# word_extraction(df[0])

In [53]:
df[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [54]:
generate_bow(df[0])


[1. 7. 2. 1. 1. 1. 4. 1. 2. 2. 1. 6. 2. 3. 3. 2. 1. 1. 2. 2. 4. 3. 1. 2.
 1. 5. 4. 6. 6. 2. 2. 1. 4. 2. 4. 1. 6. 3. 2. 1. 6. 1. 1. 7. 4. 1. 1. 3.
 2. 1. 2. 1. 1. 4. 3. 3. 3. 5. 1. 1. 3. 1. 3. 3. 1. 1. 1. 5. 1. 6. 1. 1.
 3. 4. 2. 1. 3. 1. 1. 7. 2. 6. 6. 6. 1. 6. 4. 4. 1. 1. 6. 1. 1. 1. 1. 1.
 6. 1. 1. 3. 1. 2. 1. 1. 1. 7. 3. 2. 1. 1. 1. 1. 1. 6. 1. 1. 3. 1. 3. 2.
 3. 1. 1. 2. 1. 6. 1. 1. 1. 1. 1. 1. 1. 1. 6. 2. 3. 1. 1. 1. 1. 1. 6. 1.
 1. 2. 1. 1. 2. 6. 6. 6. 1. 2. 1. 1. 7. 3. 2. 6. 1. 4. 6. 1. 2. 2. 1. 1.
 3. 1. 3. 1. 1. 1. 5. 1. 1. 3. 1. 3. 1. 6. 1. 3. 1. 1. 2. 2. 2. 6. 1. 1.
 2. 4. 4. 3. 1. 6. 3. 1. 6. 1. 3. 2. 6. 3. 1. 5. 6. 2. 4. 6. 1. 2. 6. 1.
 1. 5. 6. 6. 1. 1. 6. 2. 1. 7. 1. 4. 3. 2. 4. 2. 1. 1. 1. 2. 3. 2. 1. 1.
 5. 1. 2. 2. 3. 1. 3. 1. 6. 2. 2. 5. 6. 1. 1. 1. 1. 2. 1. 1. 1. 3. 1. 2.
 6. 1. 1. 7. 1. 1. 3. 3. 1. 2. 6. 3. 1. 1. 1. 5. 2. 1. 1. 1. 1. 3. 1. 2.
 3. 1. 5. 1. 1. 1.]



# TF-IDF model by hand

Since it is a very large dataset and it takes a lot of computation time I have taken a small subset from the dataset to convert the data into BOW representation and TF-IDF.

In [None]:
sentences = []
word_set = []
# Finding all the unique words from the 
for sent in imdb_data.review[49000:]:
    words = [word.lower() for word in word_tokenize(sent) if word.isalpha()]
    sentences.append(words)
    for word in words:
        if word not in word_set:
            word_set.append(word)

In [59]:
total_documents = len(sentences)
total_documents

1000

In [60]:
# Creating an index for each word in our vocab.
index_dict = {} #Dictionary to store index for each word
i = 0
for word in word_set:
    index_dict[word] = i
    i += 1

In [61]:
#Create a count dictionary
 
def count_dict(sentences):
    word_count = {}
    for word in word_set:
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    return word_count
 
word_count = count_dict(sentences)

### Term Frequency (TF)
Let’s first understand Term Frequent (TF). It is a measure of how frequently a term, t, appears in a document, d:<br>

Term Frequency (tf) formula = (n is the number of times the term “t” appears in the document “d”)/(total no of terms in teh document)

In [62]:
#Term Frequency
def termfreq(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])
    return occurance/N

### Inverse Document Frequency (IDF)
IDF is a measure of how important a term is. We need the IDF value because computing just the TF alone is not sufficient to understand the importance of words:<br>
IDF(‘word’) =  log(number of documents/number of documents containing the word ‘word’)

In [63]:
#Inverse Document Frequency
 
def inverse_doc_freq(word):
    try:
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1
    return np.log(total_documents/word_occurance)

Vectorize the words

In [64]:
def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set),))
    for word in sentence:
        tf = termfreq(sentence,word)
        idf = inverse_doc_freq(word)
         
        value = tf*idf
        tf_idf_vec[index_dict[word]] = value 
    return tf_idf_vec

In [66]:
#TF-IDF Encoded text corpus
vectors = []
for sent in sentences:
    vec = tf_idf(sent)
    vectors.append(vec)
 
print(vectors)

[array([0.05053391, 0.02888858, 0.02537992, ..., 0.        , 0.        ,
       0.        ]), array([0.05233869, 0.        , 0.05257269, ..., 0.        , 0.        ,
       0.        ]), array([0., 0., 0., ..., 0., 0., 0.]), array([0.        , 0.        , 0.01549511, ..., 0.        , 0.        ,
       0.        ]), array([0.03011267, 0.        , 0.0302473 , ..., 0.        , 0.        ,
       0.        ]), array([0.        , 0.01137243, 0.        , ..., 0.        , 0.        ,
       0.        ]), array([0.       , 0.0119114, 0.       , ..., 0.       , 0.       ,
       0.       ]), array([0., 0., 0., ..., 0., 0., 0.]), array([0.        , 0.        , 0.03532884, ..., 0.        , 0.        ,
       0.        ]), array([0.        , 0.        , 0.00596771, ..., 0.        , 0.        ,
       0.        ]), array([0.        , 0.        , 0.00306674, ..., 0.        , 0.        ,
       0.        ]), array([0.        , 0.        , 0.01064122, ..., 0.        , 0.        ,
       0.        ]),