# 1) Naive Bayes.

### Importing Packages

In [30]:
import re                                                #Importing Regular Expression
import string                                            #Importing String
import numpy as np                                       #Importing Numpy
import pandas as pd                                      #Importing Pandas
import random                                            #Importing Random
from sklearn.datasets import fetch_20newsgroups          #Importing News Dataset
from sklearn.svm import SVC                              #Importing SVM
from sklearn.model_selection import GridSearchCV         #Importing Grid Search
from sklearn.metrics import accuracy_score               #Importing Accuracy Score Metric
from nltk.corpus import stopwords                        #Importing Stopwords
from nltk.tokenize import word_tokenize                  #Importing NLTK Word Tokenizer
import itertools                                         #Importing Iterator
import warnings                                          #Importing Warnings
warnings.filterwarnings('ignore')

#### Preprocessing Text Data

#### Reading the Dataset and taking Subset with two categories named as sci.med and comp.graphics

In [31]:
#importing the training data
imdb_data=pd.read_csv('IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


##### Taking a subset of the data as it takes a lot of computation time

In [32]:
imdb_data = imdb_data[:2000]

#### Preprocessing textual data to remove punctuation, stop-words

##### Function to Preprocess the data by removing Stopwords, punctuations and tokenizing the data

In [33]:
def preprocess_data(data):
    #Extracting the English stopwords and converting it into a set
    english_stop_words = set(stopwords.words('english'))
    #Making the data into the lower case string and then tokenizing the data into word list
    data = word_tokenize(data.lower())
    #Removing stopwords and punctuations from the word list
    data = [word for word in data if word not in english_stop_words and word.isalpha()]
    #Returning the final processed data list
    return data

##### Extracting Movie Review and its Target label "Sentiment"

In [34]:
movie_review = imdb_data['review']
sentiment = imdb_data['sentiment']

##### Applying Preprocessing step on all the News items

In [35]:
#Initializing an empty list to store processed movie reviews
processed_movie_reviews = []
#Iterating for each movie review
for review in movie_review:
    #Applying preprocessing on current movie review
    processed_movie_reviews.append(preprocess_data(review))

#### Implementing a bag-of-words feature representation for each text sample

##### Function to create a Word Frequency Dictionary from the Provided Documents

In [36]:
def word_freq_dictionary(data, freq_dict):
    #Iterating for all words in the document
    for word in data:
        #If word is already present in the dictionary then add 1
        if word in freq_dict:
            freq_dict[word] += 1
        #If word is not present, then create a new key and assign 1 as it's value
        else:
            freq_dict[word] = 1
    #Returning the created word frequency dictionary
    return freq_dict

##### Function to create a Binary vector for a document based on Bag of Word Representation

In [37]:
def bag_of_words(data, freq_dict):
    #Initializing a vector with zeros having the length equal to total unique words in the corpus
    vector = np.zeros(shape=(len(freq_dict.keys()),))
    #Iterating over all words in the document
    for word in data:
        #Placing 1 in the vector based on index assigned to that word in the dictionary
        if word in freq_dict.keys():
            vector[list(freq_dict.keys()).index(word)] = 1
    #Returning the document vector
    return vector

##### Converting each document in the dataset into a Bag of Word Representation

In [38]:
#Total Features to consider for Processing
features = 1000

In [39]:
#Creating a dictionary to store all unique words and there respective counts in the entire corpus
corpus_word_freq_dictionary = {}
#Iterating for each document in the dataset
for doc in processed_movie_reviews:
    #Updating the dictionary for each document
    corpus_word_freq_dictionary = word_freq_dictionary(doc, corpus_word_freq_dictionary)
#Sorting the dictionary in Descending Order
corpus_word_freq_dictionary = dict(sorted(corpus_word_freq_dictionary.items(), key=lambda item: item[1], reverse=True))
#Extracting only required words based on feature value
corpus_word_freq_dictionary = dict(itertools.islice(corpus_word_freq_dictionary.items(), features))

In [40]:
#Initializing a empty list to store the Bag of Word representation for each document
movie_review_bog = []
#Iterating for each document in the dataset
for document in processed_movie_reviews:
    #Creating a bag of word representation vector and appending it into the final list
    movie_review_bog.append(bag_of_words(document, corpus_word_freq_dictionary))

##### Displaying the created Bag of Word Representation in the form of Dataframe

In [41]:
movie_review_bog_dataframe = pd.DataFrame(movie_review_bog, columns=corpus_word_freq_dictionary.keys())
movie_review_bog_dataframe.head()

Unnamed: 0,br,movie,film,one,like,good,would,even,see,really,...,incredibly,convincing,terms,recommended,event,genius,sadly,red,writers,cover
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Implementing a TF-IDF feature representation for each text sample

##### Function to calculate the Term Frequency (TF) for a word in a document

In [42]:
def term_frequency(document, word):
    #tf = (total frequency of word in a document) / (total words in a document)
    return document[word]/sum(document.values())

##### Function to calculate the Inverse Document Frequency (IDF) for a word in the entire Corpus

In [43]:
def inverse_document_frequency(total_doc_freq_dictionary, word, total_documents):
    #idf = log(total documents / total frequency of word in all documents)
    return np.log(total_documents/total_doc_freq_dictionary[word] + 1)

##### Function to calculate the TF-IDF of a all the words individually in the entire corpus

In [44]:
def tf_idf(document, total_doc_freq_dictionary, total_documents):
    #Initializing a vector of zeros with a lenght of total unique words in the entire corpus
    vector = np.zeros(shape=(len(total_doc_freq_dictionary.keys()),))
    #Iterating for each word in the document
    for word in document.keys():
        #Checking if word exist in our feature set
        if word in total_doc_freq_dictionary.keys():
            # tf-idf = tf * idf
            tf_idf = term_frequency(document, word) * inverse_document_frequency(total_doc_freq_dictionary, word, total_documents)
            #Inserting the calculated tf-idf for that word in the vector
            vector[list(total_doc_freq_dictionary.keys()).index(word)] = tf_idf 
    #Returning the final vector containing tf-idf values
    return vector

##### Converting each document in the dataset into a TF-IDF Representation

In [45]:
#Initializing a vector to store tf-idf vectors for each document
movie_review_tfidf_vectors = []
#Iterating for each documents
for document in processed_movie_reviews:
    #Creating a word frequency dictionary for current document
    current_document_dict = word_freq_dictionary(document, {})
    #Calculating and Appending the tf-idf vector in the final vector
    movie_review_tfidf_vectors.append(tf_idf(current_document_dict, corpus_word_freq_dictionary, len(processed_movie_reviews)))

##### Displaying the created TF-IDF Representation in the form of Dataframe

In [46]:
moview_reviews_tfidf_dataframe = pd.DataFrame(movie_review_tfidf_vectors, columns=corpus_word_freq_dictionary.keys())
moview_reviews_tfidf_dataframe.head()

Unnamed: 0,br,movie,film,one,like,good,would,even,see,really,...,incredibly,convincing,terms,recommended,event,genius,sadly,red,writers,cover
0,0.007883,0.0,0.0,0.004173,0.0,0.0,0.012555,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.015399,0.0,0.0,0.008151,0.0,0.0,0.0,0.0,0.013259,0.01365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.01051,0.0,0.0,0.008345,0.0,0.0,0.0,0.012976,0.013575,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.019766,0.020536,0.015418,0.0,0.012624,0.0,0.0,0.0,0.017019,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.013904,0.003611,0.008134,0.033118,0.0,0.016335,0.0,0.0,0.008979,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Splitting the dataset randomly into train/validation/test splits according to ratios 80%:10%:10%

In [47]:
def split_dataset(moview_review, sentiment, train_ratio, validation_ratio):
    #Combining the Feature columns and the target column into a single list
    combined = list(zip(moview_review, sentiment))
    #Randomly shuffle the rows in the list
    random.shuffle(combined)
    #Calculating the training rows and validation rows
    train_rows = int(len(combined) * train_ratio)
    validation_rows = int(len(combined) * validation_ratio)
    #Extracting X matrix and Y matrix from the combined list
    X , y = list(zip(*combined))
    X, y = list(X), list(y)
    #Splitting X and y matrices into Training, Validation and Test set
    X_train, X_val, X_test = X[:train_rows], X[train_rows:train_rows+validation_rows], X[train_rows+validation_rows:]
    y_train, y_val, y_test = y[:train_rows], y[train_rows:train_rows+validation_rows], y[train_rows+validation_rows:]
    #Returning all the subsets
    return X_train, X_val, X_test, y_train, y_val, y_test

### Exercise 1: Implementing Naive Bayes Classifier for Text Data

#### Class to Represent the Naive Bayes Algorithm

In [48]:
class Naive_bayes:
    #Contructor function
    def __init__(self,feature_representation, dataset, target):
        #Checking if the feature representation value is valid otherwise Raising Exception
        if feature_representation not in ['bog', 'tfidf']:
            raise Exception('Invalid value provided for Feature Representation')
        #Feature Representation type - Bag of Words (bog) or TF-IDF (tfidf)
        self.feature_representation = feature_representation
        #Feature Columns of the Dataset
        self.dataset = np.array(dataset)
        #Target/Label Column of the Dataset
        self.target = np.array(target)
        #Unique Target/Label values
        self.unique_target = list(set(target))
        #Current Accuarcy of the Model
        self.accuracy = 0
        #Combined Dataset containing both, the Feature Columns and the Target Column
        self.combined = np.concatenate((self.dataset, self.target.reshape(-1,1)), axis = 1)
    
    #Function to find Predictions on the dataset provided and calculate the Accuracy
    def predict(self):
        #Iterating over all the different documents in the dataset
        for index, document in enumerate(self.combined):
            #Initializing an empty list to store the predicted probability for each target value
            target_probability = []
            #Iterating over all unique target values for calculating there probabilities
            for target in self.unique_target:
                #Based on Feature representation type, calculating the probabiltiy
                if self.feature_representation == 'bog':
                    target_probability.append(self.calculate_probability_bog(document, target))
                elif self.feature_representation == 'tfidf':
                    target_probability.append(self.calculate_probability_tfidf(document, target))
            #Normalizing each probability so that it sums to 1
            target_probability = list(map(lambda x : x / (sum(target_probability) + 1),target_probability))
            #Extracting the class with the maximum Probability
            predicted_class = self.unique_target[target_probability.index(max(target_probability))]
            #Checking if the predicted class is equal to the actual class
            if predicted_class == self.target[index]:
                self.accuracy += 1
        #Calculating its final accuracy
        self.accuracy /= len(self.combined)


    #A private function to calculate the Probability for a document represented using Bag of Words
    def calculate_probability_bog(self, document, target):
        #Calculating the probability for a class itself
        prob_class = len(self.target[np.where(self.target == target)]) / len(self.target)
        #Initializing the prior probability for each Word in the document
        prior_word_probability = 1
        #Iterating over each word in the document
        for i in range(len(document) - 1):
            #Calculating the probability only if the word exist in the document
            if document[i] == 1:
                #Calculating the Prior probability of the word given the class
                # P(w1 | target) = (Number of times the word occurs in all the document given the class) / (Number of time word occurs in all documents)
                numerator = len(self.combined[np.where((self.combined[:,i] == 1) & (self.combined[:,-1] == target))])
                denominator = len(self.combined[np.where(self.combined[:,-1] == target)])
                #Multiplying the current word prior probability with other words
                prior_word_probability *= (numerator / denominator)
        #Returning the final probability -> P(class) * P(W | class)
        return prior_word_probability * prob_class
    
    #A private function to calculate the Probability for a document represented using TF-IDF
    def calculate_probability_tfidf(self, document, target):
        #Calculating the probability for a class itself
        # P(target) = (Number of times that class appears in the dataset) / (total documents)
        prob_class = len(self.target[np.where(self.target == target)]) / len(self.target)
        #Initializing the prior probability for each Word in the document
        prior_word_probability = 1
        #Iterating over each word in the document
        for i in range(len(document) - 1):
            #Only calculating the probability if the word exist in the document
            if document[i] == 1:
                #Calculating the Prior probability of the word given the class
                # P(w1 | target) = (Number of times the word occurs in all the document given the class) / (Number of time word occurs in all documents)
                numerator = sum(self.combined[np.where((self.combined[:,i] == 1) & (self.combined[:,-1] == target))])
                denominator = sum(self.combined[np.where(self.combined[:,-1] == target)])
                #Multiplying the current word prior probability with other words
                prior_word_probability *= (numerator / denominator)
        #Returning the final probability -> P(class) * P(W | class)
        return prior_word_probability * prob_class
    
    #Function to display the Accuracy of the Model
    def score(self):
        feature_representation = 'Bag of Words' if self.feature_representation == 'bog' else 'TF-IDF'
        return 'The Accuracy for Naive Bayes using {} Representation is {:.2f}%'.format(feature_representation, self.accuracy * 100)    

#### Using Bag of Word Representation

##### Splitting the Dataset with Bag of Word Representation into Train, Validation and Test sets

In [49]:
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(movie_review_bog, sentiment, 0.8, 0.1)

##### Creating and Fitting the Naive Bayes model on the training, Validation and Test Sets
##### Calculating and Displaying the Training, Validation and Test Accuracies

In [50]:
nb_model_train = Naive_bayes('bog', X_train, y_train)
nb_model_train.predict()
print('Training Accuracy: \n{}'.format(nb_model_train.score()))

nb_model_validation = Naive_bayes('bog', X_val, y_val)
nb_model_validation.predict()
print('\nValidation Accuracy: \n{}'.format(nb_model_validation.score()))

nb_model_test = Naive_bayes('bog', X_test, y_test)
nb_model_test.predict()
print('\nTest Accuracy: \n{}'.format(nb_model_test.score()))

Training Accuracy: 
The Accuracy for Naive Bayes using Bag of Words Representation is 50.25%

Validation Accuracy: 
The Accuracy for Naive Bayes using Bag of Words Representation is 51.00%

Test Accuracy: 
The Accuracy for Naive Bayes using Bag of Words Representation is 50.50%


#### Using TF-IDF Representation

##### Splitting the Dataset with TF-IDF Representation into Train, Validation and Test sets

In [51]:
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(movie_review_tfidf_vectors, sentiment, 0.8, 0.1)

##### Creating and Fitting the Naive Bayes model on the training, Validation and Test Sets

##### Calculating and Displaying the Training, Validation and Test Accuracies

In [52]:
nb_model_train = Naive_bayes('tfidf', X_train, y_train)
nb_model_train.predict()
print('Training Accuracy: \n{}'.format(nb_model_train.score()))

nb_model_validation = Naive_bayes('tfidf', X_val, y_val)
nb_model_validation.predict()
print('\nValidation Accuracy: \n{}'.format(nb_model_validation.score()))

nb_model_test = Naive_bayes('tfidf', X_test, y_test)
nb_model_test.predict()
print('\nTest Accuracy: \n{}'.format(nb_model_test.score()))

Training Accuracy: 
The Accuracy for Naive Bayes using TF-IDF Representation is 50.19%

Validation Accuracy: 
The Accuracy for Naive Bayes using TF-IDF Representation is 52.00%

Test Accuracy: 
The Accuracy for Naive Bayes using TF-IDF Representation is 56.00%


#### The Accuracy of TF-IDF is low because we have only taken 1000 features.If we increase the number of features, the accuracy can improve subsequently.

### Implementing SVM Classifier via Scikit-Learn

#### Defining Hyperparameter Grid for Grid Search

In [53]:
hyperparameter_grid = {'C' : [0.01,0.02,0.03],'kernel': ['linear', 'rbf'],'gamma': ['scale','auto']}

#### Using Bag of Word Representation

##### Splitting the Dataset with Bag of Word Representation into Train, Validation and Test sets

In [54]:
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(movie_review_bog, sentiment, 0.8, 0.1)

##### Creating and Fitting the SVM model on the training set using Grid Search and different Hyperparameter combination

In [55]:
#Initializing a SVM model with the random seed
random_seed = 2023
svm = SVC(random_state=random_seed)
#Creating a Grid Seach object with the SVM model and K-fold Cross validation
model = GridSearchCV(svm, hyperparameter_grid, n_jobs=-1, cv=5, scoring='accuracy', return_train_score=True)
#Fitting the training dataset on SVM with different Hyperparameters
model.fit(X_train, y_train)

In [56]:
print('Best Hyperparameter combination found for Bag of Word Representation after applying Grid Search: \n{}'.format(model.best_params_))

Best Hyperparameter combination found for Bag of Word Representation after applying Grid Search: 
{'C': 0.02, 'gamma': 'scale', 'kernel': 'linear'}


##### Calculating and Displaying the Validation and Test Accuracies

In [57]:
print('Validation Accuracy on best Hyperparameters: {:.2f}'.format(accuracy_score(y_val, model.predict(X_val)) * 100))
print('Test Accuracy on best Hyperparameters: {:.2f}'.format(accuracy_score(y_test, model.predict(X_test)) * 100))

Validation Accuracy on best Hyperparameters: 82.00
Test Accuracy on best Hyperparameters: 84.00


#### Using TF-IDF Representation

##### Splitting the Dataset with TF-IDF Representation into Train, Validation and Test sets

In [58]:
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(movie_review_tfidf_vectors, sentiment, 0.8, 0.1)

##### Creating and Fitting the SVM model on the training set using Grid Search and different Hyperparameter combination

In [59]:
#Initializing a SVM model with the random seed
svm = SVC(random_state=random_seed)
#Creating a Grid Seach object with the SVM model and K-fold Cross validation
model1 = GridSearchCV(svm, hyperparameter_grid, n_jobs=-1, cv=5, return_train_score=True)
#Fitting the training dataset on SVM with different Hyperparameters
model1.fit(X_train, y_train)

In [60]:
print('Best Hyperparameter combination found for TF-IDF Representation after applying Grid Search: \n{}'.format(model1.best_params_))

Best Hyperparameter combination found for TF-IDF Representation after applying Grid Search: 
{'C': 0.01, 'gamma': 'scale', 'kernel': 'rbf'}


##### Calculating and Displaying the Validation and Test Accuracies

In [61]:
print('Validation Accuracy on best Hyperparameters: {:.2f}'.format(accuracy_score(y_val, model1.predict(X_val)) * 100))
print('Test Accuracy on best Hyperparameters: {:.2f}'.format(accuracy_score(y_test, model1.predict(X_test)) * 100))

Validation Accuracy on best Hyperparameters: 47.50
Test Accuracy on best Hyperparameters: 49.50
