In [1]:
import numpy as np
import pandas as pd
import re
import nltk

In [2]:
data = pd.read_csv(r"C:\Users\Souvik Bhattacharyya\Desktop\Mentorship\Data\sentiment_analysis.csv")
data

Unnamed: 0,class,text
0,1,films adapted from comic books have had plent...
1,1,every now and then a movie comes along from a...
2,1,you ve got mail works alot better than it des...
3,1,jaws is a rare film that grabs your atte...
4,1,moviemaking is a lot like being the general m...
...,...,...
1995,0,if anything stigmata should be taken as...
1996,0,john boorman s zardoz is a goofy cinemati...
1997,0,the kids in the hall are an acquired taste ...
1998,0,there was a time when john carpenter was a gr...


In [3]:
data['class'].value_counts()

0    1000
1    1000
Name: class, dtype: int64

In [4]:
corpus = data["text"]

In [5]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case/ white space/ special char removal
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    
    # tokenization
    tokens = wpt.tokenize(doc)
    
    # filtretion
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = " ".join(filtered_tokens)
    return(doc)

normalize_corpus = np.vectorize(normalize_document)

In [6]:
norm_corpus = normalize_corpus(corpus)

In [7]:
# bag of word method
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df = 0., max_df = 1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
vocab = cv.get_feature_names()
cv_matrix.shape

(2000, 39132)

In [8]:
np.unique(cv_matrix)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 36, 37,
       40, 41, 58], dtype=int64)

In [9]:
data_matrix = np.hstack((cv_matrix, data['class'].values.reshape(2000, 1)))
vocab.append("label")
filtered_data = pd.DataFrame(data_matrix, columns=vocab)
filtered_data.head()

Unnamed: 0,aa,aaa,aaaaaaaaah,aaaaaaaahhhh,aaaaaah,aaaahhhs,aahs,aaliyah,aalyah,aamir,...,zulu,zundel,zurg,zus,zweibel,zwick,zwigoff,zycie,zzzzzzz,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Splitting the dataset into test and train data
from sklearn.model_selection import train_test_split
x = cv_matrix
y = data["class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [11]:
N = len(vocab) -1
count_matrix = np.ones((2, N))
y_count_0 = 0
y_count_1 = 0
for i in range(N):
    try:
        if y_train[i] == 0:
            count_matrix[0] += cv_matrix[i]
            y_count_0 += 1
        else:
            count_matrix[1] += cv_matrix[i]
            y_count_1 += 1
    except:
        pass 

In [12]:
prob_matrix = np.zeros((2, N))
prob_matrix[0] = count_matrix[0]/(y_count_0 + N)
prob_matrix[1] = count_matrix[1]/(y_count_1 + N)

#prob_matrix[:, 345:350]

In [13]:
test_size = y_test.shape[0]
y_actual = y_test.reset_index(drop=True)
y_pred = np.zeros(test_size)
ratio = 1         # P(Y = 0|X = x)/# P(Y = 1|X = x)

for n in range(test_size):
    for index in range(len(x_test[n])):                                     
        if x_test[n, index]>0:                                  # if the token has appearred > 0 times
            ratio = prob_matrix[0, index]/prob_matrix[1, index]
        else:
            pass
    # print(ratio)
    if ratio>1:
        y_pred[n] = 0                             #If P(Y = 0|X = x) > P(Y = 1|X = x), predicted class will be Y = 1
    else:
        y_pred[n] = 1
        
count = 0
for n in range(test_size):
    if y_actual[n] == y_pred[n]:
        count += 1
accuracy = count/test_size
print("Number of correctly labeled points out of a total %d points : %d" % (test_size, count))
print(f"Accuracy: {accuracy}")

Number of correctly labeled points out of a total 400 points : 221
Accuracy: 0.5525


In [14]:
# using library
from sklearn.naive_bayes import MultinomialNB 
mnb = MultinomialNB()
y_pred = mnb.fit(x_train, y_train).predict(x_test)
print("Number of mislabeled points out of a total %d points : %d" % (x_test.shape[0], (y_test != y_pred).sum()))
accuracy = 1 - (y_test != y_pred).sum()/x_test.shape[0]
print("Accuracy: %f" % accuracy)

Number of mislabeled points out of a total 400 points : 77
Accuracy: 0.807500


In [16]:
# Using Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(x_train,y_train)
y_pred_logistic = logreg.predict(x_test)

In [17]:
test_size = y_test.shape[0]
y_actual = y_test.reset_index(drop=True)
count = 0

for n in range(test_size):
    if y_actual[n] == y_pred_logistic[n]:
        count += 1
accuracy = count/test_size
print("Number of correctly labeled points out of a total %d points : %d" % (test_size, count))
print(f"Accuracy: {accuracy}")

Number of correctly labeled points out of a total 400 points : 328
Accuracy: 0.82
