In [1]:
import numpy as np
import nltk
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

### DECLARING LEMMATIZER

In [27]:
word_lemmatizer = WordNetLemmatizer()
stop_words = set(open('electronics/stopwords.txt'))

### READING POSITIVE AND NEGATIVE REVIEWS

In [28]:
positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

### DEFINING TOKENIZER WHICH CONVERTS STRINGS TO LIST OF WORDS WITH FILTERING OF SMALL WORDS DONE AND LEMMATIZATION ALSO DONE

In [29]:
def tokenizer(s):
    s = s.lower()     # convert the string to lower case
    tokens = nltk.tokenize.word_tokenize(s) # make tokens ['dogs', 'the', 'plural', 'for', 'dog']
    tokens = [t for t in tokens if len(t)>2] #remove words having length less than 2
    tokens = [word_lemmatizer.lemmatize(t) for t in tokens] # lemmatize the words means making different words of same meaning one word like dogs get converted to dog ['dog', 'the', 'plural', 'for', 'dog']
    tokens = [t for t in tokens if t not in stop_words] # remove stop words like is,and,this,that etc.
    return tokens

### TESTING TOKENIZER

In [30]:
tokenizer('dogs is the plural for dog')

['dog', 'the', 'plural', 'for', 'dog']

 ### CREATING WORD_2_INT DICTIONARY WHICH CONATINS ALL THE WORDS AND INDICES AGAINST THEM ACTING AS A UNIQUE KEY FOR THAT WORD

In [6]:
word_2_int = {}
ind = 0
positive_tokens = []
negative_tokens = []

for review in positive_reviews:
    tokens = tokenizer(review.text)
    positive_tokens.append(tokens)
    for t in tokens:
        if t not in word_2_int:
            word_2_int[t] = ind
            ind+=1
            
for review in negative_reviews:
    tokens = tokenizer(review.text)
    negative_tokens.append(tokens)
    for t in tokens:
        if t not in word_2_int:
            word_2_int[t] = ind
            ind+=1

### converting reviews to vectors where each index will depict the count of that word in that review and itslength will be equal to the length of the word_2_int dictionary

In [31]:
def tokens_2_vectors(token,label=None):
    X = np.zeros(len(word_2_int)+1)
    for t in token:
        index = word_2_int[t]
        X[index]+=1
    X = X/X.sum()
    X[-1] = label
    return X

### stacking reviews upon each other in final matrix to build final data(first all positive reviews and then negative reviews) 

In [32]:
total_reviews = len(positive_tokens) + len(negative_tokens)
final_matrix = np.zeros((total_reviews , len(word_2_int)+1))
row = 0

for t in positive_tokens:
    final_matrix[row,:] = tokens_2_vectors(t,1)
    row+=1
    
for t in negative_tokens:
    final_matrix[row,:] = tokens_2_vectors(t,0)
    row+=1

np.random.shuffle(final_matrix)

### splitting the data

In [33]:
X = final_matrix[:,:-1]
y = final_matrix[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [35]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
print('training accuracy is --> ',lr.score(X_train,y_train)*100)
print('test accuracy is --> ',lr.score(X_test,y_test)*100)

training accuracy is -->  77.07142857142857
test accuracy is -->  72.83333333333334




### just checking the weights of the words and their sentiment according to it

In [36]:
threshold = 0.5
for word,index in word_2_int.items():
    weight = lr.coef_[0][index]
    if weight>threshold or weight<-threshold:
        print(word,' : ',weight)

and  :  1.346808441780291
for  :  1.6150097083838146
that  :  -0.6434551112593336
are  :  0.6770144147621627
the  :  -0.8057102173953095
used  :  0.5693761835464781
they  :  -0.5699297475678357
good  :  1.2099850047922276
you  :  0.6617790394504701
n't  :  -1.0904400174148612
easy  :  0.5789052371772719
use  :  0.7688082832962273
quality  :  0.7517284661721622
best  :  0.5374601511182819
very  :  0.7268813035909097
with  :  1.000468815174388
out  :  -0.7724600813925767
price  :  1.2182803269712776
great  :  1.997321794142929
after  :  -0.9933560584163205
worked  :  -0.5098441748584197
not  :  -2.520758888992343
excellent  :  0.5477471910493635
back  :  -0.7754594309862741


### live predictor 

In [25]:
my_review = 'this product is very good !!'
vector = tokens_2_vectors(tokenizer(my_review))
vector = vector[:-1]
if rfc.predict([vector])[0]==1:
    print('positive review')
else:
    print('negative review')


positive review


### I TRIED USING RANDOM FOREST BUT IT WAS COMPLETELY OVERFITTING GIVING 100% TRAINING ACCURACY AND VERY LOW TEST ACCURACY ....

### I ALSO TRIED SVM BUT THAT ALSO PERFORMED REALLY BAD !!!