In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
imdb = pd.read_csv('imdb_labelled.txt',sep='\t',header=None)

In [3]:
imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
amazon = pd.read_csv('amazon_cells_labelled.txt',sep='\t',header=None)
yelp = pd.read_csv('yelp_labelled.txt',sep='\t',header=None)

In [5]:
imdb.shape

(748, 2)

In [6]:
df = pd.DataFrame()

In [7]:
df = df.append(imdb)
df = df.append(amazon)
df = df.append(yelp)

In [8]:
df.shape

(2748, 2)

In [9]:
df.columns = ["Review","Sentiment"]

In [10]:
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [11]:
def textProcessing(documents):
    tokens = []
    for i in range(len(documents)):
        tokens.append(word_tokenize(documents['Review'].iloc[i].lower()))
    
    eng_stopwords = stopwords.words("english")
    eng_stopwords.extend([',','.','!','@','#','?','-'])
    
    main_words = []
    for i in range(len(tokens)):
        words = []
        for token in tokens[i]:
            if token not in eng_stopwords:
                words.append(token)
        main_words.append(words)
        
    wnet = WordNetLemmatizer()
    
    for i in range(len(main_words)):
        for j in range(len(main_words[i])):
            main_words[i][j] = wnet.lemmatize(main_words[i][j],pos='v')
    
    for i in range(len(main_words)):
        main_words[i] = " ".join(main_words[i])
        
    return main_words

In [12]:
wordsList = textProcessing(df)

In [13]:
wordsList[2]

'attempt artiness black & white clever camera angle movie disappoint become even ridiculous act poor plot line almost non-existent'

In [14]:
tfidf = TfidfVectorizer()

In [15]:
vector = tfidf.fit_transform(wordsList).toarray()

In [16]:
vector[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
vector.shape

(2748, 4372)

In [18]:
x_train,x_test,y_train,y_test = train_test_split(vector,df['Sentiment'])

In [19]:
logistic = LogisticRegression()
logistic.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_pred = logistic.predict(x_test)

In [21]:
accuracy_score(y_test,y_pred)

0.8034934497816594

In [22]:
confusion_matrix(y_test,y_pred)

array([[278,  54],
       [ 81, 274]], dtype=int64)

In [23]:
new_review = "I don't get why they couldn't hire someone with dark skin to play a dark-skinned person's role. It completely takes you out of the movie when Latina's skin colour is inconsistent through out the movie. I have hundreds more complaints about this movie but it is just annoying me to think about it. Don't see this extremely lame film. The Bala character is such a vain person that it is extremely difficult to connect with him at any level. The man only cares about his looks and nothing else. In fact the man has no identity in his film except his bald head. Every character in this movie is one dimensional and defined only by their looks or something that will serve a completely useless purpose in the movie. Bala is his bald head, Latika is her dark skin and Pari is Tik Tok. They want to tell you that looks are not important and yet they underestimate humanity. We are not in the age anymore where people look at bald people and find it funny. Nobody sees a person's wig come off and immediately starts guffawing. Nobody does that. Don't waste your time with this film."

In [24]:
test_df = pd.DataFrame({"Review":[new_review]})

In [25]:
test_words = textProcessing(test_df)

In [26]:
test_vector = tfidf.transform(test_words).toarray()

In [27]:
logistic.predict(test_vector)

array([0], dtype=int64)

In [28]:
nb = MultinomialNB()
nb.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
y_pred = nb.predict(x_test)

In [30]:
accuracy_score(y_test,y_pred)

0.8034934497816594

In [31]:
import pickle as pkl

In [32]:
nb.coef_

array([[-8.82072075, -6.68856905, -8.82072075, ..., -8.82072075,
        -8.77766568, -8.82072075]])

In [34]:
import pickle as pkl

with open('nb.pkl','wb') as file:
    pkl.dump(nb, file)

with open('tfidf.pkl','wb') as file:
    pkl.dump(tfidf, file)