In [78]:
# pandas for file handling and data frames
import pandas as pd
# for tokenization of text data
from nltk.tokenize import word_tokenize
# for removing stopwords from text
from nltk.corpus import stopwords
# convert word to stem or its first form
from nltk.stem import WordNetLemmatizer
# convert words to vectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# to divide the data into trainig and testing part
from sklearn.model_selection import train_test_split
# to implement machine learning we are using logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# to check the accuracy score
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
imdb = pd.read_csv('imdb_labelled.txt', sep='\t',header=None)

In [6]:
imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [7]:
imdb.columns = ['Review','Sentiment']

In [9]:
amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)

In [10]:
amazon.columns = ['Review','Sentiment']

In [11]:
yelp = pd.read_csv('yelp_labelled.txt', sep='\t', header=None)

In [12]:
yelp.columns = ['Review','Sentiment']

In [14]:
df = pd.DataFrame()

In [15]:
df = df.append(imdb)

In [16]:
df = df.append(amazon)

In [17]:
df = df.append(yelp)

In [18]:
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [19]:
df.shape

(2748, 2)

In [86]:
def textProcessing(data):
    tokens = []
    for i in range(len(data)):
        tokens.append(word_tokenize(data['Review'].iloc[i].lower()))
    engStopwords = stopwords.words('english')
    engStopwords.extend(['.','?',"'s","also",",","-","!"])
    
    wordList = []
    for tokenList in tokens:
        t = []
        for token in tokenList:
            if token not in engStopwords:
                t.append(token)
        wordList.append(t)
        
    wnet = WordNetLemmatizer()
    
    for i in range(len(wordList)):
        for j in range(len(wordList[i])):
            wordList[i][j] = wnet.lemmatize(wordList[i][j], pos='v')
    
    for i in range(len(wordList)):
        wordList[i] = ' '.join(wordList[i])
    
    return wordList
    

In [87]:
wordList = textProcessing(df)

In [88]:
wordList[0]

'slow-moving aimless movie distress drift young man'

In [89]:
# cv = CountVectorizer()
# vect = cv.fit_transform(wordList)

tfidf = TfidfVectorizer()
vect = tfidf.fit_transform(wordList)

In [90]:
y = df['Sentiment']
x_train, x_test, y_train, y_test = train_test_split(vect,y,test_size=0.25)

In [91]:
x_train.shape

(2061, 4371)

In [92]:
x_test.shape

(687, 4371)

In [93]:
reg = LogisticRegression()

In [94]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [95]:
y_pred = reg.predict(x_test)

In [96]:
accuracy_score(y_test, y_pred)

0.7976710334788938

In [56]:
confusion_matrix(y_test, y_pred)

array([[279,  71],
       [ 69, 268]], dtype=int64)

In [64]:
review = "Today I thought that I should watch this movie so I booked tickets for the movie. The show was at 12:00 PM. When it was the time for intervals I was regretting my decision.Salman does not know how to act and he was annoying. I suggest you not to watch this movie and waste your money. I think it's enough for now Bollywood is intolerable"

In [73]:
test_df = pd.DataFrame({"Review":[review]})

In [74]:
test_df

Unnamed: 0,Review
0,Today I thought that I should watch this movie...


In [75]:
wordList = textProcessing(test_df)

In [76]:
matrix = tfidf.transform(wordList)

In [77]:
reg.predict(matrix)

array([0], dtype=int64)

In [70]:
test_df

Unnamed: 0,Review
0,Today I thought that I should watch this movie...


In [79]:
nb = MultinomialNB()

In [80]:
nb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [81]:
y_pred_2 = nb.predict(x_test)

In [82]:
accuracy_score(y_test, y_pred_2)

0.7903930131004366

In [97]:
import pickle

In [99]:
file = open('model.pkl','wb')
pickle.dump(reg,file)

In [100]:
file.close()

In [101]:
file = open('tfidf.pkl','wb')
pickle.dump(tfidf,file)
file.close()