In [26]:
# importing the libraires
import re
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer

In [2]:
# postive sentiment

pos_rev = pd.read_csv('netflix/pos.txt' , sep='\n' ,encoding='latin-1', header = None)
pos_rev['mood'] = 1
# renaming the column
pos_rev.rename(columns = {0:'review'}, inplace = True)

In [3]:
# negative sentiment

neg_rev = pd.read_csv('netflix/negative.txt' , sep='\n' ,encoding='latin-1', header = None)
neg_rev['mood'] = 0
# renaming the column
neg_rev.rename(columns = {0:'review'}, inplace = True)

In [4]:
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [5]:
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


# pipeline

1. lower cases
2. tokenization
3. remove stopwords
4. remove punct
5. lemma / stem
6. bow/tfid
7. train test split
8. Naiave bayes , Svm
9. evaluate model
10. saving the model
11. testing it

In [16]:
lemma = WordNetLemmatizer()
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : x.lower())
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , "" , x))
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([lemma.lemmatize(word , 'v') for word in nltk.word_tokenize(x) if word not in stopwords.words('english')]))

In [17]:
lemma = WordNetLemmatizer()
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : x.lower())
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , "" , x))
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
neg_rev.loc[: , 'review'] = neg_rev.loc[: , 'review'].apply(lambda x : " ".join([lemma.lemmatize(word , 'v') for word in nltk.word_tokenize(x) if word not in stopwords.words('english')]))

In [18]:
pos_rev

Unnamed: 0,review,mood
0,rock destine 21st century 's new `` conan `` '...,1
1,gorgeously elaborate continuation `` lord ring...,1
2,effective too-tepid biopic,1
3,sometimes like go movies fun wasabi good place...,1
4,emerge something rare issue movie 's honest ke...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family 's joyous life act yiddi...,1
5328,stand shadow motown best kind documentary one ...,1
5329,'s nice see piscopo years chaykin headly price...,1


In [19]:
neg_rev

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,'s laddish juvenile teenage boys could possibl...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find move,0
5327,many definitions 'time waster movie must surel...,0
5328,stand crocodile hunter hurry badly cobble look...,0
5329,thing look like made-for-home-video quickie,0


In [20]:
# common dataset
com_rev = pd.concat([pos_rev , neg_rev],axis = 0).reset_index()
com_rev

Unnamed: 0,index,review,mood
0,0,rock destine 21st century 's new `` conan `` '...,1
1,1,gorgeously elaborate continuation `` lord ring...,1
2,2,effective too-tepid biopic,1
3,3,sometimes like go movies fun wasabi good place...,1
4,4,emerge something rare issue movie 's honest ke...,1
...,...,...,...
10657,5326,terrible movie people nevertheless find move,0
10658,5327,many definitions 'time waster movie must surel...,0
10659,5328,stand crocodile hunter hurry badly cobble look...,0
10660,5329,thing look like made-for-home-video quickie,0


In [21]:
# train test split
X_train , X_test , y_train , y_test = train_test_split(com_rev['review'].values , com_rev['mood'].values , test_size = 0.2, random_state = 101)

In [22]:
train_data = pd.DataFrame({'review':X_train , 'mood':y_train})
test_data = pd.DataFrame({'review':X_test , 'mood':y_test})

In [23]:
train_data

Unnamed: 0,review,mood
0,put washington honest work man john q archibal...,0
1,poignant familiar story young person suspend t...,1
2,timely director could ever dream quietly lyric...,1
3,film virtually choke self-consciousness,0
4,film take inside rhythms subject experience watch,1
...,...,...
8524,branagh forceful non-shakespeare screen perfor...,1
8525,movie friday fan critics damn already like sor...,0
8526,perhaps heaviest joyless movie ever make giant...,0
8527,film rival live fine little amuse-bouche keep ...,1


In [24]:
test_data

Unnamed: 0,review,mood
0,important movie reminder power film move us ma...,1
1,'ve never see hear anything quite like film re...,1
2,end leave unfulfilled performances enjoy memor...,1
3,surface 's lovers-on-the-run crime flick lot c...,1
4,walk remember shrewd enough activate girlish t...,0
...,...,...
2128,bullock good job work natural likability,1
2129,result memorable least interest,1
2130,apparently design reverie memory regret thing ...,0
2131,movie insecure capacity excite churn one two f...,0


In [28]:
vectorizer = TfidfVectorizer()
train_vector = vectorizer.fit_transform(train_data['review'])
test_vector = vectorizer.transform(test_data['review'])

In [30]:
train_vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
# so see the vocbulary

vectorizer.get_feature_names()

['00',
 '000',
 '007',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '10th',
 '11',
 '110',
 '112',
 '11th',
 '12',
 '127',
 '129',
 '12th',
 '13',
 '133',
 '13th',
 '14',
 '140',
 '146',
 '15',
 '15th',
 '16',
 '163',
 '168',
 '17',
 '170',
 '179',
 '18',
 '180',
 '1899',
 '18th',
 '19',
 '1915',
 '1930s',
 '1934',
 '1937',
 '1938',
 '1940s',
 '1949',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1954',
 '1955',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1967',
 '1970s',
 '1971',
 '1972',
 '1975',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1982',
 '1984',
 '1986',
 '1987',
 '1990',
 '1992',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '2000',
 '2002',
 '20th',
 '21',
 '21st',
 '22',
 '24',
 '2455',
 '25',
 '2525',
 '26',
 '270',
 '28k',
 '30',
 '300',
 '3000',
 '30s',
 '33',
 '37',
 '3d',
 '40',
 '400',
 '40s',
 '45',
 '451',
 '48',
 '4ever',
 '50',
 '500',
 '50s',
 '51',
 '51st',
 '52',
 '53',
 '5ths',
 '60',
 '60s',
 '65',
 '65th',
 '66',
 '70',
 '70s',

In [36]:
# using svm
from sklearn import svm
from sklearn.metrics import classification_report,accuracy_score

In [34]:
classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vector, train_data['mood'])

SVC(kernel='linear')

In [35]:
pred = classifier.predict(test_vector)

In [37]:
accuracy_score(pred , test_data['mood'])

0.7473042662916081

In [39]:
import joblib
joblib.dump(classifier , 'classifier_74.pkl')
joblib.dump(vectorizer , 'transform.pkl')

['transform.pkl']

In [47]:
# flask
model = joblib.load('classifier_74.pkl')
vector = joblib.load('transform.pkl')

review = input('please give me the review:- ')
open ('retraining.csv' , a):
    
tfidf = vector.transform([review]).toarray()
my_pred = model.predict(tfidf)
if my_pred == 1:
    print('postive review')
else:
    print('negative review')

please give me the review:- not a good movie
negative review


In [None]:
# task
regex to remove the digits
use spact stopword to remove the stopword
use naive bayes and try to compare the accuracy

# flask model

# twitter data - train the model

In [None]:
phase 1: done
text mining
training

phase 2
cosine similarity - project
informatrion retrival - poject


phase 3 - chatbot

1. undestanding
2. building
3. integrating it in telegram