In [347]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/somyadahiaya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [348]:
df =pd.read_csv("reviews_100.csv")
# df =pd.read_csv("IMDB Dataset.csv")

In [349]:
print(df['sentiment'].value_counts())

negative    50
positive    50
Name: sentiment, dtype: int64


In [350]:
stop_words = set(stopwords.words('english'))

def cleanup(text):
    text =re.sub(r'<[^>]+>', '', text)
    text= [word.lower() for word in text.split() if word.lower() not in stop_words]
    text=' '.join(text)
    return text

print("The movie was <br>BLOCKBUSTER!!!! <br>")
print(cleanup("The movie was BLOCKBUSTER!!!!"))

The movie was <br>BLOCKBUSTER!!!! <br>
movie blockbuster!!!!


In [351]:
df['cleaned']=df['review'].apply(cleanup)

In [352]:
vectorizer = TfidfVectorizer(max_features=300,ngram_range=(1,2))

In [353]:
vectorised= vectorizer.fit_transform(df['cleaned'])

In [354]:
print(vectorised)

  (0, 243)	0.15274579000749217
  (0, 256)	0.13177124016929537
  (0, 158)	0.10528435385239325
  (0, 214)	0.14310570721999583
  (0, 171)	0.060932263635392774
  (0, 257)	0.08430980401419645
  (0, 245)	0.13177124016929537
  (0, 178)	0.2105687077047865
  (0, 102)	0.13522918606987727
  (0, 254)	0.11316087500251182
  (0, 133)	0.13898818727199122
  (0, 46)	0.13522918606987727
  (0, 100)	0.12856968232651733
  (0, 139)	0.25713936465303466
  (0, 143)	0.11537680999784197
  (0, 190)	0.14310570721999583
  (0, 40)	0.10528435385239325
  (0, 126)	0.16517401828736128
  (0, 280)	0.09862485010903331
  (0, 89)	0.13898818727199122
  (0, 222)	0.15274579000749217
  (0, 25)	0.23542516426452476
  (0, 72)	0.10182640795181135
  (0, 282)	0.3461010788749597
  (0, 31)	0.23075361999568395
  :	:
  (98, 143)	0.08589061585891852
  (98, 25)	0.08762944801362293
  (98, 72)	0.07580321288000716
  (98, 141)	0.10066943325837567
  (98, 87)	0.14458260483197002
  (99, 263)	0.3512088091878791
  (99, 35)	0.648465467024084
  (99, 16

In [355]:
X=vectorised.toarray()

In [356]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
y=df['label']
print(X.shape)


(100, 300)


In [357]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, test_size=0.2, random_state=42)

In [358]:
model= LogisticRegression(class_weight='balanced')
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [359]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_pred))

print(classification_report(y_test,y_pred))

0.7
              precision    recall  f1-score   support

           0       0.64      0.90      0.75        10
           1       0.83      0.50      0.62        10

    accuracy                           0.70        20
   macro avg       0.74      0.70      0.69        20
weighted avg       0.74      0.70      0.69        20



In [360]:
def text_preprocess_vectorize(texts, vectorizer):
    cleaned=[]
    for text in texts:
        cleaned.append(cleanup(text))
    vectorised=vectorizer.transform(cleaned)
    return vectorised

In [361]:
sample=text_preprocess_vectorize(df['review'],vectorizer)
results=model.predict(sample.toarray())

df['testing']=results

print(df[['label','testing']])

    label  testing
0       0        0
1       0        0
2       0        0
3       1        1
4       1        1
..    ...      ...
95      0        0
96      0        0
97      1        1
98      0        0
99      0        1

[100 rows x 2 columns]


In [362]:
review_texts = ["An absolutely stunning movie with a powerful message!","Absolutely amazing work, good movie.", "Worst movie ever, not recommended", "good movie , thriller and intersting to watch with friends." ]

sample=text_preprocess_vectorize(review_texts,vectorizer)



results=model.predict(sample.toarray())

for i in  range(0,len(review_texts)):
    print(review_texts[i],"---->",results[i])

An absolutely stunning movie with a powerful message! ----> 1
Absolutely amazing work, good movie. ----> 1
Worst movie ever, not recommended ----> 0
good movie , thriller and intersting to watch with friends. ----> 1


When we use 300 features and dataset size of 100 , the accuracy is 0.7, while if we increase the dataset size to the actual data, the accuracy becomes 0.81. I have used ngram in my tfidf vectoriser to also take the account of word combinations such as "not good" or "very good".