In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("Movie_Compiled.csv")
data.head()


Unnamed: 0,Rating,Date,Title,Review
0,,12 April 2012,An Unstoppable Force of Awesomeness! All Hail ...,I have got to tell anyone who will listen that...
1,,19 April 2012,They balanced everything masterfully. As Good ...,"If you're a fan of epic adventure movies, then..."
2,10.0,20 April 2012,The Avengers assembled flawlessly,I'm sorry to say The Avengers isn't a good mov...
3,9.0,16 April 2012,LA Review of 'The Avengers' (No spoilers),LA Review of 'The Avengers' (No spoilers)\r\n\...
4,10.0,2 July 2017,Bad-ass descent-amazing superhero MCU action f...,Marvel's The Avengers (2012) is an awesome des...


In [3]:
data.shape

(4502, 4)

In [4]:
from sklearn.decomposition import TruncatedSVD # it will not get the full matrix of left and right singular matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import cohen_kappa_score

In [6]:
data_1 = data[~data["Review"].isnull()]
data_1.shape

(4402, 4)

In [8]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4402 entries, 0 to 4501
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rating  4075 non-null   object
 1   Date    4402 non-null   object
 2   Title   4402 non-null   object
 3   Review  4402 non-null   object
dtypes: object(4)
memory usage: 172.0+ KB


In [10]:
data_1.Rating.value_counts()

10.0    1143
1.0      719
9.0      428
8.0      301
7.0      288
6.0      252
5.0      232
3.0      225
2.0      218
4.0      202
          67
Name: Rating, dtype: int64

In [14]:
import numpy as np
data_2 = data_1.replace(" ",np.nan)
data_2.head()

Unnamed: 0,Rating,Date,Title,Review
0,,12 April 2012,An Unstoppable Force of Awesomeness! All Hail ...,I have got to tell anyone who will listen that...
1,,19 April 2012,They balanced everything masterfully. As Good ...,"If you're a fan of epic adventure movies, then..."
2,10.0,20 April 2012,The Avengers assembled flawlessly,I'm sorry to say The Avengers isn't a good mov...
3,9.0,16 April 2012,LA Review of 'The Avengers' (No spoilers),LA Review of 'The Avengers' (No spoilers)\r\n\...
4,10.0,2 July 2017,Bad-ass descent-amazing superhero MCU action f...,Marvel's The Avengers (2012) is an awesome des...


In [15]:
data_2["Rating"] = data_2["Rating"].astype(float)
data_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4402 entries, 0 to 4501
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rating  4008 non-null   float64
 1   Date    4402 non-null   object 
 2   Title   4402 non-null   object 
 3   Review  4402 non-null   object 
dtypes: float64(1), object(3)
memory usage: 172.0+ KB


In [16]:
train_data = data_2[~data_2["Rating"].isnull()]
test_data = data_2[data_2["Rating"].isnull()]

In [17]:
(train_data.shape, test_data.shape)

((4008, 4), (394, 4))

In [19]:
x_train, x_val , y_train, y_val = train_test_split(train_data.iloc[:,1:],
                                                   train_data.iloc[:,0],
                                                   test_size = 0.4,
                                                   random_state = 123456)
                                                   

In [20]:
x_train.shape

(2404, 3)

In [21]:
from wordcloud import STOPWORDS

In [22]:
tfidf = TfidfVectorizer(stop_words=STOPWORDS, 
                        token_pattern="[a-z']+",
                        max_df=0.55,
                        min_df=10)

In [23]:
tfidf_mat_train = tfidf.fit_transform(x_train["Review"])
tfidf_mat_val = tfidf.transform(x_val["Review"])
tfidf_mat_test = tfidf.transform(test_data["Review"])

In [24]:
tfidf_mat_train.shape #no of columns are more than no of rows

(2404, 3657)

In [27]:
lr_model_1 = LogisticRegression() #assumption in high dimension, classes are linearly separable
lr_model_1.fit(X=tfidf_mat_train,y=y_train)

LogisticRegression()

In [28]:
pred_val_lr = lr_model_1.predict(tfidf_mat_val)
cohen_kappa_score(pred_val_lr,y_val) #because of sparsity, in higher dim it couldnot do the justice

0.2431569431210462

In [34]:
N=150
svd = TruncatedSVD(n_components=N) #150 is less than 3657
svd_mat_train = svd.fit_transform(tfidf_mat_train)
svd_mat_val = svd.transform(tfidf_mat_val)
svd_mat_test = svd.transform(tfidf_mat_test)

In [35]:
lr_model_2 = LogisticRegression()
lr_model_2.fit(X=svd_mat_train,y = y_train)
pred_val_lr2 = lr_model_2.predict(svd_mat_val)
cohen_kappa_score(pred_val_lr2,y_val)

0.2482873644019904

In [32]:
gbm = GradientBoostingClassifier(learning_rate=0.01,n_estimators=2000,max_depth =1)

In [36]:
gbm_1 = gbm.fit(X=svd_mat_train,y = y_train)
pred_val_gbm1 = gbm_1.predict(svd_mat_val)
cohen_kappa_score(pred_val_gbm1,y_val)

0.24242849739213013

In [37]:
svd_new = TruncatedSVD(n_components=150)

In [38]:
word_vec = svd_new.fit_transform(tfidf_mat_train.T)

In [39]:
word_vec.shape

(3657, 150)

In [41]:
tfidf.vocabulary_["hulk"] #giving the index

1562

In [42]:
from sklearn.metrics import pairwise_distances

In [55]:
dist_hulk = pairwise_distances(X=word_vec,
                               Y = word_vec[1562,:].reshape(1,-1),
                               metric ="cosine")

In [48]:
dist_hulk.flatten().argsort()

array([1562, 3275, 1699, ..., 1121, 2211,  957])

In [50]:
id_to_word = { v:k for k,v in tfidf.vocabulary_.items()}

In [51]:
id_to_word[3274]

'thirty'

In [52]:
id_to_word[1699]

'iron'

In [53]:
id_to_word[1562]

'hulk'