In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the labelled training reviews from the Kaggle competition input folder.
train_csv_path = "/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv"
traindf = pd.read_csv(train_csv_path)

# Tour of data

Let's take a quick look at the training data.

In [None]:
# Dimensions of the training frame as a (rows, columns) tuple.
traindf.shape

In [None]:
# Preview the first rows to see the available columns and typical values.
traindf.head()

In [None]:
# Summary statistics for the training frame's columns.
traindf.describe()

In [None]:
# Class balance of the target. The bar labels come from value_counts() itself:
# it orders classes by frequency, so hard-coded labels would be attached to the
# wrong bars whenever NEGATIVE happens to be the majority class.
sentiment_counts = traindf["sentiment"].value_counts()
plt.bar(sentiment_counts.index.astype(str), sentiment_counts.values)

In [None]:
# Distribution of the isFrequentReviewer flag. Labels are derived from the
# value_counts() index (ordered by frequency) rather than hard-coded, so each
# bar is guaranteed to carry the label of the value it actually counts.
reviewer_counts = traindf["isFrequentReviewer"].value_counts()
plt.bar(reviewer_counts.index.astype(str), reviewer_counts.values)

In [None]:
# Count, number of distinct values, and most frequent value of the review text.
traindf["reviewText"].describe()

# Let's test a hypothesis
Null hypothesis (H0): `isFrequentReviewer` and `sentiment` are independent.

In [None]:
# Contingency table of observed counts: isFrequentReviewer (rows) vs sentiment (columns).
pd.crosstab(traindf['isFrequentReviewer'],traindf['sentiment'])

In [None]:
from scipy.stats import chi2_contingency

# Chi-squared test of independence on the reviewer-flag / sentiment contingency table.
reviewer_by_sentiment = pd.crosstab(traindf['isFrequentReviewer'], traindf['sentiment'])
chi2_contingency(reviewer_by_sentiment)

p < 0.05, so we reject the null hypothesis: there is a statistically significant association between `isFrequentReviewer` and `sentiment`.

In [None]:
# Frequency of each distinct review text, most common first.
traindf["reviewText"].value_counts()

Some reviews contain only the text "Parental Content Review", so the review text alone is not always enough to classify a review as positive or negative.

In [None]:
# Column dtypes and non-null counts for the training frame.
traindf.info()

None of the features is numerical, so we have to do some preprocessing.

In [None]:
# Number of missing values per column.
traindf.isnull().sum()

Luckily, there are no null values except in `reviewText` 😊

In [None]:
# Number of reviews per sentiment class.
traindf["sentiment"].value_counts()

In [None]:
# Number of distinct movie ids referenced by the training reviews (as a 1-tuple).
traindf["movieid"].unique().shape

In [None]:
# Load the per-movie metadata that accompanies the review data.
movies_csv_path = "/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv"
moviesdf = pd.read_csv(movies_csv_path)

In [None]:
# Preview the first rows of the movie metadata.
moviesdf.head()

In [None]:
# Dimensions of the movie metadata as a (rows, columns) tuple.
moviesdf.shape

In [None]:
# Column dtypes and non-null counts for the movie metadata.
moviesdf.info()

In [None]:
# Number of missing values per movie-metadata column.
moviesdf.isnull().sum()

In [None]:
# Pairwise correlation of the numeric movie features.
# Non-numeric columns (e.g. the movieid key) are dropped explicitly: pandas >= 2.0
# no longer discards them silently and raises when asked to correlate strings.
moviesdf.select_dtypes(include=["number", "bool"]).corr()

In [None]:
import seaborn as sns

# Heatmap of the correlations between numeric movie features.
# Restrict to numeric/bool columns first: DataFrame.corr() raises on string
# columns in pandas >= 2.0 instead of silently ignoring them.
movie_corr = moviesdf.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(movie_corr, annot=True)

In [None]:
# Box plot of movie runtimes to inspect their spread and outliers.
# NOTE(review): the meaning of the first positional argument differs between
# seaborn releases; consider passing it by keyword once the intended orientation is confirmed.
sns.boxplot(moviesdf["runtimeMinutes"])

From the plots above, only some of the movie features look usable.

In [None]:
# Box plot of audience scores to inspect their spread and outliers.
sns.boxplot(moviesdf["audienceScore"])

In [None]:
# Number of distinct movie ids present in the movie metadata (as a 1-tuple).
moviesdf['movieid'].unique().shape

16,812 ≪ 126,404 — `movies.csv` contains information about all the movies in `traindf`, and many more.


In [None]:
# Summary statistics for every movie-metadata column, numeric and non-numeric alike.
moviesdf.describe(include='all')

Let's convert the review text to numbers so that an ML model can use it for prediction (binary classification: positive | negative).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# TF-IDF over unigrams and bigrams of the review text.
tokenizer = TfidfVectorizer(ngram_range=(1, 2))
# tokenizer=TfidfVectorizer()

# Missing reviews become empty strings so the vectorizer can process every row.
# Assign the result back instead of `traindf[col].replace(..., inplace=True)`:
# that form mutates a temporary under pandas Copy-on-Write and would leave the
# NaNs in place, which the vectorizer cannot handle.
traindf["reviewText"] = traindf["reviewText"].fillna("")
val = tokenizer.fit_transform(traindf["reviewText"])

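Example: for the text "i love ml", the bigrams (2-grams) are "i love" and "love ml"; with `ngram_range=(1,2)` the single words are kept as features too.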

In [None]:
# One audience score per movie: the maximum over all rows for that movieid.
# The built-in "max" aggregation skips NaN and yields NaN for all-missing groups,
# matching the previous lambda while running vectorised.
mdf = moviesdf.groupby("movieid").agg({"audienceScore": "max"})
changed = mdf.reset_index()

# Impute movies without any score with the mean score. The result is assigned
# back because an in-place replace on a column selection does not reach the
# frame under pandas Copy-on-Write.
changed["audienceScore"] = changed["audienceScore"].fillna(changed["audienceScore"].mean())

In [None]:
# One runtime per movie: the mean over all rows for that movieid (NaN skipped).
mdf1 = moviesdf.groupby("movieid").agg({"runtimeMinutes": "mean"})
changed1 = mdf1.reset_index()

# Impute missing runtimes with the overall mean, assigning back so the change
# is not lost on a temporary under pandas Copy-on-Write.
changed1["runtimeMinutes"] = changed1["runtimeMinutes"].fillna(changed1["runtimeMinutes"].mean())

# Truncate to whole minutes (same result as applying int() to each value).
changed1["runtimeMinutes"] = changed1["runtimeMinutes"].astype(int)

There can be several `audienceScore` values for one movie, which is why we group by `movieid` and take the maximum.

Now let's join the data 😊

In [None]:
import scipy.sparse as sparse

# Attach the per-movie aggregates to every review (left join keeps all reviews).
og = traindf.merge(changed, how='left', on="movieid")
og = og.merge(changed1, how='left', on="movieid")

# The feature rows must stay aligned one-to-one with the TF-IDF rows in `val`.
assert len(og) == val.shape[0], "merge changed the number of review rows"

# Encode the reviewer flag as 0/1 (anything that is not False maps to 1).
og["isFrequentReviewer"] = og["isFrequentReviewer"].apply(lambda x: 0 if x==False else 1)

# Prepend the dense columns to the sparse TF-IDF matrix. Column vectors are
# built with DataFrame selection + to_numpy(): `series[:, None]` is no longer
# supported by pandas (removed in 2.0).
finalval = sparse.hstack((og[["audienceScore"]].to_numpy(), val))
finalval = sparse.hstack((og[["isFrequentReviewer"]].to_numpy(), finalval))
# finalval=sparse.hstack((og[["runtimeMinutes"]].to_numpy(),finalval))

Since TF-IDF produces a sparse matrix, the combined training data is kept sparse as well\.  **Disclaimer:** do not expand (densify) the sparse matrix unless you have loads of **RAM**.

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Scale every feature by its maximum absolute value (despite the variable name,
# this is a MaxAbsScaler, which keeps the matrix sparse).
# NOTE(review): the scaler is fitted on all rows before the train/test split below.
minmax = MaxAbsScaler()
minmax.fit(finalval)
finalval2 = minmax.transform(finalval)

Scale the features for faster processing. MaxAbsScaler was chosen because it can handle sparse data.\
**Let's start training the model.**

# Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    finalval2,
    traindf["sentiment"],
    test_size=0.25,
    random_state=64,
)

In [None]:
# Binary-encode the target: POSITIVE -> 1, everything else -> 0.
Y_train2 = (Y_train == "POSITIVE").astype(int)
Y_test2 = (Y_test == "POSITIVE").astype(int)

## Model 1: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the stochastic `saga` solver.
# `saga` shuffles the data, so a fixed random_state is set to make the reported
# scores and the final submission reproducible (the split and the SGD model in
# this notebook are already seeded).
lrmodel = LogisticRegression(
    C=10,
    max_iter=2500,
    solver='saga',
    tol=0.01,
    random_state=64,
)


Hyperparameter tuning for lrmodel

In [None]:
# Hyperparameter search kept for reference only (disabled because of its runtime).
# from sklearn.model_selection import GridSearchCV
# param_grid = {'C': [0.1, 1, 5, 10],
#               'penalty': ['l2', 'elasticnet'],
#               'solver': ['saga', 'liblinear', 'sag'],
#               'max_iter': [500, 800, 1000, 2500],
#               'tol': [1e-4, 1e-3, 1e-2]}
# grid_search = GridSearchCV(lrmodel, param_grid, cv=3)
# grid_search.fit(X_train, Y_train2)
# grid_search.best_estimator_

The grid search above took almost 2 hours to run, so it is left commented out.

In [None]:
# Fit the logistic regression on the training split with the 0/1 encoded labels.
lrmodel.fit(X_train,Y_train2)

In [None]:
# Mean accuracy of the logistic regression on the held-out split.
lrmodel.score(X_test,Y_test2)

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 weights both sentiment classes equally.
y_pred = lrmodel.predict(X_test)
f1_score(y_true=Y_test2, y_pred=y_pred, average="macro")

In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the logistic regression, using the predicted probability of class 1.
pred_prob1 = lrmodel.predict_proba(X_test)
fpr1, tpr1, thresh1 = roc_curve(Y_test2, pred_prob1[:,1])

plt.plot(fpr1, tpr1, linestyle='-.',color='orange', label='Logistic Regression')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
# The curve's label is only rendered when a legend is drawn.
plt.legend()
plt.show()



In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the logistic regression's class-1 probabilities.
auc_score1 = roc_auc_score(y_true=Y_test2, y_score=pred_prob1[:, 1])
print(auc_score1)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic regression.
lr_report = classification_report(Y_test2, y_pred)
print(lr_report)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the logistic regression on the held-out split.
ConfusionMatrixDisplay.from_predictions(y_true=Y_test2, y_pred=y_pred)

In [None]:
# Learned weight for each feature column of the fitted logistic regression.
lrmodel.coef_

In [None]:
# Learned bias term of the fitted logistic regression.
lrmodel.intercept_

We have to find some way to improve the model's detection of negative reviews.

## Model 2: XGBClassifier

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees; each tree sees a random 80% subset of the columns.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.6,
    colsample_bytree=0.8,
    max_depth=20,
)
xgmodel = XGBClassifier(**xgb_params)

**Hyperparameter tuning for XGBClassifier**

In [None]:
# Hyperparameter search kept for reference only (disabled).
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     'learning_rate': [0.3, 0.5, 0.7],
#     'max_depth': [10, 20, 30],
#     'n_estimators': [300, 500],
#     'colsample_bytree': [0.8, 1.0],
# }
# grid_search = GridSearchCV(xgmodel, param_grid, cv=3, scoring='f1')
# grid_search.fit(X_train, Y_train2)
# best_params = grid_search.best_params_

In [None]:
# Fit the XGBoost model, then report its mean accuracy on the held-out split.
xgmodel.fit(X_train,Y_train2)
xgmodel.score(X_test,Y_test2)
# Author's note: takes about 10 minutes to run.

In [None]:
# Importance score of each feature column according to the fitted XGBoost model.
xgmodel.feature_importances_

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the XGBoost predictions on the held-out split.
y_pred_xg = xgmodel.predict(X_test)
f1_score(y_true=Y_test2, y_pred=y_pred_xg, average="macro")

In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the XGBoost model, using the predicted probability of class 1.
pred_prob2 = xgmodel.predict_proba(X_test)
fpr2, tpr2, thresh2 = roc_curve(Y_test2, pred_prob2[:,1])

plt.plot(fpr2, tpr2, linestyle='-.',color='orange', label='XGB')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
# The curve's label is only rendered when a legend is drawn.
plt.legend()
plt.show()



In [None]:
# Area under the ROC curve for the XGBoost class-1 probabilities.
# (roc_auc_score is imported in the logistic-regression AUC cell.)
auc_score2 = roc_auc_score(y_true=Y_test2, y_score=pred_prob2[:, 1])
print(auc_score2)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the XGBoost model.
xg_report = classification_report(Y_test2, y_pred_xg)
print(xg_report)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the XGBoost model on the held-out split.
ConfusionMatrixDisplay.from_predictions(y_true=Y_test2, y_pred=y_pred_xg)

## Model 3: SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained by SGD with a squared-hinge loss and balanced class weights.
sgd_params = dict(
    loss="squared_hinge",
    random_state=10,
    warm_start=True,
    max_iter=7000,
    eta0=2,
    learning_rate="invscaling",
    shuffle=True,
    n_jobs=-1,
    tol=0.0009,
    class_weight='balanced',
)
sgmodel = SGDClassifier(**sgd_params)
sgmodel.fit(X_train, Y_train2)

# Mean accuracy on the held-out split.
sgmodel.score(X_test, Y_test2)

In [None]:
# Hyperparameter search kept for reference only (disabled).
# Needs `from sklearn.model_selection import GridSearchCV` when re-enabled.
# param_grid = {
#     'loss': ['hinge', 'log', 'perceptron'],
#     'penalty': ['l2','elasticnet'],
#     'alpha': [0.001, 0.01],
#     'learning_rate': ['optimal', 'invscaling'],
#     'max_iter': [3000,4000,5000],
#     'tol': [1e-3, 1e-4, 1e-5]
# }
# grid_search = GridSearchCV(
#     estimator=sgmodel,
#     param_grid=param_grid,
#     scoring='f1',
#     cv=3
# )
# grid_search.fit(X_train, Y_train2)
# best_params = grid_search.best_params_

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the SGD classifier on the held-out split.
y_pred_sg = sgmodel.predict(X_test)
f1_score(y_true=Y_test2, y_pred=y_pred_sg, average="macro")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the SGD classifier.
sg_report = classification_report(Y_test2, y_pred_sg)
print(sg_report)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the SGD classifier on the held-out split.
ConfusionMatrixDisplay.from_predictions(y_true=Y_test2, y_pred=y_pred_sg)

In [None]:
# Learned weight for each feature column of the fitted SGD classifier.
sgmodel.coef_

In [None]:
# Learned bias term of the fitted SGD classifier.
sgmodel.intercept_

In [None]:
# Disabled: SGDClassifier only offers predict_proba for probabilistic losses,
# and this model is trained with loss="squared_hinge".
# pred_prob3 = sgmodel.predict_proba(X_test)
# fpr3, tpr3, thresh3 = roc_curve(Y_test2, pred_prob3[:,1])
# plt.plot(fpr3, tpr3, linestyle='-.',color='orange', label='SGD')
# plt.title('ROC curve')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive rate')


# **Submission to kaggle**

In [None]:
import scipy.sparse as sparse

# Build the test features with exactly the transformers fitted on the training data.
testd = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv")

# Assign back rather than using an in-place replace on a column selection,
# which does not reach the frame under pandas Copy-on-Write.
testd["reviewText"] = testd["reviewText"].fillna("")
snew = tokenizer.transform(testd["reviewText"])
# testvalue2=coder2.transform(testd["movieid"])

testog = testd.merge(changed, how='left', on="movieid")
# Movies with no match in `changed` come out of the left join as NaN, which the
# classifier rejects; impute them with the same mean used for the training data.
testog["audienceScore"] = testog["audienceScore"].fillna(changed["audienceScore"].mean())

# NOTE(review): the test file's reviewer flag is read from `isTopCritic` and fills
# the column slot that `isFrequentReviewer` occupies in training — confirm they
# carry the same signal.
testog["isTopCritic"] = testog["isTopCritic"].apply(lambda x: 0 if x==False else 1)

# Same column order as in training: reviewer flag, audience score, TF-IDF terms.
# `series[:, None]` is no longer supported by pandas, so build the column
# vectors via DataFrame selection + to_numpy().
snew1 = sparse.hstack((testog[["audienceScore"]].to_numpy(), snew))
snew1 = sparse.hstack((testog[["isTopCritic"]].to_numpy(), snew1))
# snew1=sparse.hstack((testvalue2[:,None],snew1))

# Only transform here: re-fitting the scaler on the test set would scale the
# features differently from what the model was trained on.
snew2 = minmax.transform(snew1)
ypred = lrmodel.predict(snew2)

In [None]:
# Map the 0/1 predictions back to the competition's string labels.
yact = ["POSITIVE" if label == 1 else "NEGATIVE" for label in ypred]

In [None]:
# Assemble the submission: a running row id plus the predicted sentiment label.
res = pd.DataFrame(
    {
        "id": range(testd.shape[0]),
        "sentiment": yact,
    }
)
res.to_csv("submission.csv", index=False)

Done.
