In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import Required Libraries**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report

# **Load the dataset**

In [None]:
# Read the four competition CSV files from the Kaggle input directory.
# movie_data is joined onto the review frames on 'movieid' in a later cell.
movie_data = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv")
# Training reviews; the 'sentiment' column of this frame is used as the target later on.
train_data = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv")
# Reviews to predict for the submission.
test_data = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv")
# Loaded for reference only; not used by any other cell visible in this notebook.
sample_data = pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/sample.csv")

# **Explore the Dataset**

In [None]:
# Display pandas' summary statistics for train_data as the cell output.
train_data.describe()

In [None]:
# Display the (rows, columns) shape of the raw test set.
test_data.shape

# **Drop duplicates and merge data**

In [None]:
# Keep a single row per movie (the first occurrence) so that joining on 'movieid'
# cannot multiply review rows. Rebinding instead of inplace=True avoids mutating
# the loaded frame behind the reader's back.
movie_data = movie_data.drop_duplicates(subset=["movieid"], keep="first")

# Left-join the movie metadata onto every review. validate="many_to_one" makes pandas
# raise a MergeError if movie_data still contains duplicate 'movieid' keys, instead of
# silently duplicating reviews.
# NOTE: this cell rebinds train_data/test_data to the merged frames; run it once per
# kernel session (re-running merges the movie columns a second time).
train_data = train_data.merge(movie_data, on=['movieid'], how='left', validate='many_to_one')
test_data = test_data.merge(movie_data, on=['movieid'], how='left', validate='many_to_one')

# **Filling Missing Values in the Required Columns**

**1. Count nan values in train and test data**

In [None]:
# Per-column count of missing values in the merged training frame (shown as cell output).
train_data.isna().sum()
# Uncomment to inspect the test frame instead:
#test_data.isna().sum()

**2. Filling missing nan values**

In [None]:
# Replacement value for missing entries, keyed by column name:
# text / categorical columns become empty strings, numeric columns become 0.
FILL_VALUES = {
    'reviewText': "",
    'rating': "",
    'ratingContents': "",
    'genre': "",
    'audienceScore': 0,
    'runtimeMinutes': 0,
}

# Apply the same mapping to both frames so that train and test are preprocessed
# identically (previously 'genre' was filled in train_data only, leaving its NaNs
# in test_data). fillna with a dict only touches the listed columns.
#train data
train_data = train_data.fillna(value=FILL_VALUES)

#test data
test_data = test_data.fillna(value=FILL_VALUES)


**3. Again checking nan values in train and test data**

In [None]:
# Uncomment to re-check the training frame instead:
#train_data.isna().sum()
# Per-column count of values still missing in the test frame after filling (cell output).
test_data.isna().sum()

# **Visualisation And Comparison Of Data**

**Using Cross Tab To Compare Different Columns With Sentiments**

In [None]:
# Contingency table: one row per sentiment value, one column per distinct runtimeMinutes
# value, each cell holding the number of training reviews with that combination.
# runtimeMinutes had its NaNs replaced by 0 in the fill step, so unknown runtimes
# are counted under 0.
CrosstabResult = pd.crosstab(train_data['sentiment'], train_data['runtimeMinutes'])

# The full table is very wide; display only column positions 100 through 169.
CrosstabResult.iloc[:, 100:170]

**Using Scatter Plot To Compare Different Columns And Look For Sentiments**

In [None]:
# Box office value against sentiment; sentiment is cast to str for the y axis and
# also drives the point colour. legend=None suppresses the hue legend.
sns.scatterplot(
    x=train_data['boxOffice'],
    y=train_data['sentiment'].astype(str),
    hue=train_data['sentiment'],
    palette='viridis',
    legend=None,
)
#plt.figure(figsize=(8,6))
#sns.pairplot(train_data,hue="sentiment")
#plt.show()
#sns.jointplot(x=train_data['sentiment'], y=train_data['rating'])

In [None]:
# Scatter plot of box office against genre, coloured by sentiment, on a 16x9 figure.
plt.figure(figsize=(16, 9))

# Marker styling forwarded to the scatter call: white dotted outline, 2pt wide.
kwargs = dict(edgecolor="w", linewidth=2, linestyle=':')

sns.scatterplot(
    data=train_data,
    x="boxOffice",
    y="genre",
    hue="sentiment",
    palette="ocean_r",
    sizes=(200, 500),
    **kwargs,
)

# **Selecting Required Features For Training And Testing Model**

In [None]:
# Model inputs: one categorical column (title), two numeric columns and the raw review text.
# The column ORDER matters: the column transformer addresses the first three by position.
X = train_data.loc[:, ['title', 'audienceScore', 'runtimeMinutes', 'reviewText']]
# Target labels.
y = train_data.loc[:, "sentiment"]
# Reduce the test frame to the same feature columns, in the same order.
test_data = test_data.loc[:, ['title', 'audienceScore', 'runtimeMinutes', 'reviewText']]

#Train Test Split
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Column Transformers For Different Models**

**Column transformer for logistic regression with onehotencoder, standard scaler and tfidf**

In [None]:
#Columntransformer for logistic regression
# Columns are addressed against the feature frame X
# (['title', 'audienceScore', 'runtimeMinutes', 'reviewText']):
#   - position 0 (title)                            -> one-hot encoded; categories unseen
#                                                      during fit are ignored at transform
#   - positions 1, 2 (audienceScore, runtimeMinutes) -> standardised
#   - "reviewText"                                   -> TF-IDF over unigrams and bigrams,
#                                                      English stop words removed, capped
#                                                      at 100000 features
# "reviewText" is given as a scalar (not a list) so the vectorizer receives a 1-D
# sequence of documents, while the list selectors hand 2-D data to the other transformers.
ct = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OneHotEncoder(handle_unknown="ignore"),
            [0],
        ),
        (
            "stdscaler",
            StandardScaler(),
            [1, 2],
        ),
        (
            "tfidf",
            TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                lowercase=True,
                max_features=100000,
            ),
            "reviewText",
        ),
    ],
    remainder='passthrough',
)

In [None]:
#Columntransformer for sgdclassifier
#ct = ColumnTransformer(transformers=[("ohe",OneHotEncoder(handle_unknown="ignore"),[0]),("stdscaler",StandardScaler(),[1,2]),("tfidf",TfidfVectorizer(stop_words='english',  ngram_range=(1, 3), lowercase=True, max_features = 100000),"reviewText")],remainder='passthrough')

In [None]:
#Columntransformer for multinomialnb
#ct = ColumnTransformer(transformers=[("ohe",OneHotEncoder(handle_unknown="ignore"),[0]),("scaler",MinMaxScaler(),[1,2]),("tfidf",TfidfVectorizer(stop_words='english', ngram_range=(1, 3), lowercase=True, max_features = 100000),"reviewText")])

# **Top Three Models**

In [None]:
#top three models for analysis
# Only one of these is plugged into the pipeline at a time (see the pipeline cell).

#Model 1
logrmodel = LogisticRegression(
    C=2,
    solver='liblinear',
    random_state=42,
)

#Model 2
sgdclfmodel = SGDClassifier(
    loss='modified_huber',
    alpha=0.0001,
    penalty='l2',
    random_state=42,
)

#Model 3
mnbmodel = MultinomialNB(
    alpha=1,
    force_alpha=True,
    fit_prior=False,
)

**Using Pipeline**

In [None]:
#select model and update pipeline
# Two stages: feature preprocessing (ct) followed by the chosen estimator.
# Swap logrmodel for sgdclfmodel / mnbmodel here to try a different model.
pipeline = Pipeline(
    [
        ('column_transformer', ct),
        ('model', logrmodel),
    ]
)

**GridSearchCV for sgdclassifier**

In [None]:
#GridSearchCV for sgdclassifier
#param_grid= {"model__loss":["hinge","modified_huber"], "model__alpha": [0.00001, 0.0001, 0.001],  "model__penalty" : ["l2"], "column_transformer__tfidf__ngram_range" : [(1,2), (1,3)]}
#grid_search = GridSearchCV(pipeline, param_grid , cv = 5)
#grid_search.fit(X_train, y_train)

**GridSearchCV for logistic regression**

In [None]:
#GridSearchCV for logistic regression
#param_grid= {'model__C' : [1, 2, 3, 4 ,5, 6, 10],  "model__penalty" : ["l2"], "column_transformer__tfidf__ngram_range" : [(1,2), (1,3)]}
#grid_search = GridSearchCV(pipeline, param_grid , cv = 3)
#grid_search.fit(X_train, y_train)

**GridSearchCV for multinomialNB**

In [None]:
#GridSearchCV for multinomialNB
#param_grid= {'model__alpha' : [1, 2, 3], "model__force_alpha" : [True], "model__fit_prior" : [True, False], "column_transformer__tfidf__ngram_range" : [(1,1), (1,2), (1,3), (2,2)]}
#grid_search = GridSearchCV(pipeline, param_grid , cv = 4)
#grid_search.fit(X_train, y_train)

**Grid Search Best Parameters And Scores**

In [None]:
#select best from gridsearch
#grid_search.best_params_
#grid_search.best_score_

**Making Predictions**

In [None]:
# Fit on the FULL labelled set (X, y) rather than the X_train hold-out split, then
# predict labels for the competition test frame. y_pred therefore lines up with
# test_data, not with y_test.
y_pred = pipeline.fit(X, y).predict(test_data)

**Classification Report**

In [None]:
#classification report
#CR = classification_report(y_test, y_pred)
#print('Classification report \n' )
#print(CR)

**Confusion Matrix**

In [None]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#cm = confusion_matrix(y_test,y_pred)
#disp = ConfusionMatrixDisplay(confusion_matrix=cm)
#disp.plot()
#plt.show()

# **Submission**

In [None]:
# One 'sentiment' column holding the predictions; the default 0..n-1 index is
# named 'id' so that it is written out as the id column of the CSV.
sub = pd.DataFrame(data=y_pred, columns=['sentiment']).rename_axis('id')
sub.to_csv("submission.csv", encoding='utf-8')

# Read the file back so the written submission can be inspected.
output = pd.read_csv("submission.csv")