# Importing Libraries and files

Importing all the necessary libraries

In [None]:
import os
import csv
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

Code to ignore warnings

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised while models
# are fitted or grid-searched later on — consider narrowing it; confirm intent.
warnings.filterwarnings('ignore')

Reading input files

In [None]:
# Collect the full path of every file found under the Kaggle input directory.
files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

# The four datasets are selected purely by their position in the walk order.
# NOTE(review): this assumes the directory listing yields movies, sample, train
# and test in exactly this order — confirm, or select the files by name.
movies, sample, train, test = (pd.read_csv(files[position]) for position in range(4))

Reading the head of the `movies` file

In [None]:
movies.head()

Getting a concise summary (column dtypes and non-null counts) of the `movies` file

In [None]:
movies.info()

Reading the head of the `train` file

In [None]:
train.head()

Getting a concise summary (column dtypes and non-null counts) of the `train` file

In [None]:
train.info()

Shape of `train` data

In [None]:
train.shape

# Data Preprocessing

Preprocess the `boxOffice` column

In [None]:
# Turn strings such as '$1.2M' or '$500K' into plain numbers.
# Step 1: strip the dollar sign.
box_office = movies['boxOffice'].str.replace('[$]', '', regex=True)

# Step 2: expand the magnitude suffixes, millions first and then thousands.
# Values without the suffix (including missing ones) are passed through untouched.
for suffix, factor in (('M', 1000000), ('K', 1000)):
    box_office = box_office.apply(
        lambda raw, s=suffix, f=factor: float(raw.replace(s, '')) * f if s in str(raw) else raw
    )

# Step 3: coerce whatever is left to a numeric dtype.
movies['boxOffice'] = pd.to_numeric(box_office)

Merge the `movies` and `train` datasets

In [None]:
# Attach the movie metadata to every review.
# `movies` is de-duplicated on `movieid` first (the same treatment the test set
# gets before its merge) so the left join stays many-to-one: a movie listed more
# than once would otherwise multiply its reviews, and the identical copies could
# land on both sides of the train/validation split.
movies_unique = movies.drop_duplicates(subset=['movieid'])
train = train.merge(movies_unique, on='movieid', how='left')

Printing the head of the newly merged dataset

In [None]:
train.head()

The `movieid` and `title` columns seem to contain similar text, so it won't hurt much if we drop the `title` column.

In [None]:
# `title` is dropped by name; every other column is kept.
train = train.drop(columns='title')

Checking for relation between `isFrequentReviewer` and `sentiment`

In [None]:
# Number of reviews per sentiment class, split by the frequent-reviewer flag.
sns.countplot(x='isFrequentReviewer', hue='sentiment', data=train)
plt.xlabel('Is Frequent Reviewer')
plt.ylabel('Count')  # bar height is a number of reviews; sentiment is the hue
plt.title('Count Plot of Sentiment by Is Frequent Reviewer')
plt.show()

Checking for relationship between `audienceScore` and `Sentiment`

In [None]:
# Overlay the audience-score distribution of each sentiment class.
for sentiment_value, legend_label in (
    ('POSITIVE', 'Positive Sentiment'),
    ('NEGATIVE', 'Negative Sentiment'),
):
    class_scores = train.loc[train['sentiment'] == sentiment_value, 'audienceScore']
    # Missing scores are dropped before the values are binned.
    plt.hist(class_scores.dropna(), bins=20, alpha=0.5, label=legend_label)

plt.xlabel('Audience Score')
plt.ylabel('Count')
plt.title('Histogram of Audience Score by Sentiment')
plt.legend()
plt.show()

Countplot to see whether `rating` affects `sentiment`

In [None]:
# Number of reviews per sentiment class for each value of `rating`.
sns.countplot(x='rating', hue='sentiment', data=train)
plt.xlabel('Rating')  # the x-axis shows `rating`, not the reviewer flag
plt.ylabel('Count')   # bar height is a number of reviews
plt.title('Count Plot of Sentiment by Rating')
plt.show()

Histograms to see how `runtimeMinutes` affect `sentiments`

In [None]:
# Row masks for the two sentiment classes.
is_positive = train['sentiment'] == 'POSITIVE'
is_negative = train['sentiment'] == 'NEGATIVE'

# Runtime values per class, with missing runtimes removed.
runtime_positive = train.loc[is_positive, 'runtimeMinutes'].dropna()
runtime_negative = train.loc[is_negative, 'runtimeMinutes'].dropna()

# Both histograms share the same binning and transparency so they can overlap.
hist_style = {'bins': 30, 'alpha': 0.5}
plt.hist(runtime_positive, label='Positive Sentiment', **hist_style)
plt.hist(runtime_negative, label='Negative Sentiment', **hist_style)
plt.xlabel('Runtime Minutes')
plt.ylabel('Count')
plt.title('Histogram of Runtime Minutes by Sentiment')
plt.legend()
plt.show()

Countplot for `sentiment` vs `genre`

In [None]:
# One row per (review, genre) pair: split the comma-separated genre string into
# a list, then explode the list into separate rows.
genre_lists = train['genre'].str.split(', ')
temp = train.assign(genre=genre_lists).explode('genre')

sns.countplot(x='genre', hue='sentiment', data=temp)
plt.xlabel('Genre')
plt.ylabel('Count')
plt.title('Count Plot of Sentiment by Genre')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Histogram for `boxOffice` vs `sentiment`

In [None]:
# Box-office values per sentiment class (missing values removed), keyed by the
# legend label each histogram is drawn with.
box_office_by_label = {
    'Positive Sentiment': train.loc[train['sentiment'] == 'POSITIVE', 'boxOffice'].dropna(),
    'Negative Sentiment': train.loc[train['sentiment'] == 'NEGATIVE', 'boxOffice'].dropna(),
}

for legend_label, values in box_office_by_label.items():
    plt.hist(values, bins=10, alpha=0.5, label=legend_label)

plt.xlabel('Box Office')
plt.ylabel('Count')
plt.title('Histogram of Box Office by Sentiment')
plt.legend()
plt.show()

Countplot for `sentiment` vs `soundType`

In [None]:
# One row per (review, sound type) pair, built the same way as for genres:
# split the comma-separated string and explode the resulting lists.
temp = (
    train
    .assign(soundType=lambda frame: frame['soundType'].str.split(', '))
    .explode('soundType')
)

sns.countplot(x='soundType', hue='sentiment', data=temp)
plt.xlabel('Sound Type')
plt.ylabel('Count')
plt.title('Count Plot of Sentiment by Sound Type')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()

Dropping the rows which do not contain any `reviewText`

In [None]:
# Keep only the rows that actually have review text.
train = train.dropna(subset=['reviewText'])

Separating features and target

In [None]:
# Target: the sentiment label. Features: every remaining column.
y = train['sentiment']
X = train.drop(columns='sentiment')

Encoding the target column

In [None]:
# Keep the target as a 1-D pandas Series of the original class labels instead of
# one-hot encoding it. scikit-learn classifiers encode string labels internally,
# whereas the one-hot version is a sparse two-column matrix that
#   (a) single-output estimators such as LogisticRegression and LinearSVC reject, and
#   (b) has no `.head()`, which the grid-search cells call on `y_train`.
# Reading the column from `train` (rather than re-encoding `y`) also keeps this
# cell safe to re-run.
y = train['sentiment']

Splitting the data into training and validation sets

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

Preprocessing the test data

In [None]:
# Align the test schema with train: expose `isTopCritic` under the name
# `isFrequentReviewer` (appended as the last column, the original column is
# removed) and replace missing review text with an empty string.
test = (
    test
    .assign(isFrequentReviewer=test['isTopCritic'])
    .drop(columns='isTopCritic')
    .assign(reviewText=lambda frame: frame['reviewText'].fillna(''))
)

# One metadata row per movie, so the left join cannot add rows to the test set.
temp_movies = movies.drop_duplicates(subset=['movieid'])
X_test = test.merge(temp_movies, on='movieid', how='left')

Define the feature columns

In [None]:
# Free-text column. Kept as a bare string (not a list) on purpose: it is handed
# to the ColumnTransformer as the column selector for the text vectorizer.
text_cols = 'reviewText'

# Numeric columns.
num_cols = [
    'audienceScore',
    'runtimeMinutes',
    'boxOffice',
]

# Columns treated as categorical (identifiers, flags, dates and multi-value strings).
cat_cols = [
    'movieid',
    'reviewerName',
    'rating',
    'isFrequentReviewer',
    'genre',
    'originalLanguage',
    'director',
    'distributor',
    'ratingContents',
    'releaseDateTheaters',
    'releaseDateStreaming',
    'soundType',
]

Define the preprocessing pipelines for numerical and categorical data

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Review text: bag-of-words token counts.
text_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
])

Combine the preprocessing pipelines using ColumnTransformer

In [None]:
# Route each column group through its own pipeline; columns that are not listed
# in any group are not passed to a transformer.
column_routes = [
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
    ('text', text_pipeline, text_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Dummy Classifier

Preprocessing the training data

In [None]:
# Fit the preprocessor on the training split and transform it in the same call.
# NOTE(review): this rebinds `X_train` from the raw DataFrame to the transformed
# feature matrix, so the cell is not safe to re-run and `X_train` can no longer
# be fed to a pipeline that contains `preprocessor` until the data is split again.
X_train = preprocessor.fit_transform(X_train)

Fitting a `DummyClassifier`

In [None]:
# Baseline model with default settings, fitted on the transformed training split.
dummy_classifier = DummyClassifier().fit(X_train, y_train)

# NOTE(review): `X_test` is passed here in its raw, untransformed form — confirm
# that is intended for this baseline.
y_pred = dummy_classifier.predict(X_test)

# LogisticRegression

Define the classification pipeline.

In [None]:
# End-to-end model: shared preprocessing followed by a default logistic regression.
log_clf_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
log_clf = Pipeline(steps=log_clf_steps)

Splitting the data into training and validation sets again for `LogisticRegression`

In [None]:
# Same arguments and seed as the first split, so this recreates the identical raw
# partition; it is needed because `X_train` was overwritten with its transformed
# matrix for the dummy classifier.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

Fitting the `LogisticRegression` Model

In [None]:
log_clf.fit(X_train, y_train)

Predict the sentiment on the validation data

In [None]:
y_pred = log_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

Retraining the `log_clf` on full dataset

In [None]:
log_clf.fit(X, y)

Generating the test output

In [None]:
y_pred = log_clf.predict(X_test)

**Hyperparameter Tuning**

Defining the parameter grid

In [None]:
# Hyperparameter grid for the logistic-regression pipeline, written as a list of
# sub-grids so that every candidate is a solver/penalty pair the estimator
# accepts. The previous flat grid crossed every penalty with every solver; the
# unsupported pairs (e.g. 'l1' with 'lbfgs', or 'elasticnet' without an
# `l1_ratio`) can only produce failed fits, which GridSearchCV scores as NaN, so
# dropping them shortens the search without changing which candidate wins.
# NOTE(review): newer scikit-learn releases spell the unpenalised option
# `penalty=None` instead of 'none' — confirm against the installed version.
shared_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__fit_intercept': [True, False],
    'classifier__max_iter': [100, 500, 1000],
}
param_grid = [
    # These solvers support an L2 penalty or no penalty only.
    {
        **shared_grid,
        'classifier__solver': ['newton-cg', 'lbfgs', 'sag'],
        'classifier__penalty': ['l2', 'none'],
    },
    # liblinear supports L1 and L2 but always requires a penalty.
    {
        **shared_grid,
        'classifier__solver': ['liblinear'],
        'classifier__penalty': ['l1', 'l2'],
    },
    # saga supports every penalty; 'elasticnet' is left out because it would
    # additionally need `l1_ratio`, which this grid does not search.
    {
        **shared_grid,
        'classifier__solver': ['saga'],
        'classifier__penalty': ['l1', 'l2', 'none'],
    },
]

Defining the `grid_search`

In [None]:
grid_search = GridSearchCV(log_clf, param_grid, cv=5)

Fitting the first 1000 rows to speed up the searching process

In [None]:
grid_search.fit(X_train.head(1000), y_train.head(1000))

Getting the values of best parameters

In [None]:
grid_search.best_params_

Redefining the `log_clf` on the new found hyperparameters

In [None]:
# Logistic regression with the hyperparameters chosen after the grid search.
tuned_log_reg = LogisticRegression(
    C=1,
    fit_intercept=True,
    max_iter=100,
    penalty='l2',
    solver='newton-cg',
)
log_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tuned_log_reg),
])

Fitting the `log_clf`

In [None]:
log_clf.fit(X_train, y_train)

Getting predictions on the validation set

In [None]:
y_pred = log_clf.predict(X_valid)

Getting the accuracy score on the validation set

In [None]:
accuracy_score(y_valid, y_pred)

Retraining on the whole dataset

In [None]:
log_clf.fit(X, y)

Prediction on the test set for output

In [None]:
y_pred = log_clf.predict(X_test)

# KNeighborsClassifier

Creating a pipeline for `KNN` classifier

In [None]:
# KNN model: preprocessing, reduction to 500 SVD components, then a default
# k-nearest-neighbours classifier using all available cores.
knn_steps = [
    ('preprocessor', preprocessor),
    ('truncatedsvd', TruncatedSVD(n_components=500)),
    ('knn_classifier', KNeighborsClassifier(n_jobs=-1)),
]
knn_clf = Pipeline(steps=knn_steps)

Fitting the `KNeighborsClassifier`

In [None]:
knn_clf.fit(X_train, y_train)

Predicting the sentiments on validation set using KNN Classifier

In [None]:
y_pred = knn_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

**Hyperparameter Tuning**

Defining the parameter grid

In [None]:
# Search space for the `knn_classifier` step of the pipeline; the keyword names
# are the `<step>__<parameter>` keys GridSearchCV expects.
param_grid = dict(
    knn_classifier__n_neighbors=list(range(3, 12, 2)),   # 3, 5, 7, 9, 11
    knn_classifier__weights=['uniform', 'distance'],
    knn_classifier__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    knn_classifier__leaf_size=list(range(10, 51, 10)),   # 10, 20, 30, 40, 50
    knn_classifier__p=[1, 2],
)

Defining `grid_search`

In [None]:
grid_search = GridSearchCV(knn_clf, param_grid, cv=5)

Fitting only 100 samples for `grid_search` to reduce searching time

In [None]:
grid_search.fit(X_train.head(100), y_train.head(100))

Getting the values for best parameters

In [None]:
grid_search.best_params_

Redefining the `knn_clf` pipeline with tuned parameters

In [None]:
# KNN classifier with the hyperparameters chosen after the grid search.
tuned_knn = KNeighborsClassifier(
    n_jobs=-1,
    algorithm='auto',
    leaf_size=10,
    n_neighbors=3,
    p=2,
    weights='uniform',
)
knn_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('truncatedsvd', TruncatedSVD(n_components=500)),
    ('knn_classifier', tuned_knn),
])

Fitting on the training data

In [None]:
knn_clf.fit(X_train, y_train)

Checking accuracy on validation set

In [None]:
y_pred = knn_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

# LinearSVC

Creating a pipeline for `LinearSVC`

In [None]:
# Linear SVM model: shared preprocessing followed by a default LinearSVC.
svc_steps = [
    ('preprocessor', preprocessor),
    ('linear_svc', LinearSVC()),
]
svc_clf = Pipeline(steps=svc_steps)

Fitting `LinearSVC` model

In [None]:
svc_clf.fit(X_train, y_train)

Checking the accuracy on the validation set

In [None]:
y_pred = svc_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

Fitting the model on full dataset

In [None]:
svc_clf.fit(X, y)

Predicting on the test data

In [None]:
y_pred = svc_clf.predict(X_test)

**Hyperparameter Tuning**

Creating a parameter grid for hyperparameter tuning

In [None]:
# Search space for the `linear_svc` step, listed as (parameter, candidates) pairs
# and expanded into the `<step>__<parameter>` keys GridSearchCV expects.
# NOTE(review): the full cross product includes penalty/loss/dual combinations
# that LinearSVC may reject (for example an L1 penalty with the hinge loss);
# verify against the scikit-learn documentation whether those candidates only
# waste search time.
svc_search_space = [
    ('penalty', ['l1', 'l2']),
    ('loss', ['hinge', 'squared_hinge']),
    ('dual', [True, False]),
    ('tol', [1e-4, 1e-3, 1e-2]),
    ('C', [0.1, 1, 10, 100]),
    ('multi_class', ['ovr', 'crammer_singer']),
    ('fit_intercept', [True, False]),
    ('intercept_scaling', [0.5, 1, 2]),
    ('class_weight', [None, 'balanced']),
]
param_grid = {
    'linear_svc__' + parameter: candidates
    for parameter, candidates in svc_search_space
}

Creating a `grid_search` instance

In [None]:
grid_search = GridSearchCV(svc_clf, param_grid, cv=5)

Fitting the first 1000 datapoints to decrease searching time

In [None]:
grid_search.fit(X_train.head(1000), y_train.head(1000))

The values of best parameters

In [None]:
grid_search.best_params_

Redefining the pipeline with the above hyperparameters

In [None]:
# LinearSVC with the hyperparameters chosen after the grid search.
tuned_svc = LinearSVC(
    C=0.1,
    class_weight=None,
    dual=True,
    fit_intercept=True,
    intercept_scaling=1,
    loss='squared_hinge',
    multi_class='ovr',
    penalty='l2',
    tol=0.01,
)
svc_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear_svc', tuned_svc),
])

Fitting on the training dataset

In [None]:
svc_clf.fit(X_train, y_train)

Checking the accuracy on the validation set

In [None]:
y_pred = svc_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

# DecisionTreeClassifier

Creating a pipeline for `DecisionTreeClassifier`

In [None]:
# Decision-tree model: shared preprocessing followed by a default tree.
tree_steps = [
    ('preprocessor', preprocessor),
    ('decision_tree', DecisionTreeClassifier()),
]
tree_clf = Pipeline(steps=tree_steps)

Training the model

In [None]:
tree_clf.fit(X_train, y_train)

Getting the predictions and checking accuracy

In [None]:
y_pred = tree_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

**Hyperparameter Tuning**

Redefining the `DecisionTreeClassifier` pipeline with dimensionality reduction to decrease searching time

In [None]:
# Same decision-tree model, but with the features reduced to 500 SVD components
# between preprocessing and the tree.
tree_svd_steps = [
    ('preprocessor', preprocessor),
    ('truncatedsvd', TruncatedSVD(n_components=500)),
    ('decision_tree', DecisionTreeClassifier()),
]
tree_clf = Pipeline(steps=tree_svd_steps)

Defining a parameter grid

In [None]:
# Search space for the `decision_tree` step of the pipeline.
param_grid = {
    'decision_tree__criterion': ['gini', 'entropy'],
    'decision_tree__splitter': ['best', 'random'],
    'decision_tree__max_depth': [None, 1, 2, 3, 4, 5],
    'decision_tree__min_samples_split': [2, 3, 4],
    'decision_tree__min_samples_leaf': [1, 2, 3],
    'decision_tree__min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    # 'auto' is intentionally not listed: for DecisionTreeClassifier it was only
    # an alias of 'sqrt' (so it duplicated a quarter of the search) and recent
    # scikit-learn releases no longer accept it.
    'decision_tree__max_features': [None, 'sqrt', 'log2'],
}

Defining `GridSearchCV`

In [None]:
grid_search = GridSearchCV(tree_clf, param_grid, cv=3)

Fitting the first 100 datapoints for increasing the speed of searching

In [None]:
grid_search.fit(X_train.head(100), y_train.head(100))

Getting the best parameter values

In [None]:
grid_search.best_params_

Redefining the pipeline with the new hyperparameter values

In [None]:
# Decision tree with the hyperparameters chosen after the grid search.
# NOTE(review): the grid search ran on a pipeline that included a TruncatedSVD
# step, while this pipeline feeds the tree the unreduced features — confirm the
# tuned values are meant to carry over.
tuned_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    max_features=None,
    min_samples_leaf=3,
    min_weight_fraction_leaf=0.1,
    splitter='random',
)
tree_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('decision_tree', tuned_tree),
])

Fitting on the training dataset

In [None]:
tree_clf.fit(X_train, y_train)

Getting the accuracy score

In [None]:
y_pred = tree_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

# Bagging

Fitting the preprocessor with the training data

In [None]:
preprocessor.fit(X_train)

Transforming `X_train` using preprocessor pipeline

In [None]:
# Replace the raw training frame with its transformed feature matrix.
# NOTE(review): `X_train` is rebound in place, so this cell cannot be re-run and
# later pipelines that start with `preprocessor` need a fresh split first.
X_train = preprocessor.transform(X_train)

**LinearSVC**

Creating a `BaggingClassifier` using the `LinearSVC`

In [None]:
# Base learner shared by the bagging ensemble below (and reused later for boosting).
linear_svc = LinearSVC()

# Ten LinearSVC models, each trained on a random 80% sample of the rows,
# fitted in parallel on all available cores.
bagging = BaggingClassifier(
    linear_svc,
    n_estimators=10,
    max_samples=0.8,
    n_jobs=-1,
)

Fitting the `BaggingClassifier`

In [None]:
bagging.fit(X_train, y_train)

Transforming the validation set with the fitted preprocessor

In [None]:
# Apply the already-fitted preprocessor to the validation split.
# NOTE(review): `X_valid` is rebound to the transformed matrix, so the raw
# validation frame is gone until the data is split again.
X_valid = preprocessor.transform(X_valid)

Checking the score on the validation set

In [None]:
y_pred = bagging.predict(X_valid)
accuracy_score(y_valid, y_pred)

Transforming the full dataset using the `preprocessor` pipeline

In [None]:
X_full = preprocessor.transform(X)

Fitting the full dataset on the bagging classifier

In [None]:
bagging.fit(X_full, y)

Transforming the test dataset

In [None]:
X_test_pro = preprocessor.transform(X_test)

Prediction on the test dataset

In [None]:
y_pred = bagging.predict(X_test_pro)

**LogisticRegression**

Defining `BaggingClassifier` for `LogisticRegression`

In [None]:
# Base learner for this ensemble (also reused later for boosting).
log_reg = LogisticRegression()

# One hundred logistic regressions, each trained on a random 80% sample of the
# rows, fitted in parallel on all available cores.
bagging_options = {'n_estimators': 100, 'max_samples': 0.8, 'n_jobs': -1}
bagging = BaggingClassifier(log_reg, **bagging_options)

Fitting on training data

In [None]:
bagging.fit(X_train, y_train)

Getting predictions on the validation set

In [None]:
y_pred = bagging.predict(X_valid)
accuracy_score(y_valid, y_pred)

Fitting on the full dataset

In [None]:
bagging.fit(X_full, y)

Prediction on the test dataset

In [None]:
y_pred = bagging.predict(X_test_pro)

# Boosting

**LinearSVC**

Creating a `AdaBoostClassifier` instance for `LinearSVC`

In [None]:
boost_clf = AdaBoostClassifier(linear_svc, n_estimators=10, algorithm='SAMME')

Fitting on the training dataset

In [None]:
boost_clf.fit(X_train, y_train)

Prediction and scoring on validation set

In [None]:
y_pred = boost_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

**LogisticRegression**

Defining `AdaBoostClassifier` for `LogisticRegression`

In [None]:
boost_clf = AdaBoostClassifier(log_reg, n_estimators=10, algorithm='SAMME')

Fitting on the training dataset

In [None]:
boost_clf.fit(X_train, y_train)

Getting predictions on the validation data

In [None]:
y_pred = boost_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

# MLPClassifier

Redefining the training and validation sets for `MLPClassifier`

In [None]:
# Same arguments and seed as the earlier splits, so this restores the identical
# raw partition; it is needed because `X_train` and `X_valid` were overwritten
# with their transformed matrices in the bagging section.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

Defining the pipeline for `MLPClassifier`

In [None]:
# Neural-network model: preprocessing, reduction to 500 SVD components, then a
# default multi-layer perceptron.
mlp_steps = [
    ('preprocessor', preprocessor),
    ('truncated_svd', TruncatedSVD(n_components=500)),
    ('mlp_classification', MLPClassifier()),
]
mlp_clf = Pipeline(steps=mlp_steps)

Timing and fitting the model

In [None]:
%time mlp_clf.fit(X_train, y_train)

Predictions on the validation set

In [None]:
y_pred = mlp_clf.predict(X_valid)
f1_score(y_valid, y_pred, average='micro')

Predictions on the test set

In [None]:
y_pred = mlp_clf.predict(X_test)

Generating the `submission.csv` file

In [None]:
# Build the submission: one row per test review (`id` is the row position) with
# its predicted sentiment label.
#
# `y_pred` is first normalised to a flat array of class names. The previous code
# decoded it through `one_hot_encoder`, a name that is never defined in this
# notebook. Instead:
#   * a two-column indicator matrix (what a model trained on a one-hot target
#     returns) is mapped back through the sorted class labels, which is the
#     column order an encoder fitted on the target uses;
#   * 1-D label predictions are used as they are.
predictions = y_pred.toarray() if hasattr(y_pred, 'toarray') else np.asarray(y_pred)
if predictions.ndim == 2 and predictions.shape[1] > 1:
    class_labels = np.sort(train['sentiment'].unique())
    predictions = class_labels[predictions.argmax(axis=1)]
predictions = predictions.ravel()

filename = 'submission.csv'
submission = pd.DataFrame({
    'id': np.arange(len(predictions)),
    'sentiment': predictions,  # plain label strings, not one-element arrays
})
# `to_csv` handles quoting and line endings itself; the index is not written
# because `id` already carries the row number.
submission.to_csv(filename, index=False)