In [1]:
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

trainingSet = pd.read_csv("./data/train.csv")
testingSet = pd.read_csv("./data/test.csv")

print("train.csv shape is ", trainingSet.shape)
print("test.csv shape is ", testingSet.shape)

print()

print(trainingSet.head())
print()
print(testingSet.head())

print()

print(trainingSet.describe())

trainingSet['Score'].value_counts().plot(kind='bar', legend=True, alpha=.5)
plt.title("Count of Scores")
plt.show()

trainingSet['ProductId'].value_counts().nlargest(25).plot(kind='bar', legend=True, alpha=.5)
plt.title("Top 25 most rated Products")
plt.show()

trainingSet['ProductId'].value_counts().nsmallest(25).plot(kind='bar', legend=True, alpha=.5)
plt.title("Top 25 least rated Products")
plt.show()

trainingSet['UserId'].value_counts().nlargest(25).plot(kind='bar', legend=True, alpha=.5)
plt.title("Top 25 Reviewers")
plt.show()

trainingSet['UserId'].value_counts().nsmallest(25).plot(kind='bar', legend=True, alpha=.5)
plt.title("Lowest 25 Reviewers")
plt.show()

trainingSet[['Score', 'HelpfulnessNumerator']].groupby('Score').mean().plot(kind='bar', legend=True, alpha=.5)
plt.title("Mean Helpfulness Numerator per Score")
plt.show()

trainingSet[['Score', 'ProductId']].groupby('ProductId').mean().nlargest(25, 'Score').plot(kind='bar', legend=True, alpha=.5)
plt.title("Top 25 best rated Products")
plt.show()

trainingSet[['Score', 'ProductId']].groupby('ProductId').mean().nsmallest(25, 'Score').plot(kind='bar', legend=True, alpha=.5)
plt.title("Top 25 worst rated Products")
plt.show()

trainingSet[['Score', 'UserId']].groupby('UserId').mean().nlargest(25, 'Score').plot(kind='bar', legend=True, alpha=.5)
plt.title("Top 25 kindest Reviewers")
plt.show()

trainingSet[['Score', 'UserId']].groupby('UserId').mean().nsmallest(25, 'Score').plot(kind='bar', legend=True, alpha=.5)
plt.title("Top 25 harshest Reviewers")
plt.show()

trainingSet[trainingSet['ProductId'].isin(trainingSet['ProductId'].value_counts().nlargest(25).index.tolist())][['Score', 'ProductId']].groupby('ProductId').mean().plot(kind='bar', legend=True, alpha=.5)
plt.title("Mean of top 25 most rated Products")
plt.show()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


FileNotFoundError: [Errno 2] No such file or directory: './data/train.csv'

In [None]:
trainX = pd.read_csv('data/X_train.csv')
testX = pd.read_csv('data/X_test.csv')

In [None]:
# Replace NaN values with an empty string in 'Text' and 'Summary' columns
trainX['Text'].fillna('', inplace=True)
testX['Text'].fillna('', inplace=True)
trainX['Summary'].fillna('', inplace=True)
testX['Summary'].fillna('', inplace=True)

In [None]:
import re
def preprocess_2(text):
    text = str(text)
    text = re.sub(r'[0-9]', "", text)
    return text

In [None]:
trainX['Summary'] = trainX['Summary'].apply(preprocess_2)
trainX['Text'] = trainX['Text'].apply(preprocess_2)

In [None]:
X = trainX.drop(columns=['Score'])
y = trainX['Score']-1

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

In [9]:
text_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3), stop_words='english')
summary_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3), stop_words='english')

In [10]:
x_train_text = text_vectorizer.fit_transform(x_train['Text'])
x_test_text = text_vectorizer.transform(x_test['Text'])

In [11]:
x_train_summary = summary_vectorizer.fit_transform(x_train['Summary'])
x_test_summary = summary_vectorizer.transform(x_test['Summary'])

In [12]:
x_train_stacked = sparse.hstack([x_train_text, x_train_summary])
x_test_stacked = sparse.hstack([x_test_text, x_test_summary])

In [None]:
from xgboost import XGBRegressor
xgb_grid = XGBRegressor()

# Define the hyperparameters and their possible values for Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    # Add more hyperparameters and their values as needed
}

# Create the Grid Search object
grid_search_xgb = GridSearchCV(estimator=xgb_grid, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# Fit the Grid Search to the data
grid_search_xgb.fit(x_train_stacked, y_train)

In [None]:
best_params = grid_search_xgb.best_params_
best_xgb_model = grid_search_xgb.best_estimator_

In [None]:
#grid search xgb model
y_test_preds = best_xgb_model.predict(x_test_svd)
print("RMSE on testing set = ", mean_squared_error(y_test, y_test_preds)**0.5)

In [47]:

xgb = XGBRegressor()
xgb.fit(x_train_svd, y_train)

In [48]:
#xgb regressor
y_test_preds = xgb.predict(x_test_svd)
print("RMSE on testing set = ", mean_squared_error(y_test, y_test_preds)**0.5)

RMSE on testing set =  0.9338581347338172


In [52]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(x_train_stacked, y_train)

In [53]:
#lasso regressiob
y_test_preds = lasso.predict(x_test_stacked)
print("RMSE on testing set = ", mean_squared_error(y_test, y_test_preds)**0.5)

RMSE on testing set =  1.1916252680217625


In [44]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train_svd, y_train)

In [45]:
#linear regression
y_test_preds = lr.predict(x_test_svd)
print("RMSE on testing set = ", mean_squared_error(y_test, y_test_preds)**0.5)

RMSE on testing set =  0.9540656315158077


In [49]:
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(x_train_stacked, y_train)

In [51]:
#ridge regression
y_test_preds = ridge.predict(x_test_stacked)
print("RMSE on testing set = ", mean_squared_error(y_test, y_test_preds)**0.5)

RMSE on testing set =  0.853371721000955


In [85]:
#grid search using ridge

param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}
#creating the grid search
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
#fitting the grid search to the train data 
grid_search.fit(x_train_stacked, y_train)

45 fits failed out of a total of 105.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py", line 1131, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py", line 823, in fit
    raise Val

In [86]:
best_params = grid_search.best_params_
best_ridge_model = grid_search.best_estimator_

In [87]:
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'alpha': 10.0, 'solver': 'auto'}


In [88]:
y_pred_grid_search = best_ridge_model.predict(x_test_stacked) 
print("RMSE on testing set = ", mean_squared_error(y_test, y_pred_grid_search)**0.5)

RMSE on testing set =  0.843763386638613


In [77]:
import pickle
with open('ridge_movie_rating_v2.pkl', 'wb') as f:
    pickle.dump(best_ridge_model, f)

In [None]:
# import pickle
# with open('xgb_new.pkl', 'wb') as f:
#     pickle.dump(xgb, f)

In [31]:
X_submission = pd.read_csv("./data/X_test.csv")
X_submission['Summary'] = X_submission['Summary'].apply(preprocess_2)
X_submission['Text'] = X_submission['Text'].apply(preprocess_2)
X_submission['Text'].fillna('', inplace=True)
X_submission['Summary'].fillna('', inplace=True)

In [32]:
X_submission.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Helpfulness,ReviewLength,Score
0,786781,B0000VD02Y,A1UL8PS42M5DM8,1,7,1082332800,good scenery,ok story may little slow unrealistic actually ...,0.142857,54,
1,17153,0767823931,A2OP1HD9RGX5OW,3,6,1055376000,reminded childhood,wonderful film julia taking kid england pullin...,0.5,266,
2,1557328,B008JFUNTG,AY113687D8YK1,1,8,1377388800,hodgepodge concept taken four greek myth,movie interesting first one special effect fin...,0.125,134,
3,1242666,B001UWOLQG,A2MVTAEGBP08RB,0,1,1374710400,good suspense,series suspense well written actress unique qu...,0.0,41,
4,1359242,B003QS0E54,ALGAE0IGE4DBP,99,103,1276646400,finally intelligent idea,first season hunter released year ago like ma...,0.961165,279,


In [78]:
#creating the submission vectors using the text and summary tfidf vectorizers.
x_submission_text = text_vectorizer.transform(X_submission['Text'])
x_submission_summary = summary_vectorizer.transform(X_submission['Summary'])

In [79]:
#stacking the two vectors together
x_submission_stacked = sparse.hstack([x_submission_text, x_submission_summary])

In [80]:
#submission
submission = X_submission[['Id']].copy()
submission['Score'] = best_ridge_model.predict(x_submission_stacked) + 1  # Shift the labels back to the original range
submission.to_csv('./data/submission_v2.csv', index=False)