Drive mounting


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; running this triggers an
# authorization prompt.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)




Mounted at /content/drive


In [None]:
# Unzipping the file
!gunzip "{file_path}"


gzip: {file_path}.gz: No such file or directory


Loading the data set


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Location of the decompressed review file on the mounted Drive.
file_path = '/content/drive/My Drive/Toys_and_Games_5.json'

# lines=True makes pandas parse the file as one JSON record per line.
df = pd.read_json(path_or_buf=file_path, lines=True)

Handling missing values in the summary, reviewer ID, and ASIN columns

In [None]:
# Placeholder written into each column wherever the value is missing.
missing_value_defaults = {
    'summary': 'no summary',
    'reviewerID': 'unknown',
    'asin': 'unknown',
}
for column_name, placeholder in missing_value_defaults.items():
    df[column_name] = df[column_name].fillna(placeholder)

Cleaning the text: converting to lower case, replacing non-word characters with spaces, and removing stop words

In [None]:
# Fetch the tokenizer models and the stop-word list used by clean_text.
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource, while
# older releases load 'punkt'; download both so the notebook runs on either.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one review summary before vectorisation.

    The text is lower-cased, every non-word character is replaced by a space,
    the result is tokenised with ``word_tokenize`` and tokens present in the
    module-level ``stop_words`` set are dropped.

    Args:
        text: Raw summary. ``None`` and ``NaN`` yield an empty string; any
            other non-string value is converted with ``str`` instead of
            raising ``AttributeError`` on ``.lower()``.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Missing values: None, or float NaN (the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # \W matches anything that is not a letter, digit or underscore.
    text = re.sub(r'\W', ' ', text)
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

# Run every summary through clean_text and keep the result in a new column.
raw_summaries = df['summary']
df['clean_summary'] = raw_summaries.apply(clean_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


 TF-IDF Vectorization

In [None]:
# Keep only the 100 highest-frequency terms as TF-IDF features.
tfidf = TfidfVectorizer(max_features=100, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['clean_summary'])

# Reuse df's index so a later column-wise pd.concat (which aligns on index
# labels) pairs every TF-IDF row with the review it came from, even if df no
# longer has a default 0..n-1 index.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=df.index,
)

 Encode categorical variables

In [None]:
# Use one encoder per column. Refitting a single LabelEncoder on `asin` would
# overwrite the classes learned for `reviewerID`, so the reviewer codes could
# no longer be mapped back with inverse_transform.
reviewer_encoder = LabelEncoder()
asin_encoder = LabelEncoder()

df['reviewerID_encoded'] = reviewer_encoder.fit_transform(df['reviewerID'])
df['asin_encoded'] = asin_encoder.fit_transform(df['asin'])

# Backward compatibility: `le` previously ended up fitted on `asin`.
le = asin_encoder



Combine all features into one DataFrame

In [None]:
# Place the rating and the verified flag side by side with the TF-IDF columns.
frames_to_join = [df[['overall', 'verified']], tfidf_df]
df_final = pd.concat(frames_to_join, axis=1)

# Store the verified flag as an integer column.
df_final['verified'] = df_final['verified'].astype(int)

# Target: the rating as float. Features: every remaining column.
y = df_final['overall'].astype(float)
x = df_final.drop(columns='overall')

Random Forest Regressor

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

# Random forest restricted to a single tree.
rf_model = RandomForestRegressor(
    n_estimators=1,
    random_state=42,
    verbose=1,
)

# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One score per fold; with this scorer the values are negated MSE.
rf_mse = cross_val_score(
    rf_model,
    x,
    y,
    cv=kf,
    scoring='neg_mean_squared_error',
    verbose=1,
)


In [None]:
# rf_mse holds negated MSE values (scoring='neg_mean_squared_error'), so flip
# the sign before taking the square root.
rf_rmse = np.sqrt(-rf_mse)
print("Random Forest RMSE scores for each fold:", rf_rmse)
print("Minimum RF RMSE:", min(np.abs(rf_rmse)))
# This line was labelled "RMSE" although it printed the negated MSE array;
# print the positive per-fold MSE under the correct label.
print("Random Forest MSE scores for each fold:", -rf_mse)
print("Minimum RF MSE:", min(np.abs(rf_mse)))
print("Average RMSE:", np.mean(rf_rmse))

Random Forest RMSE scores for each fold: [0.87703552 0.87668735 0.87794506 0.87747315 0.87903336]
Minimum RF RMSE: 0.8766873481850777
Random Forest RMSE scores for each fold: [-0.7691913  -0.76858071 -0.77078754 -0.76995914 -0.77269965]
Minimum RF MSE: 0.7685807064677835
Average RMSE: 0.8776348895973216


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error

# Same experiment with five trees in the forest.
rf_model = RandomForestRegressor(
    n_estimators=5,
    random_state=42,
    verbose=1,
)

# Seeded, shuffled 5-fold split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold negated MSE from cross-validation.
rf_mse = cross_val_score(
    rf_model,
    x,
    y,
    cv=kf,
    scoring='neg_mean_squared_error',
    verbose=1,
)


In [None]:
# The scorer returns negated MSE, hence the sign flip before the square root.
rf_rmse = np.sqrt(-rf_mse)
print("Random Forest RMSE scores for each fold:", rf_rmse)
print("Minimum RF RMSE:", min(np.abs(rf_rmse)))
# This line was labelled "RMSE" although it printed the negated MSE array;
# print the positive per-fold MSE under the correct label.
print("Random Forest MSE scores for each fold:", -rf_mse)
print("Minimum RF MSE:", min(np.abs(rf_mse)))
print("Average RMSE:", np.mean(rf_rmse))

Random Forest RMSE scores for each fold: [0.86545623 0.86484037 0.86704684 0.86556861 0.86704871]
Minimum RF RMSE: 0.864840373282365
Random Forest RMSE scores for each fold: [-0.74901449 -0.74794887 -0.75177022 -0.74920902 -0.75177346]
Minimum RF MSE: 0.7479488712591804
Average RMSE: 0.8659921521627035


Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline, scored on the same folds (kf) as the other models.
lr_model = LinearRegression()

# cross_val_score yields negated MSE per fold for this scorer.
lr_mse = cross_val_score(
    lr_model,
    x,
    y,
    cv=kf,
    scoring='neg_mean_squared_error',
    verbose=1,
)

# Negate, then take the square root, to get per-fold RMSE.
lr_rmse = np.sqrt(-lr_mse)

In [None]:
# Report the linear-regression cross-validation results.
# A copy-pasted `rf_rmse = np.sqrt(-rf_mse)` used to open this cell; it had no
# bearing on these numbers and only made the cell depend on the random-forest
# cells, so it has been removed.
print("Linear regression RMSE scores for each fold:", lr_rmse)
print("Minimum LR RMSE:", min(np.abs(lr_rmse)))
# This line was labelled "RMSE" although it printed the negated MSE array;
# print the positive per-fold MSE under the correct label.
print("Linear regression MSE scores for each fold:", -lr_mse)
print("Minimum LR MSE:", min(np.abs(lr_mse)))
print("Average LR RMSE:", np.mean(lr_rmse))

Linear regression RMSE scores for each fold: [0.87611804 0.87490891 0.87738844 0.87558865 0.87758545]
Minimum LR RMSE: 0.874908907988132
Linear regression RMSE scores for each fold: [-0.76758283 -0.7654656  -0.76981048 -0.76665548 -0.77015623]
Minimum LR MSE: 0.7654655972769856
Average LR RMSE: 0.8763178994153137


XGBoost Model

In [None]:
# XGBoost Model
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
# Starting estimator; the grid below overrides n_estimators, max_depth and
# learning_rate and adds subsample.
xg_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=5,
    random_state=42,
)

# 3 * 2 * 3 * 3 = 54 hyper-parameter combinations.
param_grid_xg = dict(
    n_estimators=[5, 10, 15],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.5, 0.7, 1],
)

# Exhaustive search over the grid, scored by negated MSE on the shared folds.
grid_search_xg = GridSearchCV(
    estimator=xg_model,
    param_grid=param_grid_xg,
    cv=kf,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid_search_xg.fit(x, y)
best_xg = grid_search_xg.best_estimator_

# Cross-validate the selected configuration and convert the scores to RMSE.
xg_neg_mse = cross_val_score(
    best_xg,
    x,
    y,
    cv=kf,
    scoring='neg_mean_squared_error',
    verbose=1,
)
xg_rmse = np.sqrt(-xg_neg_mse)
print("XGBoost RMSE scores for each fold:", xg_rmse)
print("Average XGBoost RMSE:", np.mean(xg_rmse))

Fitting 5 folds for each of 54 candidates, totalling 270 fits
XGBoost RMSE scores for each fold: [0.91101238 0.90958526 0.91188449 0.90856253 0.91111231]
Average XGBoost RMSE: 0.9104313941566133


Tuning the Random Forest with GridSearchCV to reduce RMSE

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest: 3 * 1 * 2 * 2 = 12 candidates.
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize the RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

# Use the shared shuffled 5-fold splitter `kf` rather than cv=4. An integer cv
# gives unshuffled folds with a different fold count, so the "improved" RMSE
# would not be measured on the same splits as the other models and could not
# be compared with them directly.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=kf,
                           scoring='neg_mean_squared_error', n_jobs=3, verbose=1)
grid_search.fit(x, y)
best_rf = grid_search.best_estimator_

# Re-score the selected configuration; scores are negated MSE, so flip the
# sign before taking the square root.
best_rf_mse = cross_val_score(best_rf, x, y, cv=kf, scoring='neg_mean_squared_error')
best_rf_rmse = np.sqrt(-best_rf_mse)
print("Improved Random Forest RMSE scores for each fold:", best_rf_rmse)
print("Improved Average RF RMSE:", np.mean(best_rf_rmse))

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Improved Random Forest RMSE scores for each fold: [0.88934329 0.86134032 0.85136732 0.85988623]
Improved Average RF RMSE: 0.8654842888958264
