# CSC 2515 Kaggle Competition

## Import Libraries

In [1]:
import json
import pandas as pd
import numpy as np
import string
import re
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Load data

### Training data

In [2]:
train = []
for line in open('train.json', 'r'):
    train.append(json.loads(line))
df = pd.DataFrame(train)
df = df.drop(['image'], axis=1)
df = df.fillna('') # fill NaN with ''
df = df.reset_index(drop=True)
df

Unnamed: 0,overall,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,price,itemID,reviewHash
0,4.0,"08 24, 2010",u04428712,"So is Katy Perry's new album ""Teenage Dream"" c...",Amazing that I Actually Bought This...More Ama...,1282608000,Pop,$35.93,p70761125,85559980
1,5.0,"10 31, 2009",u06946603,"I got this CD almost 10 years ago, and given t...",Excellent album,1256947200,Alternative Rock,$11.28,p85427891,41699565
2,4.0,"10 13, 2015",u92735614,I REALLY enjoy this pairing of Anderson and Po...,"Love the Music, Hate the Light Show",1444694400,Pop,$89.86,p82172532,24751194
3,5.0,"06 28, 2017",u35112935,Finally got it . It was everything thought it ...,Great,1498608000,Pop,$11.89,p15255251,22820631
4,4.0,"10 12, 2015",u07141505,"Look at all star cast. Outstanding record, pl...",Love these guys.,1444608000,Jazz,$15.24,p82618188,53377470
...,...,...,...,...,...,...,...,...,...,...
199995,4.0,"05 1, 2004",u68902609,"With this, Mariah's third album, Mariah proved...",Well Done Mariah! You Show 'Em!,1083369600,Pop,$7.98,p84118731,35077372
199996,5.0,"02 27, 2017",u15269603,Fantastic CD. All the hits are here and even ...,"Great collection, excellent sound!",1488153600,Pop,$11.49,p08613950,09788722
199997,3.0,"03 1, 2011",u25124021,"This recording is rather disappointing, to a c...",Odd Couplings,1298937600,Classical,$13.57,p25341819,71627957
199998,5.0,"03 20, 2016",u04485604,Get it now ! Right now ! I am partial. I am a ...,Our Poet,1458432000,Alternative Rock,$11.07,p19134748,27463540


### Test data

In [3]:
test = []
for line in open('test.json', 'r'):
    test.append(json.loads(line))
df_pre = pd.DataFrame(test)
df_pre = df_pre.drop(['image'], axis=1)
df_pre = df_pre.fillna('') # fill NaN with ''
df_pre

Unnamed: 0,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,price,itemID,reviewHash
0,"03 26, 2015",u32476110,"Fantastic mix of ""old school"" with a creative ...","Fantastic mix of ""old school"" with a creative ...",1427328000,Pop,$14.98,p76243483,20167847
1,"05 15, 2017",u36732410,Update: Indications\nThere are various opinion...,Digitally Extracted Stereo (DES) Rules!,1494806400,Pop,$15.16,p92485419,30527605
2,"06 4, 2015",u85385007,This album provides a new twist on old Sammy H...,Excellent unplugged album,1433376000,Pop,$7.37,p40031588,12169432
3,"04 23, 2009",u30715529,(Symbol) can be considered as another masterpi...,another masterpiece,1240444800,Pop,$12.45,p88719785,55648615
4,"09 15, 2000",u95909892,Many would think this album is good only becau...,True Classic Rock,968976000,Alternative Rock,$2.07,p59188380,09520938
...,...,...,...,...,...,...,...,...,...
9995,"05 5, 2004",u75493520,Covering most of No Doubt's most popular songs...,3 1/2 stars for a really good compilation,1083715200,Alternative Rock,$5.97,p47578844,83258532
9996,"12 19, 2015",u85298918,Good Quality,Three Stars,1450483200,Alternative Rock,$8.45,p49438960,05929877
9997,"01 31, 2012",u73735798,Najee's album are great. I have yet to see hi...,Glad to have another gem,1327968000,Jazz,$17.31,p85068973,45399990
9998,"08 25, 2015",u67631525,I bought this for Get It On and I like that song.,Three Stars,1440460800,Alternative Rock,$2.07,p59188380,66288872


## Data preprocessing

#### price

In [4]:
df['price'] = df['price'].str.replace('$', '')
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['price'].fillna(df['price'].mode()[0],inplace=True) # fill NaN with mode

# pre
df_pre['price'] = df_pre['price'].str.replace('$', '')
df_pre['price'] = pd.to_numeric(df_pre['price'], errors='coerce')
df_pre['price'].fillna(df_pre['price'].mode()[0],inplace=True) # fill NaN with mode

#### category

In [5]:
df = pd.get_dummies(data=df, columns=['category'])

# pred
df_pre = pd.get_dummies(data=df_pre, columns=['category'])

## Feature engineering

#### unixReviewTime

In [6]:
df['year'] = pd.to_numeric(df['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%Y')), errors='coerce')
df['month'] = pd.to_numeric(df['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%m')), errors='coerce')
df['day'] = pd.to_numeric(df['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%d')), errors='coerce') 
df['hour'] = pd.to_numeric(df['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%H')) , errors='coerce')
df['day_of_week'] = df['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x/1000).strftime("%A"))

# pred
df_pre['year'] = pd.to_numeric(df_pre['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%Y')), errors='coerce') 
df_pre['month'] = pd.to_numeric(df_pre['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%m')) , errors='coerce')
df_pre['day'] = pd.to_numeric(df_pre['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%d')), errors='coerce') 
df_pre['hour'] = pd.to_numeric(df_pre['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x).strftime('%H')), errors='coerce')
df_pre['day_of_week'] = df_pre['unixReviewTime'].apply(lambda x: datetime.fromtimestamp(x/1000).strftime("%A"))

In [7]:
df = pd.get_dummies(data=df, columns=['day_of_week'])

# pred
df_pre = pd.get_dummies(data=df_pre, columns=['day_of_week'])

### Text data clean

#### reviewText

In [8]:
%%time
df['reviewText_clean'] = df['reviewText'].astype(str)
df['reviewText_clean'] = df['reviewText_clean'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['reviewText_clean'] = df['reviewText_clean'].str.replace('[^\w\s]','')
stop = stopwords.words('english')
df['reviewText_clean'] = df['reviewText_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
lem = WordNetLemmatizer()
df['reviewText_clean'] = df['reviewText_clean'].apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))
st = PorterStemmer()
df['reviewText_clean'] = df['reviewText_clean'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

# pred
df_pre['reviewText_clean'] = df_pre['reviewText'].astype(str)
df_pre['reviewText_clean'] = df_pre['reviewText_clean'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df_pre['reviewText_clean'] = df_pre['reviewText_clean'].str.replace('[^\w\s]','')
df_pre['reviewText_clean'] = df_pre['reviewText_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df_pre['reviewText_clean'] = df_pre['reviewText_clean'].apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))
df_pre['reviewText_clean'] = df_pre['reviewText_clean'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

Wall time: 6min 39s


#### summary

In [9]:
df['summary_clean'] = df['summary'].astype(str)
df['summary_clean'] = df['summary_clean'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['summary_clean'] = df['summary_clean'].str.replace('[^\w\s]','')
df['summary_clean'] = df['summary_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['summary_clean'] = df['summary_clean'].apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))
df['summary_clean'] = df['summary_clean'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

# pred
df_pre['summary_clean'] = df_pre['summary'].astype(str)
df_pre['summary_clean'] = df_pre['summary_clean'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df_pre['summary_clean'] = df_pre['summary_clean'].str.replace('[^\w\s]','')
df_pre['summary_clean'] = df_pre['summary_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df_pre['summary_clean'] = df_pre['summary_clean'].apply(lambda x: " ".join([lem.lemmatize(word) for word in x.split()]))
df_pre['summary_clean'] = df_pre['summary_clean'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

### Sentiment score

In [None]:
'''
%%time
def textblob_senti_polarity(x):
    return TextBlob(x).sentiment.polarity  

df['polar_review'] = df['reviewText_clean'].apply(textblob_senti_polarity)
df['polar_summary'] = df['summary_clean'].apply(textblob_senti_polarity)
# pred
df_pre['polar_review'] = df_pre['reviewText_clean'].apply(textblob_senti_polarity)
df_pre['polar_summary'] = df_pre['summary_clean'].apply(textblob_senti_polarity)
'''

In [10]:
%%time
SIA = SentimentIntensityAnalyzer()

df['polar_review'] = df['reviewText_clean'].apply(lambda x: SIA.polarity_scores(x)['compound'])
df['polar_summary'] = df['summary_clean'].apply(lambda x: SIA.polarity_scores(x)['compound'])
# pred
df_pre['polar_review'] = df_pre['reviewText_clean'].apply(lambda x: SIA.polarity_scores(x)['compound'])
df_pre['polar_summary'] = df_pre['summary_clean'].apply(lambda x: SIA.polarity_scores(x)['compound'])

Wall time: 5min 9s


### Data split

In [11]:
df_target = df['overall'] 
df_train = df.drop(['overall'], axis = 1)
feature_train, feature_test, y_train, y_test = train_test_split(
    df_train, df_target.values, random_state=42, test_size=0.3)
feature_train = feature_train.reset_index(drop=True)
feature_test = feature_test.reset_index(drop=True)

#### reviewText

In [12]:
vectorizer = CountVectorizer(max_features = 3000, ngram_range = (1,2)).fit(feature_train['reviewText_clean']) 

ma_r_train = vectorizer.transform(feature_train['reviewText_clean'])
review_train = pd.DataFrame(data = ma_r_train.toarray(), columns = vectorizer.get_feature_names())

# test
ma_r_test = vectorizer.transform(feature_test['reviewText_clean'])
review_test = pd.DataFrame(data = ma_r_test.toarray(), columns = vectorizer.get_feature_names())

# pred
ma_r_pre = vectorizer.transform(df_pre['reviewText_clean'])
df_review_pre = pd.DataFrame(data = ma_r_pre.toarray(), columns = vectorizer.get_feature_names())

#### summary

In [13]:
s_vectorizer =  CountVectorizer( max_features = 3000, ngram_range = (1,2)).fit(feature_train['summary_clean'])

ma_s_train = s_vectorizer.transform(feature_train['summary_clean'])
summary_train =  pd.DataFrame(data = ma_s_train.toarray(), columns = s_vectorizer.get_feature_names())

# test
ma_s_test = s_vectorizer.transform(feature_test['summary_clean'])
summary_test =  pd.DataFrame(data = ma_s_test.toarray(), columns = s_vectorizer.get_feature_names())

# pred
ma_s_pre = s_vectorizer.transform(df_pre['summary_clean'])
df_summary_pre =  pd.DataFrame(data = ma_s_pre.toarray(), columns = s_vectorizer.get_feature_names())

### Combine features

In [14]:
X_train = feature_train.drop(['reviewTime','reviewerID','reviewText','summary','unixReviewTime','itemID',
                              'reviewHash','reviewText_clean','summary_clean'], axis=1)
X_train = pd.concat([X_train,review_train,summary_train], axis = 1)

# test
X_test = feature_test.drop(['reviewTime','reviewerID','reviewText','summary','unixReviewTime','itemID',
                            'reviewHash','reviewText_clean','summary_clean'], axis=1)
X_test = pd.concat([X_test,review_test,summary_test], axis = 1)

# pred
df_feature_pre = df_pre.drop(['reviewTime','reviewerID','reviewText','summary','unixReviewTime','itemID',
                              'reviewHash','reviewText_clean','summary_clean'], axis=1)
df_feature_pre = pd.concat([df_feature_pre,df_review_pre,df_summary_pre], axis = 1)

In [20]:
X_train

Unnamed: 0,price,category_Alternative Rock,category_Classical,category_Dance & Electronic,category_Jazz,category_Pop,year,month,day,hour,...,young,your,youth,youv,youv never,yuck,zappa,zero,zombi,zone
0,12.31,1,0,0,0,0,2015,12,5,19,...,0,0,0,0,0,0,0,0,0,0
1,14.94,0,0,1,0,0,2011,5,3,20,...,0,0,0,0,0,0,0,0,0,0
2,7.28,0,0,0,0,1,2001,11,11,19,...,0,0,0,0,0,0,0,0,0,0
3,9.99,0,0,0,0,1,2014,10,7,20,...,0,0,0,0,0,0,0,0,0,0
4,11.98,0,0,0,1,0,2015,7,4,20,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139995,10.95,0,0,0,1,0,2015,3,27,20,...,0,0,0,0,0,0,0,0,0,0
139996,10.99,1,0,0,0,0,2017,6,6,20,...,0,0,0,0,0,0,0,0,0,0
139997,5.12,1,0,0,0,0,2002,9,13,20,...,0,0,0,0,0,0,0,0,0,0
139998,11.11,1,0,0,0,0,2006,6,16,20,...,0,0,0,0,0,0,0,0,0,0


In [71]:
X_test

Unnamed: 0,price,category_Alternative Rock,category_Classical,category_Dance & Electronic,category_Jazz,category_Pop,year,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,...,worth wait,worthy,wow,wrong,yeah,year.1,years,yes,york,young
0,13.98,0,0,0,0,1,2006,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,14.95,0,0,0,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11.56,0,0,0,0,1,2012,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,12.97,1,0,0,0,0,2004,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,15.77,0,1,0,0,0,2006,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,22.87,0,0,0,0,1,2011,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,1.13,1,0,0,0,0,2004,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,3.76,0,1,0,0,0,2003,0,1,0,...,0,0,0,0,0,0,0,0,0,0
59998,5.58,0,0,0,0,1,2003,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
df_feature_pre

Unnamed: 0,price,category_Alternative Rock,category_Classical,category_Dance & Electronic,category_Jazz,category_Pop,year,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,...,worth wait,worthy,wow,wrong,yeah,year.1,years,yes,york,young
0,14.98,0,0,0,0,1,2015,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,15.16,0,0,0,0,1,2017,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7.37,0,0,0,0,1,2015,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,12.45,0,0,0,0,1,2009,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.07,1,0,0,0,0,2000,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,5.97,1,0,0,0,0,2004,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,8.45,1,0,0,0,0,2015,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9997,17.31,0,0,0,1,0,2012,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,2.07,1,0,0,0,0,2015,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Model implementation

### Scale data

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_pre = scaler.transform(df_feature_pre)

### Multi-layer perceptron

In [None]:
'''
mlp = MLPRegressor(alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000) 
mlp.fit(X_train_scaled,y_train)
y_pred = mlp.predict(X_test_scaled)
print("MSE on test set is",mean_squared_error(y_test, y_pred))
'''

### Gradient Boosting

In [None]:
'''
gbr = GradientBoostingRegressor(random_state=0,max_depth=12)
gbr.fit(X_train_scaled,y_train)
y_pred = gbr.predict(X_test_scaled)
print("MSE on test set is",mean_squared_error(y_test, y_pred))
'''

### Random forest

In [None]:
'''
rf = RandomForestRegressor(max_depth=20)
rf.fit(X_train_scaled,y_train)
y_pred = rf.predict(X_test_scaled)
print("MSE on test set is",mean_squared_error(y_test, y_pred))
'''

### Deep neural networks (Optimal)

In [19]:
nn = Sequential()
nn.add(Dense(64, activation='relu',kernel_regularizer=regularizers.l2(0.0001)))
nn.add(Dropout(0.3))
nn.add(Dense(64, activation='relu',kernel_regularizer=regularizers.l2(0.0001)))
nn.add(Dropout(0.3))
nn.add(Dense(1))
nn.compile(optimizer = 'adam', loss = 'mse', metrics = ['mse'])
nn.fit(X_train_scaled,y_train, epochs = 20)

y_pred = nn.predict(X_test_scaled)
print("MSE on test set is",mean_squared_error(y_test, y_pred))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE on test set is 0.481002375660096


## Prediction on test data

In [37]:
rate_pred = nn.predict(X_pre)

### Prediction refinement

In [38]:
rate_pred_r = []

for i in rate_pred:
    i = i.item()
    if i > 5:
        i = 5
    elif i < 1:
        i = 1
    rate_pred_r.append(i)

### Format prediction

In [49]:
predictions = open('rating_prediction.csv', 'w')
i = 0
for l in open('rating_pairs.csv'):
    if l.startswith('userID'):
        #header
        predictions.write(l)
        continue
    u,p = l.strip().split('-')
    predictions.write(u + '-' + p +  ',' + str(rate_pred_r[i]) + '\n')
    i += 1