In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from category_encoders.woe import WOEEncoder
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
#nltk.download('stopwords')
from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
%matplotlib inline

### Data Import

In [2]:
train_a = pd.read_csv('./data/train/train.csv')
train_b = pd.read_csv('./data/train/game_overview.csv')
test = pd.read_csv('./data/test/test.csv')
print("Data Import Done :",train_a.shape)
print("Data Import Done :",train_b.shape)
print("Data Import Done :",test.shape)

Data Import Done : (17494, 5)
Data Import Done : (64, 5)
Data Import Done : (8045, 4)


In [3]:
train_a.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [4]:
train_b.head()

Unnamed: 0,title,developer,publisher,tags,overview
0,Spooky's Jump Scare Mansion,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,Sakura Clicker,Winged Cloud,Winged Cloud,"['Nudity', 'Anime', 'Free to Play', 'Mature', ...",The latest entry in the Sakura series is more ...
2,WARMODE,WARTEAM,WARTEAM,"['Early Access', 'Free to Play', 'FPS', 'Multi...",Free to play shooter about the confrontation o...
3,Fractured Space,Edge Case Games Ltd.,Edge Case Games Ltd.,"['Space', 'Multiplayer', 'Free to Play', 'PvP'...",Take the helm of a gigantic capital ship and g...
4,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,"['FPS', 'Multiplayer', 'Shooter', 'Action', 'T...",Counter-Strike: Global Offensive (CS: GO) expa...


In [5]:
test.head()

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...


In [6]:
train = train_a.merge(train_b, on=['title'], how='left')
print("Data Merged :", train.shape)
train.head()

Data Merged : (17494, 9)


Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...


In [7]:
train.isna().sum() #so, year is missing in 178 reviews.

review_id            0
title                0
year               178
user_review          0
user_suggestion      0
developer            0
publisher            0
tags                 0
overview             0
dtype: int64

In [8]:
train[train['year'].isna()]['title'].value_counts() #missing year will be assigned the median year for its title

Ring of Elysium                                       24
War Thunder                                           17
Bless Online                                          12
Realm Royale                                          11
Eternal Card Game                                     11
Cuisine Royale                                         9
Heroes & Generals                                      8
The Elder Scrolls®: Legends™                           7
Yu-Gi-Oh! Duel Links                                   7
Bloons TD Battles                                      7
Creativerse                                            6
DCS World Steam Edition                                6
Team Fortress 2                                        5
World of Tanks Blitz                                   5
Dota 2                                                 5
AdventureQuest 3D                                      5
Tactical Monsters Rumble Arena                         4
Path of Exile                  

In [9]:
title_with_missingyear = list(train[train['year'].isna()]['title'].unique())
MVI_data = train[train['title'].isin(title_with_missingyear)].groupby(['title'])['year'].median()
MVI_data.shape #series with title as index and its median year as value

(32,)

In [231]:
MVI_data.to_pickle('MVI_data.pkl')

In [10]:
# train_v2 is train data with MVI for year
train_v2 = train.copy()
train_v2['year']=np.where(train['year'].isna(), MVI_data[train['title']], train['year'])
train_v2.isna().sum()

review_id          0
title              0
year               0
user_review        0
user_suggestion    0
developer          0
publisher          0
tags               0
overview           0
dtype: int64

### Review Text Preprocessing

In [11]:
# Adding new features(to make sure we don't lose any important information while cleaning review)
train_v2['link_cnt'] = train_v2['user_review'].apply(lambda x : len(re.findall(r'https?:\/\/.*', x.lower())))
train_v2['word_cnt'] = train_v2['user_review'].apply(lambda x : len(nltk.word_tokenize(x)))
train_v2['sent_cnt'] = train_v2['user_review'].apply(lambda x : len(nltk.sent_tokenize(x)))

In [12]:
def decontract(phrase):
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [13]:
def clean(phrase):
    phrase = phrase.lower()
    phrase = decontract(phrase)
    phrase = re.sub(r'https?:\/\/.*', ' ', phrase)
    phrase = re.sub(r'\W', ' ', phrase)
    phrase = re.sub(r'\d', ' ', phrase)
    phrase = re.sub(r'\s+', ' ', phrase)
    phrase = re.sub(r'\s+$', ' ', phrase)
    phrase = re.sub(r'^\s+', ' ', phrase)
    return phrase

In [14]:
def lemmzify(sentn):
    lemmz = WordNetLemmatizer()
    lem_words=[]
    for word in nltk.word_tokenize(sentn):
        if word not in stopwords.words('english'):
            lem_words.append(lemmz.lemmatize(word))
    lem_text = ' '.join(lem_words)
    return lem_text
# autocorrect and word negation hasn't been handled yet.

In [15]:
train_v2['lemmz_review'] = train_v2['user_review'].apply(lambda x : lemmzify(clean(x)))

In [24]:
train_v2.to_pickle('./train_v2.pkl')
train_v2.head(3)

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview,link_cnt,word_cnt,sent_cnt,lemmz_review
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...,0,152,5,scared hearing creepy voice pause moment write...
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...,0,61,3,best game better sam pepper youtube account ne...
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...,0,84,5,littly iffy control know play easy master made...


In [25]:
# Next Steps :
# encoding the titles, developer; tags flags; overview lemmatization(and if possible lsa, get 4 hidden concepts).

In [216]:
# encoding the titles and developer

encode_woe = WOEEncoder()
encode_woe.fit(train_v2[['title', 'developer']],train_v2['user_suggestion'])
train_woe = encode_woe.transform(train_v2[['title', 'developer']])
train_woe.rename(columns={'title':'title_woe', 'developer':'developer_woe'}, inplace=True)
print(train_woe.head())
with open('encode_woe.pickle','wb') as f:
    pickle.dump(encode_woe,f)

   title_woe  developer_woe
0   1.755931       1.755931
1   1.755931       1.755931
2   1.755931       1.755931
3   1.755931       1.755931
4   1.755931       1.755931


In [108]:
tags = train_v2['tags'].copy()
all_tags =set()
for t in tags:
    t = re.sub(r'[\'\[\]]', '', t)
    t = re.sub(r'\s+', '', t).split(',')
    all_tags = all_tags.union(set(t))
len(all_tags) # 140 tags in total, so we better lemmatize tags as well instead of one-hot.
#Do keep in mind that tags based tfidf features will have high correlation.
#lemmz_tags can be used to categorize games using LSA.

140

In [113]:
train_v3 = train_v2.copy()
train_v3['lemmz_tags'] = train_v3['tags'].apply(lambda x : lemmzify(clean(x)))
train_v3['lemmz_overview'] = train_v3['overview'].apply(lambda x : lemmzify(clean(x)))
train_v3.to_pickle('./train_v3.pkl')
train_v3.head(3)

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview,link_cnt,word_cnt,sent_cnt,lemmz_review,lemmz_tags,lemmz_overview
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...,0,152,5,scared hearing creepy voice pause moment write...,horror free play cute first person isingleplay...,survive room cute terror break cuteness start ...
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...,0,61,3,best game better sam pepper youtube account ne...,horror free play cute first person isingleplay...,survive room cute terror break cuteness start ...
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...,0,84,5,littly iffy control know play easy master made...,horror free play cute first person isingleplay...,survive room cute terror break cuteness start ...


In [145]:
tfidf_review = TfidfVectorizer(min_df=0.03, max_df=0.60, ngram_range=(1,3))
X = tfidf_review.fit_transform(train_v3['lemmz_review']).toarray()
print(X.shape) # 365 features

with open('tfidf_review.pickle','wb') as f:
    pickle.dump(tfidf_review,f)

(17494, 365)


In [146]:
review_tfidf = pd.DataFrame(X, columns=tfidf_review.get_feature_names(), index=train_v3.index)
review_tfidf.head(3)

Unnamed: 0,ability,able,absolutely,access,access reviewi,access reviewthis,access reviewthis game,account,action,actually,...,world,worse,worst,worth,would,would recommend,wrong,year,yes,yet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31694,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284827
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [207]:
tfidf_tags = TfidfVectorizer(min_df=200,max_df=0.60)
X = tfidf_tags.fit_transform(train_v3['lemmz_tags']).toarray()
print(X.shape) # 143 features

with open('tfidf_tags.pickle','wb') as f:
    pickle.dump(tfidf_tags,f)

(17494, 143)


In [208]:
lsa_tags = TruncatedSVD(n_components=30, n_iter=100) #so,it will search for existence of 30 different concepts
lsa_tags.fit(X) #SVD done; creates the Vtranspose matrix of SVD(U*S*Vtr.) for tfidf matrix
print(lsa_tags.explained_variance_ratio_.sum()) #94% explained in 30 concepts.

with open('lsa_tags.pickle','wb') as f:
    pickle.dump(lsa_tags,f)

0.9412416313790785


In [209]:
tags_lsa = pd.DataFrame(lsa_tags.transform(X), columns=["tag_grp"+str(i) for i in range(1,31)], index=train_v3.index)
tags_lsa.head(3)

Unnamed: 0,tag_grp1,tag_grp2,tag_grp3,tag_grp4,tag_grp5,tag_grp6,tag_grp7,tag_grp8,tag_grp9,tag_grp10,...,tag_grp21,tag_grp22,tag_grp23,tag_grp24,tag_grp25,tag_grp26,tag_grp27,tag_grp28,tag_grp29,tag_grp30
0,0.281143,-0.00081,-0.073856,-0.183002,-0.125792,-0.227885,0.565898,-0.447452,-0.169335,0.063474,...,0.001835,0.067325,0.113722,0.061869,-0.060336,0.024091,-0.035651,0.200605,0.003918,0.194179
1,0.281143,-0.00081,-0.073856,-0.183002,-0.125792,-0.227885,0.565898,-0.447452,-0.169335,0.063474,...,0.001835,0.067325,0.113722,0.061869,-0.060336,0.024091,-0.035651,0.200605,0.003918,0.194179
2,0.281143,-0.00081,-0.073856,-0.183002,-0.125792,-0.227885,0.565898,-0.447452,-0.169335,0.063474,...,0.001835,0.067325,0.113722,0.061869,-0.060336,0.024091,-0.035651,0.200605,0.003918,0.194179


In [210]:
tfidf_overview = TfidfVectorizer(min_df=0.05,max_df=0.60, ngram_range=(1,2))
X = tfidf_overview.fit_transform(train_v3['lemmz_overview']).toarray()
print(X.shape) # 741 features

with open('tfidf_overview.pickle','wb') as f:
    pickle.dump(tfidf_overview,f)

(17494, 741)


In [211]:
lsa_overview = TruncatedSVD(n_components=30, n_iter=100) #so,it will search for existence of 30 different concepts
lsa_overview.fit(X) #SVD done; creates the Vtranspose matrix of SVD(U*S*Vtr.) for tfidf matrix
print(lsa_overview.explained_variance_ratio_.sum()) #94% explained in 30 concepts.

with open('lsa_overview.pickle','wb') as f:
    pickle.dump(lsa_overview,f)

0.9041102578203466


In [213]:
overview_lsa = pd.DataFrame(lsa_overview.transform(X), columns=["overview_grp"+str(i) for i in range(1,31)], index=train_v3.index)
overview_lsa.head(3)

Unnamed: 0,overview_grp1,overview_grp2,overview_grp3,overview_grp4,overview_grp5,overview_grp6,overview_grp7,overview_grp8,overview_grp9,overview_grp10,...,overview_grp21,overview_grp22,overview_grp23,overview_grp24,overview_grp25,overview_grp26,overview_grp27,overview_grp28,overview_grp29,overview_grp30
0,0.091323,-0.006355,-0.02203,0.103699,-0.031638,0.029281,-0.119266,0.067962,0.12063,0.08327,...,0.502685,0.126231,-0.355397,-0.16243,-0.130749,-0.157616,0.11516,0.070342,-0.0376,0.073285
1,0.091323,-0.006355,-0.02203,0.103699,-0.031638,0.029281,-0.119266,0.067962,0.12063,0.08327,...,0.502685,0.126231,-0.355397,-0.16243,-0.130749,-0.157616,0.11516,0.070342,-0.0376,0.073285
2,0.091323,-0.006355,-0.02203,0.103699,-0.031638,0.029281,-0.119266,0.067962,0.12063,0.08327,...,0.502685,0.126231,-0.355397,-0.16243,-0.130749,-0.157616,0.11516,0.070342,-0.0376,0.073285


### Preparing the final train-test datasets

In [229]:
train_final = pd.concat([train_v3, train_woe, review_tfidf, tags_lsa, overview_lsa], axis=1)
train_final.to_pickle('train_final.pkl')

In [235]:
test = test.merge(train_b, on=['title'], how='left')
test_v2 = test.copy()

# MVI for year
test_v2['year']=np.where(test_v2['year'].isna(), MVI_data[test_v2['title']], test_v2['year'])

# Adding new features
test_v2['link_cnt'] = test_v2['user_review'].apply(lambda x : len(re.findall(r'https?:\/\/.*', x.lower())))
test_v2['word_cnt'] = test_v2['user_review'].apply(lambda x : len(nltk.word_tokenize(x)))
test_v2['sent_cnt'] = test_v2['user_review'].apply(lambda x : len(nltk.sent_tokenize(x)))

# WOE encoding
test_woe = encode_woe.transform(test_v2[['title', 'developer']])
test_woe.rename(columns={'title':'title_woe', 'developer':'developer_woe'}, inplace=True)
test_v2 = pd.concat([test_v2, test_woe], axis=1)

# Lemmatization
test_v2['lemmz_review'] = test_v2['user_review'].apply(lambda x : lemmzify(clean(x)))
test_v2['lemmz_tags'] = test_v2['tags'].apply(lambda x : lemmzify(clean(x)))
test_v2['lemmz_overview'] = test_v2['overview'].apply(lambda x : lemmzify(clean(x)))

test_v2.to_pickle('test_v2.pkl')
# a note to self : new developer categories showed up a lot, so its woe won't be of any help at all.
# forgot to remove lone characters in sentences before lemmatization. Must have affected tfidf and lsa.

In [248]:
# TfIdf based features for review
X = tfidf_review.transform(test_v2['lemmz_review']).toarray()
review_tfidf = pd.DataFrame(X, columns=tfidf_review.get_feature_names(), index=test_v2.index)

# LSA based features for tags
X = tfidf_tags.transform(test_v2['lemmz_tags']).toarray()
tags_lsa = pd.DataFrame(lsa_tags.transform(X), columns=["tag_grp"+str(i) for i in range(1,31)], index=test_v2.index)

# LSA based features for overview
X = tfidf_overview.transform(test_v2['lemmz_overview']).toarray()
overview_lsa = pd.DataFrame(lsa_overview.transform(X), columns=["overview_grp"+str(i) for i in range(1,31)], index=test_v2.index)

# Final test data
test_final = pd.concat([test_v2, review_tfidf, tags_lsa, overview_lsa], axis=1)
test_final.to_pickle('test_final.pkl')

### Random Forest Classifier

In [2]:
train = pd.read_pickle('train_final.pkl')
test = pd.read_pickle('test_final.pkl')
print(train.shape, test.shape)
drop_features=['title', 'user_review', 'developer', 'publisher', 'tags', 'overview', 'lemmz_review', 'lemmz_tags', 'lemmz_overview']
train.drop(drop_features, axis=1, inplace=True)
test.drop(drop_features, axis=1, inplace=True)
print(train.shape, test.shape)

(17494, 442) (8045, 441)
(17494, 432) (8045, 431)


In [3]:
x_train, x_val, y_train, y_val = train_test_split(train.drop(['user_suggestion'], axis=1).iloc[:,1:],
                                                  train['user_suggestion'], train_size=0.75)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(13120, 430) (4374, 430) (13120,) (4374,)


In [4]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=0.05, verbose=1, max_features='sqrt')
model.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    3.8s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=0.05,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=1, warm_start=False)

In [7]:
pred = model.predict(x_val)
accuracy_score(y_val, pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished


0.7501143118427069

In [8]:
# Generating Submissions
pred = model.predict(test.iloc[:,1:])

submission05 = pd.DataFrame({'review_id' : list(test['review_id']),
                            'user_suggestion' : list(pred)})
submission05.to_csv('submission05.csv', index=False)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').