In [18]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 


In [19]:
%matplotlib inline

In [20]:
train  = pd.read_csv('train.csv') 
test = pd.read_csv('test.csv')

In [21]:
train.shape

(17494, 5)

In [22]:
test.shape

(8045, 4)

In [23]:
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [24]:
game_overview  = pd.read_csv('game_overview.csv') 

In [25]:
game_overview.head()

Unnamed: 0,title,developer,publisher,tags,overview
0,Spooky's Jump Scare Mansion,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,Sakura Clicker,Winged Cloud,Winged Cloud,"['Nudity', 'Anime', 'Free to Play', 'Mature', ...",The latest entry in the Sakura series is more ...
2,WARMODE,WARTEAM,WARTEAM,"['Early Access', 'Free to Play', 'FPS', 'Multi...",Free to play shooter about the confrontation o...
3,Fractured Space,Edge Case Games Ltd.,Edge Case Games Ltd.,"['Space', 'Multiplayer', 'Free to Play', 'PvP'...",Take the helm of a gigantic capital ship and g...
4,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,"['FPS', 'Multiplayer', 'Shooter', 'Action', 'T...",Counter-Strike: Global Offensive (CS: GO) expa...


In [26]:
combi = train.append(test,ignore_index=True)

In [27]:
combi.shape

(25539, 5)

In [28]:
combi.head()

Unnamed: 0,review_id,title,user_review,user_suggestion,year
0,1,Spooky's Jump Scare Mansion,I'm scared and hearing creepy voices. So I'll...,1.0,2016.0
1,2,Spooky's Jump Scare Mansion,"Best game, more better than Sam Pepper's YouTu...",1.0,2016.0
2,3,Spooky's Jump Scare Mansion,"A littly iffy on the controls, but once you kn...",1.0,2016.0
3,4,Spooky's Jump Scare Mansion,"Great game, fun and colorful and all that.A si...",1.0,2015.0
4,5,Spooky's Jump Scare Mansion,Not many games have the cute tag right next to...,1.0,2015.0


In [29]:
combi['clean_user_review'] = combi['user_review'].str.replace("[^A-Za-z0-9]"," ")

In [30]:
combi.head()

Unnamed: 0,review_id,title,user_review,user_suggestion,year,clean_user_review
0,1,Spooky's Jump Scare Mansion,I'm scared and hearing creepy voices. So I'll...,1.0,2016.0,I m scared and hearing creepy voices So I ll...
1,2,Spooky's Jump Scare Mansion,"Best game, more better than Sam Pepper's YouTu...",1.0,2016.0,Best game more better than Sam Pepper s YouTu...
2,3,Spooky's Jump Scare Mansion,"A littly iffy on the controls, but once you kn...",1.0,2016.0,A littly iffy on the controls but once you kn...
3,4,Spooky's Jump Scare Mansion,"Great game, fun and colorful and all that.A si...",1.0,2015.0,Great game fun and colorful and all that A si...
4,5,Spooky's Jump Scare Mansion,Not many games have the cute tag right next to...,1.0,2015.0,Not many games have the cute tag right next to...


In [31]:
combi['clean_user_review'] = combi['clean_user_review'].apply(lambda x:' '.join([w for w in x.split() if len(w)>3]))
combi.head()

Unnamed: 0,review_id,title,user_review,user_suggestion,year,clean_user_review
0,1,Spooky's Jump Scare Mansion,I'm scared and hearing creepy voices. So I'll...,1.0,2016.0,scared hearing creepy voices pause moment writ...
1,2,Spooky's Jump Scare Mansion,"Best game, more better than Sam Pepper's YouTu...",1.0,2016.0,Best game more better than Pepper YouTube acco...
2,3,Spooky's Jump Scare Mansion,"A littly iffy on the controls, but once you kn...",1.0,2016.0,littly iffy controls once know play very easy ...
3,4,Spooky's Jump Scare Mansion,"Great game, fun and colorful and all that.A si...",1.0,2015.0,Great game colorful that side note though When...
4,5,Spooky's Jump Scare Mansion,Not many games have the cute tag right next to...,1.0,2015.0,many games have cute right next horror Steam f...


In [32]:
tokenized_tweet = combi['clean_user_review'].apply(lambda x: x.split())

In [33]:
from nltk.stem.porter import * 
stemmer = PorterStemmer() 

In [34]:
tokenized_tweet

0        [scared, hearing, creepy, voices, pause, momen...
1        [Best, game, more, better, than, Pepper, YouTu...
2        [littly, iffy, controls, once, know, play, ver...
3        [Great, game, colorful, that, side, note, thou...
4        [many, games, have, cute, right, next, horror,...
                               ...                        
25534    [Early, Access, ReviewGuns, Positive, Good, Id...
25535    [After, review, SPEND, MONEY, would, seam, bet...
25536    [Pros, GameplayCons, Micro, transactions, Does...
25537    [Actualy, saucy, definetly, suggest, players, ...
25538    [people, that, play, minutes, then, that, game...
Name: clean_user_review, Length: 25539, dtype: object

In [35]:
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])

In [36]:
tokenized_tweet

0        [scare, hear, creepi, voic, paus, moment, writ...
1        [best, game, more, better, than, pepper, youtu...
2        [littli, iffi, control, onc, know, play, veri,...
3        [great, game, color, that, side, note, though,...
4        [mani, game, have, cute, right, next, horror, ...
                               ...                        
25534    [earli, access, reviewgun, posit, good, idea, ...
25535    [after, review, spend, money, would, seam, bet...
25536    [pro, gameplaycon, micro, transact, doesn, rea...
25537    [actuali, sauci, definetli, suggest, player, l...
25538    [peopl, that, play, minut, then, that, game, o...
Name: clean_user_review, Length: 25539, dtype: object

In [37]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])    
combi['clean_user_review'] = tokenized_tweet

In [38]:
combi.head()

Unnamed: 0,review_id,title,user_review,user_suggestion,year,clean_user_review
0,1,Spooky's Jump Scare Mansion,I'm scared and hearing creepy voices. So I'll...,1.0,2016.0,scare hear creepi voic paus moment write revie...
1,2,Spooky's Jump Scare Mansion,"Best game, more better than Sam Pepper's YouTu...",1.0,2016.0,best game more better than pepper youtub accou...
2,3,Spooky's Jump Scare Mansion,"A littly iffy on the controls, but once you kn...",1.0,2016.0,littli iffi control onc know play veri easi ma...
3,4,Spooky's Jump Scare Mansion,"Great game, fun and colorful and all that.A si...",1.0,2015.0,great game color that side note though when ge...
4,5,Spooky's Jump Scare Mansion,Not many games have the cute tag right next to...,1.0,2015.0,mani game have cute right next horror steam fi...


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [40]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

In [41]:
tfidf = tfidf_vectorizer.fit_transform(combi['clean_user_review'])

In [42]:
tfidf.shape

(25539, 1000)

In [43]:
tokenized_review = combi['clean_user_review'].apply(lambda x: x.split())

In [48]:
tokenized_review

0        [scare, hear, creepi, voic, paus, moment, writ...
1        [best, game, more, better, than, pepper, youtu...
2        [littli, iffi, control, onc, know, play, veri,...
3        [great, game, color, that, side, note, though,...
4        [mani, game, have, cute, right, next, horror, ...
                               ...                        
25534    [earli, access, reviewgun, posit, good, idea, ...
25535    [after, review, spend, money, would, seam, bet...
25536    [pro, gameplaycon, micro, transact, doesn, rea...
25537    [actuali, sauci, definetli, suggest, player, l...
25538    [peopl, that, play, minut, then, that, game, o...
Name: clean_user_review, Length: 25539, dtype: object

In [45]:
import gensim

In [46]:
model_w2v = gensim.models.Word2Vec(
            tokenized_review,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 2, # no.of cores
            seed = 34) 

In [47]:
model_w2v.train(tokenized_tweet, total_examples= len(combi['clean_user_review']), epochs=20)

(0, 244260180)

In [51]:
model_w2v.most_similar(positive='horror')

  """Entry point for launching an IPython kernel.


[('parodi', 0.7386826276779175),
 ('throwback', 0.7331022024154663),
 ('sidescrol', 0.7193974256515503),
 ('jumpscar', 0.710816502571106),
 ('spiritu', 0.7016054391860962),
 ('successor', 0.6996181011199951),
 ('grim', 0.6948860287666321),
 ('httyd', 0.6941152215003967),
 ('trekki', 0.6922910213470459),
 ('franchis', 0.6890436410903931)]

In [89]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary 
                        continue
    if(count != 0):
        vec /= count
    return vec

In [91]:
wordvec_arrays = np.zeros((len(tokenized_review), 200)) 
for i in range(len(tokenized_review)):
    wordvec_arrays[i,:] = word_vector(tokenized_review[i], 200)
    wordvec_df = pd.DataFrame(wordvec_arrays) 
    wordvec_df.shape

  


In [54]:
from tqdm import tqdm

In [55]:
tqdm.pandas(desc="progress-bar")

  from pandas import Panel


In [56]:
from gensim.models.doc2vec import LabeledSentence

In [57]:
def add_label(rev):
    output = []
    for i, s in zip(rev.index, rev):
        output.append(LabeledSentence(s, ["review_" + str(i)]))
    return output

In [62]:
labeled_revs = add_label(tokenized_review)

  after removing the cwd from sys.path.


In [63]:
labeled_revs[:2]

[LabeledSentence(words=['scare', 'hear', 'creepi', 'voic', 'paus', 'moment', 'write', 'review', 'while', 'wait', 'heart', 'beat', 'return', 'atleast', 'somewhat', 'calmer', 'time', 'thi', 'game', 'ador', 'creepi', 'like', 'happi', 'tree', 'friend', 'with', 'graphic', 'sceme', 'childhood', 'more', 'bubbl', 'clean', 'hello', '1990', 'what', 'charact', 'there', 'that', 'isnot', 'tri', 'kill', 'were', 'likabl', 'noob', 'thing', 'though', 'such', 'look', 'class', 'room', 'full', 'ghost', 'from', 'dead', 'children', 'let', 'shine', 'flashlight', 'them', 'stand', 'there', 'stare', 'them', 'creepi', 'music', 'turn', 'around', 'what', 'chase', 'never', 'befor', 'game', 'have', 'been', 'thi', 'afraid', 'find', 'lock', 'door'], tags=['review_0']),
 LabeledSentence(words=['best', 'game', 'more', 'better', 'than', 'pepper', 'youtub', 'account', '10what', 'need', 'play', 'computersom', 'extra', 'pant', 'pro', 'scari', 'hell', 'adventur', 'spooki', 'forgot', 'mention', 'that', 'scari', 'hell', 'more'

In [61]:
model_d2v = gensim.models.Doc2Vec(dm=1, # dm = 1 for ‘distributed memory’ model
                                  dm_mean=1, # dm = 1 for using mean of the context word vectors                                  size=200, # no. of desired features                                  
                                  window=5, # width of the context window                                  
                                  negative=7, # if > 0 then negative sampling will be used                                 
                                  min_count=5, # Ignores all words with total frequency lower than 2.                                  
                                  workers=3, # no. of cores                                  
                                  alpha=0.1, # learning rate                                  
                                  seed = 23) 

In [64]:
model_d2v.build_vocab([i for i in tqdm(labeled_revs)])

100%|██████████| 25539/25539 [00:00<00:00, 2134001.31it/s]


In [65]:
model_d2v.train(labeled_revs, total_examples= len(combi['clean_user_review']), epochs=15)

## Preparing doc2vec Feature set

In [92]:
#docvec_arrays = np.zeros((len(tokenized_review), 200)) 
#for i in range(len(combi)):
    #docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,200)) 

In [None]:
docvec_df = pd.DataFrame(docvec_arrays) 
docvec_df.shape

## Using Logistic Regression

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score

In [70]:
train_tfidf = tfidf[:17494,:]
test_tfidf = tfidf[17494:,:] 

In [71]:
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(train_tfidf, train['user_suggestion'],random_state=42,test_size=0.3)

In [73]:
lreg = LogisticRegression() 

In [74]:
lreg.fit(xtrain_tfidf, ytrain) 
prediction = lreg.predict_proba(xvalid_tfidf) 
prediction_int = prediction[:,1] >= 0.3 
prediction_int = prediction_int.astype(np.int) 
f1_score(yvalid, prediction_int)



0.8422421265530193

In [75]:
prediction

array([[0.87274525, 0.12725475],
       [0.67839784, 0.32160216],
       [0.03347603, 0.96652397],
       ...,
       [0.34605386, 0.65394614],
       [0.52036705, 0.47963295],
       [0.02274868, 0.97725132]])

In [77]:
test_pred = lreg.predict_proba(test_tfidf) 
test_pred_int = test_pred[:,1] >= 0.3 
test_pred_int = test_pred_int.astype(np.int) 
test['user_suggestion'] = test_pred_int 
submission = test[['review_id','user_suggestion']] 
submission.to_csv('sub_lreg_tfidf.csv', index=False) # writing data to a CSV file

## Using Support Vector machine

In [79]:
from sklearn import svm

In [80]:
svc = svm.SVC(kernel='linear', 
C=1, probability=True).fit(xtrain_tfidf, ytrain) 
prediction = svc.predict_proba(xvalid_tfidf) 
prediction_int = prediction[:,1] >= 0.3 
prediction_int = prediction_int.astype(np.int) 
f1_score(yvalid, prediction_int)

0.8521273430526628

In [None]:
test_pred = svc.predict_proba(test_tfidf) 
test_pred_int = test_pred[:,1] >= 0.3 
test_pred_int = test_pred_int.astype(np.int) 
test['user_suggestion'] = test_pred_int 
submission = test[['review_id','user_suggestion']] 
submission.to_csv('sub_svm_tfidf.csv', index=False)

## Using Random Forest

In [82]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_tfidf, ytrain) 
prediction = rf.predict(xvalid_tfidf) 
f1_score(yvalid, prediction)

0.839073060830383

In [85]:
test_pred = rf.predict(test_tfidf) 
test['user_suggestion'] = test_pred 
submission = test[['review_id','user_suggestion']] 
submission.to_csv('sub_rf_tfidf.csv', index=False)