# Sentimental Analysis in game titles

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
data=pd.read_csv('ign.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11


In [3]:
data.shape

(18625, 11)

In [4]:
data.score_phrase.value_counts()

Great          4773
Good           4741
Okay           2945
Mediocre       1959
Amazing        1804
Bad            1269
Awful           664
Painful         340
Unbearable       72
Masterpiece      55
Disaster          3
Name: score_phrase, dtype: int64

In [5]:
len(list(data.score_phrase.value_counts()))

11

In [6]:
list(data.score_phrase.value_counts().index)

['Great',
 'Good',
 'Okay',
 'Mediocre',
 'Amazing',
 'Bad',
 'Awful',
 'Painful',
 'Unbearable',
 'Masterpiece',
 'Disaster']

#### convert score_phrase to binary sentiment and add a new column called sentiment

In [7]:
bad_phrase=['Bad','Awful','Unbearable','Disaster','Painful']
data['sentiment']=data['score_phrase'].isin(bad_phrase).map({True:'Negative',False:"Positive"})

In [8]:
# Remove "Disaster"
data = data[data['score_phrase'] != 'Disaster']

##### No. of positive and negative sentiment

In [9]:
data.sentiment.value_counts()

Positive    16277
Negative     2345
Name: sentiment, dtype: int64

In [10]:
data.sentiment.value_counts(normalize=True)

Positive    0.874074
Negative    0.125926
Name: sentiment, dtype: float64

##### check for null values

In [11]:
data.isnull().sum()

Unnamed: 0         0
score_phrase       0
title              0
url                0
platform           0
score              0
genre             36
editors_choice     0
release_year       0
release_month      0
release_day        0
sentiment          0
dtype: int64

#### Fill out null values with empty string

In [12]:
data.fillna(value='', inplace=True)

In [13]:
data_copy = data[ ['sentiment', 'score_phrase', 'title', 'platform', 'genre', 'editors_choice'] ].copy()
data_copy.head(10)

Unnamed: 0,sentiment,score_phrase,title,platform,genre,editors_choice
0,Positive,Amazing,LittleBigPlanet PS Vita,PlayStation Vita,Platformer,Y
1,Positive,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,PlayStation Vita,Platformer,Y
2,Positive,Great,Splice: Tree of Life,iPad,Puzzle,N
3,Positive,Great,NHL 13,Xbox 360,Sports,N
4,Positive,Great,NHL 13,PlayStation 3,Sports,N
5,Positive,Good,Total War Battles: Shogun,Macintosh,Strategy,N
6,Negative,Awful,Double Dragon: Neon,Xbox 360,Fighting,N
7,Positive,Amazing,Guild Wars 2,PC,RPG,Y
8,Negative,Awful,Double Dragon: Neon,PlayStation 3,Fighting,N
9,Positive,Good,Total War Battles: Shogun,PC,Strategy,N


#### create a new column called is_editors_choice

In [14]:
data_copy['is_editors_choice']=data_copy['editors_choice'].map({'Y': 'editors_choice', 'N':''})
data_copy.head()

Unnamed: 0,sentiment,score_phrase,title,platform,genre,editors_choice,is_editors_choice
0,Positive,Amazing,LittleBigPlanet PS Vita,PlayStation Vita,Platformer,Y,editors_choice
1,Positive,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,PlayStation Vita,Platformer,Y,editors_choice
2,Positive,Great,Splice: Tree of Life,iPad,Puzzle,N,
3,Positive,Great,NHL 13,Xbox 360,Sports,N,
4,Positive,Great,NHL 13,PlayStation 3,Sports,N,


#### create a new column called test contains contents of several columns

In [15]:
data_copy['text'] = data_copy['title'] + ' ' + data_copy['platform'] + ' ' + data_copy['genre'] + ' ' + data_copy['is_editors_choice']

In [16]:
data_copy.head()

Unnamed: 0,sentiment,score_phrase,title,platform,genre,editors_choice,is_editors_choice,text
0,Positive,Amazing,LittleBigPlanet PS Vita,PlayStation Vita,Platformer,Y,editors_choice,LittleBigPlanet PS Vita PlayStation Vita Platf...
1,Positive,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,PlayStation Vita,Platformer,Y,editors_choice,LittleBigPlanet PS Vita -- Marvel Super Hero E...
2,Positive,Great,Splice: Tree of Life,iPad,Puzzle,N,,Splice: Tree of Life iPad Puzzle
3,Positive,Great,NHL 13,Xbox 360,Sports,N,,NHL 13 Xbox 360 Sports
4,Positive,Great,NHL 13,PlayStation 3,Sports,N,,NHL 13 PlayStation 3 Sports


In [17]:
data_copy.shape

(18622, 8)

##### Removing punctuation from text

In [19]:
import string

In [22]:
data_copy['text']=data_copy['text'].apply( lambda x : ' '.join(word.strip(string.punctuation) for word in x.split()))

In [23]:
data_copy['text'].head()

0    LittleBigPlanet PS Vita PlayStation Vita Platf...
1    LittleBigPlanet PS Vita  Marvel Super Hero Edi...
2                      Splice Tree of Life iPad Puzzle
3                               NHL 13 Xbox 360 Sports
4                          NHL 13 PlayStation 3 Sports
Name: text, dtype: object

In [24]:
X = data_copy.text
y = data_copy.score_phrase

In [25]:
X.head()

0    LittleBigPlanet PS Vita PlayStation Vita Platf...
1    LittleBigPlanet PS Vita  Marvel Super Hero Edi...
2                      Splice Tree of Life iPad Puzzle
3                               NHL 13 Xbox 360 Sports
4                          NHL 13 PlayStation 3 Sports
Name: text, dtype: object

In [26]:
y.head(10)

0    Amazing
1    Amazing
2      Great
3      Great
4      Great
5       Good
6      Awful
7    Amazing
8      Awful
9       Good
Name: score_phrase, dtype: object

---------

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

## Multinomial Classifier

In [28]:
from sklearn.naive_bayes import MultinomialNB

In [29]:
vect = TfidfVectorizer(stop_words='english', 
                       token_pattern=r'\b\w{2,}\b',
                       min_df=1, max_df=0.1,
                       ngram_range=(1,2))
mnb = MultinomialNB(alpha=2)

mnb_pipeline = make_pipeline(vect, mnb)

In [30]:
# Cross Validation
cv = cross_val_score(mnb_pipeline, X, y, scoring='accuracy', cv=10, n_jobs=-1)
print('\nMultinomialNB Classifier\'s Accuracy: %0.5f\n' % cv.mean())


MultinomialNB Classifier's Accuracy: 0.32355



--------------------

In [31]:
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

------------

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

texts=["dog cat fish","dog cat cat","fish bird", 'bird']
cv = CountVectorizer()
cv_fit=cv.fit_transform(texts)

print(cv.get_feature_names())
print(cv_fit.toarray())

['bird', 'cat', 'dog', 'fish']
[[0 1 1 1]
 [0 2 1 0]
 [1 0 0 1]
 [1 0 0 0]]


In [34]:
cv.vocabulary_

{'bird': 0, 'cat': 1, 'dog': 2, 'fish': 3}

----------

## Create the Vocab

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,1), token_pattern=r'\b\w{1,}\b')

In [36]:
vect.fit(X_train)
vocab = vect.vocabulary_

In [37]:
vocab

{'authority': 610,
 'matters': 4160,
 'crasher': 1644,
 'bode': 955,
 'spectral': 6208,
 'velious': 7137,
 'tortuga': 6815,
 'wedding': 7304,
 '2014': 93,
 'shibuya': 5911,
 'degrees': 1878,
 'flat': 2612,
 'injection': 3443,
 'mytran': 4493,
 'biggest': 842,
 'frame': 2690,
 'nes': 4566,
 'choroq': 1390,
 'castrol': 1268,
 'heritage': 3204,
 'allstar': 373,
 'am': 391,
 'sith': 6019,
 'flicky': 2619,
 'war': 7244,
 'gyruss': 3077,
 'g1': 2775,
 'blowout': 934,
 'nation': 4527,
 'hostile': 3301,
 'konami': 3777,
 'derek': 1901,
 'jackie': 3540,
 'game': 2799,
 'trial': 6903,
 'booga': 985,
 'tatsunoko': 6601,
 'mindhabits': 4288,
 'titanic': 6758,
 'campaigns': 1204,
 'tzar': 7004,
 'volt': 7209,
 'bless': 906,
 'timeless': 6742,
 'shooting': 5937,
 'hoodlum': 3277,
 'squadrons': 6281,
 'fire': 2571,
 'amf': 402,
 'skateboarding': 6029,
 'monopoly': 4373,
 'britain': 1080,
 'hegemonia': 3181,
 'astro': 574,
 'debut': 1851,
 'fables': 2449,
 'gat': 2825,
 'napoleon': 4515,
 'gtr': 3032,

In [38]:
vocab['187']

45

#### converting X_word to X_ids

In [39]:
def convert_X_to_X_word_ids(X):
    return X.apply( lambda x: [vocab[w] for w in [w.lower().strip() for w in x.split()] if w in vocab] )

In [40]:
X_train_word_ids = convert_X_to_X_word_ids(X_train)
X_test_word_ids  = convert_X_to_X_word_ids(X_test)

In [41]:
X_train.head()

16138    Castlevania Harmony of Despair PlayStation 3 A...
5945     Kim Possible 2 Drakken's Demise Game Boy Advan...
11360                   Madden NFL 09 PlayStation 2 Sports
18270                          WWE 2K16 Xbox One Wrestling
12533               The Last Ninja Commodore 64/128 Action
Name: text, dtype: object

In [42]:
X_train_word_ids.head()

16138        [1267, 3134, 4717, 1911, 5074, 149, 251, 269]
5945     [3730, 5126, 77, 1888, 2799, 1025, 266, 5062, ...
11360                     [4037, 4585, 13, 5074, 77, 6257]
18270                        [7447, 136, 7458, 4751, 7439]
12533                        [6674, 3843, 4617, 1533, 251]
Name: text, dtype: object

In [43]:
print('X_train.shape:', X_train.shape)
print('X_train_word_ids.shape:', X_train_word_ids.shape)
print('X_test_word_ids.shape:', X_test_word_ids.shape)

X_train.shape: (16759,)
X_train_word_ids.shape: (16759,)
X_test_word_ids.shape: (1863,)


## Sequence padding

In [44]:
X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=20, value=0)
X_test_padded_seqs  = pad_sequences(X_test_word_ids , maxlen=20, value=0)

In [45]:
print('X_train_padded_seqs.shape:', X_train_padded_seqs.shape)
print('X_test_padded_seqs.shape:', X_test_padded_seqs.shape)

X_train_padded_seqs.shape: (16759, 20)
X_test_padded_seqs.shape: (1863, 20)


In [46]:
pd.DataFrame(X_train_padded_seqs).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,0,0,0,0,0,0,0,0,0,0,0,1267,3134,4717,1911,5074,149,251,269
1,0,0,0,0,0,0,0,0,0,0,0,3730,5126,77,1888,2799,1025,266,5062,2227
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4037,4585,13,5074,77,6257
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7447,136,7458,4751,7439
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6674,3843,4617,1533,251


In [47]:
pd.DataFrame(X_test_padded_seqs).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7380,5261
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2154,5451,5074,149,5062
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4535,3944,12,5074,149,6257
3,0,0,0,0,0,0,0,0,0,0,0,0,6674,7234,1833,162,1826,4931,269,2329
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,251,935,3500,251


## convert y to vectors

In [48]:
unique_y_labels = list(y_train.value_counts().index)
unique_y_labels

['Great',
 'Good',
 'Okay',
 'Mediocre',
 'Amazing',
 'Bad',
 'Awful',
 'Painful',
 'Unbearable',
 'Masterpiece']

In [49]:
len(unique_y_labels)

10

In [50]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(unique_y_labels)

LabelEncoder()

In [51]:
print('')
for label_id, label_name in zip(le.transform(unique_y_labels), unique_y_labels):
    print('%d: %s' % (label_id, label_name))
print('')


4: Great
3: Good
7: Okay
6: Mediocre
0: Amazing
2: Bad
1: Awful
8: Painful
9: Unbearable
5: Masterpiece



In [52]:
y_train = to_categorical(y_train.map(lambda x: le.transform([x])[0]), nb_classes=len(unique_y_labels))
y_test  = to_categorical(y_test.map(lambda x:  le.transform([x])[0]), nb_classes=len(unique_y_labels))

-----------

### Network building using tflearn

In [53]:
size_of_each_vector = X_train_padded_seqs.shape[1]
vocab_size = len(vocab)
no_of_unique_y_labels = len(unique_y_labels)

In [54]:
print('size_of_each_vector:', size_of_each_vector)
print('vocab_size:', vocab_size)
print('no_of_unique_y_labels:', no_of_unique_y_labels)

size_of_each_vector: 20
vocab_size: 7596
no_of_unique_y_labels: 10


In [55]:
#sgd = tflearn.SGD(learning_rate=1e-4, lr_decay=0.96, decay_step=1000)

net = tflearn.input_data([None, size_of_each_vector]) # The first element is the "batch size" which we set to "None"
net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128) # input_dim: vocabulary size
net = tflearn.lstm(net, 128, dropout=0.6) # Set the dropout to 0.6
net = tflearn.fully_connected(net, no_of_unique_y_labels, activation='softmax') # relu or softmax
net = tflearn.regression(net, 
                         optimizer='adam',  # adam or ada or adagrad # sgd
                         learning_rate=1e-4,
                         loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=0)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
Instructions for updating:
Please switch to tf.summary.merge.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [56]:
model.fit(X_train_padded_seqs, y_train,  validation_set=.2, n_epoch=40,show_metric=True, batch_size=100)

Training Step: 5400  | total loss: [1m[32m1.12844[0m[0m
| Adam | epoch: 040 | loss: 1.12844 - acc: 0.6059 | val_loss: 1.63124 - val_acc: 0.3959 -- iter: 13407/13407
Training Step: 5400  | total loss: [1m[32m1.12844[0m[0m
| Adam | epoch: 040 | loss: 1.12844 - acc: 0.6059 | val_loss: 1.63124 - val_acc: 0.3959 -- iter: 13407/13407
--


In [57]:
y_pred=model.predict(X_test_padded_seqs)

In [58]:
from sklearn import metrics

In [59]:
pred_classes=np.argmax(y_pred,axis=1)
true_classes=np.argmax(y_test,axis=1)

In [60]:
print('\nRNN Classifier\'s Accuracy: %0.5f\n' % metrics.accuracy_score(true_classes, pred_classes))


RNN Classifier's Accuracy: 0.41492



## show some predicted samples

In [61]:
ids_of_titles = range(0,21) # range(X_test.shape[0]) 

for i in ids_of_titles:
    pred_class = np.argmax(model.predict([X_test_padded_seqs[i]]))
    true_class = np.argmax(y_test[i])
    
    print(X_test.values[i])
    print('pred_class:', le.inverse_transform(pred_class))
    print('true_class:', le.inverse_transform(true_class))
    print('')

Amy's Jigsaw Scrapbook Wireless Puzzle
pred_class: Good
true_class: Good

DuckTales Remastered PlayStation 3 Platformer
pred_class: Okay
true_class: Good

NBA Live 08 PlayStation 3 Sports
pred_class: Okay
true_class: Okay

The Walking Dead 400 Days PC Adventure Episodic
pred_class: Great
true_class: Great

Action Blox iPhone Action
pred_class: Good
true_class: Okay

Kane  Lynch Dead Men Xbox 360 Action
pred_class: Good
true_class: Good

Life is Strange  Episode 4 Dark Room Xbox One Adventure
pred_class: Great
true_class: Okay

Genma Onimusha Xbox Action Adventure
pred_class: Good
true_class: Great

Klonoa 2 Dream Champ Tournament Game Boy Advance Platformer
pred_class: Great
true_class: Great

The Walking Dead A Telltale Game Series  Season Two PlayStation Vita Adventure
pred_class: Great
true_class: Great

Dead Star PlayStation 4 Strategy
pred_class: Great
true_class: Good

Gods vs Humans Wii Strategy
pred_class: Bad
true_class: Bad

Worms Revolution PlayStation 3 Strategy
pred_class: