### import necessary toolsets

In [1]:
import json
import pandas as pd
import os
import string
import re # regular expression toolkit
import nltk # Natural language toolkit
from collections import defaultdict
from pandas.io.json import json_normalize
import requests

### Get data from the Steam website

In [2]:
# Steam Web API key, read from the STEAM_API_KEY environment variable so the secret is not
# stored in the notebook itself. Falls back to an empty string when the variable is unset.
# NOTE(review): the key that used to be written on this line has been shared together with
# the notebook and should be revoked and re-issued.
api_key=os.environ.get('STEAM_API_KEY', '')

In [3]:
def getownedgames(idnum, key=None):
    """Build the Steam Web API URL that lists the games owned by one account.

    Parameters
    ----------
    idnum : value formatted into the ``steamid`` query parameter.
    key : Steam Web API key. When omitted, the module-level ``api_key`` is used,
        so existing one-argument calls keep working.

    Returns
    -------
    str
        The request URL (the request itself is not sent here).
    """
    if key is None:
        key = api_key
    # https so the API key in the query string is not sent in clear text.
    url=('https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/'
         '?key={}&steamid={}&format=json').format(key, idnum)
    return url

### get data from folder and save file names

In [4]:
# Directory that is listed for the per-game *.jsonlines review files.
# Absolute path on the author's machine; file names are appended to it directly,
# so the trailing '/' is required.
folder='E:/Desktop/jupyter/Capstone1-master/DATA/steam_reviews-master/steam_reviews-master/data/'

In [5]:
filenames=os.listdir(folder)

In [6]:
filenames[1]

'Arma_3.jsonlines'

### define function to get data from each file

In [7]:
def get_jsonline(file_name):
    """Read a JSON Lines file and return its records as a list.

    Parameters
    ----------
    file_name : path of a file holding one JSON document per line.

    Returns
    -------
    list
        The decoded objects, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data=[]
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can mis-decode
    # or reject non-ASCII review text.
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            line=line.strip()
            if not line:
                # Blank or whitespace-only lines (e.g. a trailing newline) carry no record.
                continue
            data.append(json.loads(line))
    return data

### Build a first DataFrame that will serve as the base frame for the other data

In [8]:
# Columns kept from every per-game file.
review_columns=['rating','review','steam_id_number','total_game_hours','total_game_hours_last_two_weeks','username']

# Collect one frame per game and concatenate once. DataFrame.append copied the whole
# accumulated frame on every call and was removed in pandas 2.0; starting from an empty
# list also makes this cell safe to re-run (no `'review_df' in locals()` state check).
game_frames=[]
for name in filenames:
    if not name.endswith(".jsonlines"):
        continue
    data=json_normalize(get_jsonline(folder+name))
    df=pd.DataFrame(data)
    df=df[review_columns].copy()  # own copy, so adding a column does not write into a slice
    df['game_name']=name.split('.')[0]  # file name up to the first '.', e.g. 'Arma_3'
    game_frames.append(df)

if not game_frames:
    raise FileNotFoundError('no .jsonlines files found in {}'.format(folder))
review_df=pd.concat(game_frames, ignore_index=True, sort=False)

In [9]:
review_df.head(4)

Unnamed: 0,rating,review,steam_id_number,total_game_hours,total_game_hours_last_two_weeks,username,game_name
0,Recommended,My first game on A3 brought me the most horrif...,thisisthefallout,580.3,58.7,King Pootis,Arma_3
1,Recommended,This is not a game for people who want fast ac...,PeaceFaker,122.2,2.8,Peace,Arma_3
2,Recommended,Oh man. Where to even begin with this one. It ...,TheDanius,731.4,38.2,[13th] Danius,Arma_3
3,Recommended,This is quite possibly the most emotional shoo...,ArtificialApple,291.8,25.2,ArtificialApple,Arma_3


In [10]:
print(list(review_df))
print(len(review_df))

['rating', 'review', 'steam_id_number', 'total_game_hours', 'total_game_hours_last_two_weeks', 'username', 'game_name']
79437


## Extract only the review text from each game's data and clean it up for analysis

### extract words or bigrams from the reviews

In [11]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
raw_review=review_df['review']

In [13]:
raw_review.isnull().sum()

0

In [14]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words('english')+list(string.punctuation)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
def vect(text):
    """Count word occurrences per document, ignoring English stop words.

    Parameters
    ----------
    text : iterable of raw text documents.

    Returns
    -------
    (numpy.ndarray, list)
        The dense count matrix produced by ``CountVectorizer.fit_transform`` and the
        list of vocabulary terms in column order.
    """
    cvt=CountVectorizer(stop_words="english", analyzer='word')
    X=cvt.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() exists
    # from 1.0 on. Support both and always hand back a plain list.
    if hasattr(cvt, 'get_feature_names_out'):
        feature_names=cvt.get_feature_names_out().tolist()
    else:
        feature_names=cvt.get_feature_names()
    return X.toarray(), feature_names

In [17]:
def tdfvect(text):
    """Compute un-normalised TF-IDF scores per document, ignoring English stop words.

    Parameters
    ----------
    text : iterable of raw text documents.

    Returns
    -------
    (numpy.ndarray, list)
        The dense matrix produced by ``TfidfVectorizer.fit_transform`` (no idf smoothing,
        no sublinear tf, no row normalisation) and the list of vocabulary terms in
        column order.
    """
    tfidf=TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word', stop_words="english")
    X=tfidf.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() exists
    # from 1.0 on. Support both and always hand back a plain list.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names=tfidf.get_feature_names_out().tolist()
    else:
        feature_names=tfidf.get_feature_names()
    return X.toarray(),feature_names

In [18]:
working_data=review_df.loc[(review_df['game_name'] == 'Warframe')]

In [19]:
freq,words=vect(working_data['review'])
tdfscore,tdfwords=tdfvect(working_data['review'])

In [20]:
# NOTE(review): tdfscore is the dense matrix returned by tdfvect (presumably one row per
# review), so zip() pairs row i with vocabulary word i and stops at the shorter of the
# two. Confirm this pairing is intended; per-word totals are computed separately below
# with tdfscore.sum(axis=0).
result=list(zip(tdfscore,tdfwords))

In [21]:
print(result[1][0].sum())

220.67022397122508


In [22]:
list(zip(words,freq.sum(axis=0)))

[('00', 6),
 ('000', 31),
 ('00gb', 1),
 ('01', 3),
 ('02', 1),
 ('024', 1),
 ('03', 2),
 ('04', 3),
 ('040', 1),
 ('06', 1),
 ('08', 5),
 ('0_o', 2),
 ('0hrs', 1),
 ('10', 1546),
 ('100', 164),
 ('1000', 35),
 ('10000', 3),
 ('1000000', 1),
 ('1000000000000', 1),
 ('1000hr', 1),
 ('1000m', 1),
 ('1000th', 1),
 ('1000times', 1),
 ('1000x', 1),
 ('100h', 1),
 ('100hrs', 1),
 ('100k', 3),
 ('100mountaindew', 1),
 ('100s', 5),
 ('100th', 1),
 ('100x', 1),
 ('100xp', 1),
 ('103', 1),
 ('104', 1),
 ('1080', 2),
 ('1080p', 1),
 ('10gb', 2),
 ('10h', 3),
 ('10k', 1),
 ('10sec', 2),
 ('10x', 3),
 ('11', 152),
 ('110', 3),
 ('1100', 2),
 ('111', 1),
 ('1120', 1),
 ('116hrs', 1),
 ('12', 142),
 ('120', 5),
 ('1200', 2),
 ('123', 1),
 ('123achievments', 1),
 ('125', 1),
 ('125p', 1),
 ('126', 2),
 ('12645', 2),
 ('127419723921', 1),
 ('1280', 1),
 ('1280x720', 1),
 ('12h', 6),
 ('12hr', 2),
 ('12hrs', 2),
 ('12p', 2),
 ('13', 36),
 ('1300', 2),
 ('132', 2),
 ('1337', 1),
 ('14', 53),
 ('140p', 1)

In [23]:
scoring=pd.DataFrame(tdfscore.sum(axis=0),index=tdfwords,columns=['score'])

In [24]:
scoring.head()

Unnamed: 0,score
00,48.475949
000,203.832669
00gb,9.871084
01,27.533811
02,9.871084


In [25]:
scoring.sort_values(by='score',ascending=False)

Unnamed: 0,score
game,18004.072882
warframe,8182.660993
play,7656.728984
weapons,6372.101036
like,6224.721263
fun,5569.553330
free,5317.709187
just,5306.562150
time,5239.025105
new,4947.858005


In [26]:
# Per-game table of summed TF-IDF scores, keyed by game name.
scoring_file=dict()
for name in review_df['game_name'].unique():
    working_data=review_df.loc[(review_df['game_name']==name)]
    tdfscore,tdfwords=tdfvect(working_data['review'])
    # Sum each vocabulary column over all of the game's reviews.
    scoring=pd.DataFrame(tdfscore.sum(axis=0),index=tdfwords,columns=['score'])
    # sort_values returns a new frame; keep the result so the stored table is sorted.
    scoring=scoring.sort_values(by='score',ascending=False)
    scoring_file[name]=scoring

In [27]:
# Words that occur in every game's vocabulary: the running intersection of all indexes.
# idx1 is reset here instead of being guarded by `"idx1" not in locals()`, so re-running
# the cell never starts from a stale intersection left in the kernel.
names=list(scoring_file.keys())
idx1=None
for game_name in names:
    idx2=scoring_file[game_name].index
    idx1=idx2 if idx1 is None else idx2.intersection(idx1)

In [28]:
# Remove the words shared by every game and order the rest by score.
# errors='ignore' keeps the cell re-runnable: on a second run the labels in idx1 have
# already been dropped and would otherwise raise KeyError.
for name in scoring_file.keys():
    scoring_file[name]=scoring_file[name].drop(idx1, errors='ignore').sort_values(by='score',ascending=False)

In [34]:
scoring_file['Football_Manager_2015']

Unnamed: 0,score
football,1399.100629
fm,1219.564409
manager,1207.258037
injuries,736.927391
season,626.404557
league,583.502145
tactics,560.687813
si,491.983148
club,414.442691
beta,398.502465


In [29]:
name='warframe'

In [None]:
print(idx1)

In [31]:
review_df['game_name'].unique()

array(['Arma_3', 'Counter_Strike', 'Counter_Strike_Global_Offensive',
       'Dota_2', 'Football_Manager_2015', 'Garrys_Mod',
       'Grand_Theft_Auto_V', 'Sid_Meiers_Civilization_5',
       'Team_Fortress_2', 'The_Elder_Scrolls_V', 'Warframe'], dtype=object)

In [None]:
# NOTE(review): when the cells run top to bottom, `scoring` is still the DataFrame built
# in the per-game loop, and iterating a DataFrame yields its column labels rather than
# (word, score) pairs. The same statement appears further down right after `scoring` is
# rebuilt as a list of tuples — confirm whether this copy is still needed.
print(sorted(scoring, key=lambda x: x[1], reverse=True))

In [None]:
# Draft of the per-review processing loop: collects one vector and one bigram list per review.
# NOTE(review): `commonword` is created here but nothing is ever appended to it in this cell.
cleaned_data=list()
commonword=list()
bigrams=list()

for num in range(len(raw_review)):
    review=raw_review.iloc[num]
    # split the words into 'tokens'
    cvt = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
    # NOTE(review): the vectorizer above is bound to `cvt`, but the next two lines call `cv`,
    # which this cell never assigns — confirm which object is meant. `review` is a single
    # element of raw_review (presumably one string), not a collection of documents — TODO confirm.
    cv.fit(review)
    vector=cv.transform(review)
    cleaned_data.append(vector)

        
    # find the most frequent double word used in each review
    each_bigram=list()
    # NOTE(review): `clean` is not assigned anywhere in this cell.
    for wordnum in range(len(clean)-1):
        # pair each token with its right-hand neighbour
        listitem=(clean[wordnum],clean[wordnum+1])
        each_bigram.append(listitem)
    bigrams.append(each_bigram)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Arma 3 reviews only.
working_data=review_df.loc[review_df['game_name'] == 'Arma_3']
# Unigram counts without English stop words; min_df=0.6 keeps only terms that appear in
# at least 60% of the reviews.
cv = CountVectorizer(
    stop_words="english",
    analyzer='word',
    ngram_range=(1,1),
    max_df=1.0,
    min_df=0.6,
    max_features=None,
)
# Learn the vocabulary and build the document-term matrix in one step.
vector=cv.fit_transform(working_data['review'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# (available from 1.0) and fall back to the old name on earlier releases.
print(cv.get_feature_names_out().tolist() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names())

In [None]:
# Arma 3 reviews only.
working_data=review_df.loc[review_df['game_name'] == 'Arma_3']
# Raw TF-IDF: no idf smoothing, no sublinear tf scaling, no row normalisation.
cvt = TfidfVectorizer(
    smooth_idf=False,
    sublinear_tf=False,
    norm=None,
    analyzer='word',
)
# Fit and transform in one pass; fit() returns the vectorizer itself, so txt_fitted
# is simply another name for cvt.
txt_transformed=cvt.fit_transform(working_data['review'])
txt_fitted=cvt

In [None]:
type(txt_fitted.get_feature_names())

In [None]:
# Pair every vocabulary term with its inverse document frequency and print the pairs
# from highest to lowest idf.
idf=cvt.idf_
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() exists from 1.0.
if hasattr(txt_fitted, 'get_feature_names_out'):
    feature_names=txt_fitted.get_feature_names_out().tolist()
else:
    feature_names=txt_fitted.get_feature_names()
scoring=list(zip(feature_names,idf))
print(sorted(scoring, key=lambda x: x[1], reverse=True))

In [None]:
int(scoring[1][0])

In [None]:
scoring_orig=scoring
print(scoring_orig[1][0])

In [None]:
# Split out the (term, idf) pairs whose term is NOT an integer literal: int() raising
# ValueError is what selects an entry. Iterating the pairs directly covers the whole
# list — the previous range(len(scoring)-1) silently skipped the last entry.
num_list=list()   # terms that int() rejects
num_list2=list()  # idf weights of those terms, in the same order
for term, idf_weight in scoring:
    try:
        int(term)
    except ValueError:
        num_list.append(term)
        num_list2.append(idf_weight)

In [None]:
A_list=list(zip(num_list,num_list2))

In [None]:
B_list=[item]

In [None]:
raw_review[1]

In [None]:
bigrams[len(cleaned_data)-1]

In [None]:
raw_review[1]

In [None]:
commonword[1]

In [None]:
review_df['cleaned']=cleaned_data
review_df['frequent_words']=commonword
review_df['bigrams']=bigrams

In [None]:
len(review_df)

In [None]:
len(cleaned_data)

### Assign weight according to total playtime of each reviewer

In [None]:
# Weight of a review = reviewer's playtime divided by the largest playtime recorded for
# the same game. transform('max') broadcasts each game's maximum back onto its own rows,
# so the division is aligned on the index and does not depend on the rows of a game
# being stored contiguously (the list-building version did).
max_hours_per_game=review_df.groupby('game_name')['total_game_hours'].transform('max')
review_df['review_weight']=review_df['total_game_hours']/max_hours_per_game
# Kept as a plain list for any later cell that still reads `review_weight`.
review_weight=review_df['review_weight'].tolist()

In [None]:
review_df.head(3)

### Pull out words according to the review weight

In [None]:
# Counts from each review's word-frequency mapping. Series.map visits only the one column
# instead of materialising a Series for every row as DataFrame.apply(axis=1) does.
review_df['freq_word_values'] = review_df['frequent_words'].map(lambda counts: list(counts.values()))

In [None]:
# Words from each review's word-frequency mapping, in the mapping's own order. Series.map
# visits only the one column instead of materialising a Series for every row as
# DataFrame.apply(axis=1) does.
review_df['freq_word_keys'] = review_df['frequent_words'].map(lambda counts: list(counts.keys()))

In [None]:
# Scale every word count of a review by that review's weight. Zipping the two columns
# avoids building a Series per row (DataFrame.apply with axis=1); wrapping the result in
# a Series with the frame's index keeps one list per row and preserves alignment.
review_df['freq_word_weighted'] = pd.Series(
    [
        [count*weight for count in counts]
        for counts, weight in zip(review_df['freq_word_values'], review_df['review_weight'])
    ],
    index=review_df.index,
)

In [None]:
# Re-join each review's words with their weighted counts as a {word: weighted_count} dict.
# Zipping the two columns avoids building a Series per row (DataFrame.apply with axis=1);
# the explicit index keeps the new column aligned with review_df.
review_df['weighted_paired']=pd.Series(
    [
        dict(zip(keys, weighted))
        for keys, weighted in zip(review_df['freq_word_keys'], review_df['freq_word_weighted'])
    ],
    index=review_df.index,
)

In [None]:
review_df.tail(3)

In [None]:
review_df['weighted_paired'][1]

In [None]:
key_df=pd.DataFrame(index=review_df['game_name'].unique(),columns=['key_word','key_bigram'])

In [None]:
from collections import Counter
# For every game, add up the weighted word counts of all its reviews and store the
# (word, total) pairs, highest total first, in key_df.
for each_game in review_df['game_name'].unique():
    working_data=review_df.loc[(review_df['game_name'] == each_game)]
    game_word=Counter()
    for pairs in working_data['weighted_paired']:
        # Counter.update with a mapping adds the values to the running totals.
        game_word.update(pairs)
    # .at writes the list into a single cell in one indexing step; the chained form
    # key_df['key_word'][each_game]=... may assign to a temporary copy instead.
    key_df.at[each_game,'key_word']=game_word.most_common()

In [None]:
# NOTE(review): unfinished cell. `bigram` is created without a default factory and is
# never filled in, and the loop body updates `wordfreq[word]`, neither of which is
# assigned in this cell — confirm what this loop was meant to count.
bigram=defaultdict()
for each_game in review_df['game_name'].unique():
    # rows belonging to the current game
    working_data=review_df.loc[(review_df['game_name'] == each_game)]
    wordfreq[word]+=1

In [None]:
key_df

In [None]:
for each_game in review_df['game_name'].unique():
    working_data=review_df.loc[(review_df['game_name'] == each_game)]
    

In [None]:
key_df['key_word']['Team_Fortress_2'][1:30]

In [None]:
key_df['key_word']['Sid_Meiers_Civilization_5']

In [None]:
len(working_data)

In [None]:
# (word, total) pairs of the current Counter, largest total first.
sorted_by_value = game_word.most_common()

In [None]:
print(sorted_by_value)

In [None]:
from pattern.en import sentiment

In [None]:
sentiment(raw_review[79436]).assessments[1]

In [None]:
sentiment(raw_review[79436])

In [None]:
key_df

In [None]:
from collections import OrderedDict
# For every game, total the scores of its ranked words and keep the 100 highest.
# NOTE(review): the 'ranked_words' column is not created anywhere in the visible notebook;
# each entry is presumably a (weighted_count, word) tuple — TODO confirm.
for each_game in review_df['game_name'].unique():
    # Filter on the loop variable (the game name was hard-coded to 'Warframe' before,
    # so every iteration redid the same game).
    working_data=review_df.loc[(review_df['game_name'] == each_game)]
    # Fresh accumulator per game so totals do not leak from one game into the next.
    result=defaultdict(int)
    for ranked in working_data['ranked_words']:
        for word in ranked:
            result[word[1]]+=word[0]
    frequent_words=OrderedDict(sorted(result.items(), key=lambda t:t[1], reverse=True))
    # An OrderedDict cannot be sliced; take the first 100 (word, total) pairs from its items.
    # .at writes the list into a single cell without chained indexing.
    key_df.at[each_game,'key_word']=list(frequent_words.items())[:100]

In [None]:
# Print the words themselves; printing the bare generator expression only showed
# "<generator object ...>".
print(list(frequent_words.keys()))

In [None]:
type(key_df['key_word']['Warframe'])

In [None]:
key_df

In [None]:
result = defaultdict(int)

for d in lst:
    result[d['name']] += int(d['value'])

In [None]:
print(sorted(ranked,reverse=True)[1:100])

In [None]:
review_df.head(3)

In [None]:
df.head()

In [None]:
print(weighted_list)

In [None]:
review1=pd.DataFrame(reviewdata['Warframe'])

In [None]:
s=sum(review1['total_game_hours'])

In [None]:
type([review1['total_game_hours']/s])

In [None]:
reviewex=reviewdata['Warframe'][99]['review']
print(reviewex)

In [None]:
cleaned=reviewdata['Warframe'][99]['cleaned review']
print(cleaned)

In [None]:
print(reviewdata['Warframe'][99]['frequent words'])

In [None]:
wordfreq=defaultdict(int)

In [None]:
wordfreq.items()

In [None]:
type(wordfreq)

In [None]:
# Per-review text processing. Fills three lists, each with one entry per review:
#   cleaned_data - lower-cased alphabetic tokens with stop words removed
#   commonword   - {word: count} for those tokens
#   bigrams      - consecutive token pairs
# NOTE(review): these lists are assigned to review_df columns in a cell that appears
# earlier in the notebook, so this cell has to be executed before that one.
cleaned_data=list()
commonword=list()
bigrams=list()

# Set membership is constant-time; testing every token against the stop-word list
# rescanned the whole list each time.
stop_word_set=set(stop_words)

for review in raw_review:
    # split the review into tokens and lowercase them
    lower_tokens=[token.lower() for token in word_tokenize(review)]
    # drop stop words / punctuation and anything that is not purely alphabetic
    clean=[word for word in lower_tokens if word not in stop_word_set and word.isalpha()]
    cleaned_data.append(clean)

    # count how often each remaining word occurs in this review
    wordfreq=defaultdict(int)
    for word in clean:
        wordfreq[word]+=1
    commonword.append(wordfreq)

    # pair every token with its right-hand neighbour
    bigrams.append(list(zip(clean, clean[1:])))

In [None]:
# Draw a random sample of reviews from every game and stack them into one frame.
# - pd.concat over a list replaces DataFrame.append, which was removed in pandas 2.0.
# - min(...) keeps sample() from raising when a game has fewer than `sample_size` reviews.
# - groupby(sort=False) visits the games in order of first appearance, like unique().
# The sample is unseeded; pass random_state to sample() if a repeatable draw is needed.
sample_size=99
sample_review=pd.concat(
    [
        game_reviews.sample(min(sample_size, len(game_reviews)))
        for _, game_reviews in review_df[['game_name','review']].groupby('game_name', sort=False)
    ]
)

In [None]:
# Write the sampled reviews to an Excel workbook.
# - Raw string: backslashes in the Windows path are taken literally instead of being
#   parsed as escape sequences.
# - Context manager: saves and closes the workbook on exit; ExcelWriter.save() was
#   removed in pandas 2.0.
with pd.ExcelWriter(r'E:\Desktop\Sample_Reviews.xlsx') as writer:
    sample_review.to_excel(writer,sheet_name='Sheet1')
# Drop the sample from the kernel once it has been written.
del sample_review

In [None]:
# game_name/review columns of every review, stacked game by game in order of first
# appearance. Built with a single pd.concat: DataFrame.append was removed in pandas 2.0,
# and rebuilding from scratch means a re-run cannot append the rows a second time.
all_review=pd.concat(
    [
        review_df.loc[review_df['game_name'] == each_name, ['game_name','review']]
        for each_name in review_df['game_name'].unique()
    ]
)

In [None]:
# Write the game_name/review columns of every review to an Excel workbook.
# - Raw string: in a normal string literal the '\a' in '\all_Reviews' is the BEL control
#   character, which corrupted the file name.
# - Context manager: saves and closes the workbook on exit; ExcelWriter.save() was
#   removed in pandas 2.0.
with pd.ExcelWriter(r'E:\Desktop\all_Reviews.xlsx') as writer:
    review_df[['game_name','review']].to_excel(writer,sheet_name='Sheet1')

In [None]:
review_df.sample(frac=0.1,replace=False)