In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the game data, discard exact duplicate rows and renumber the rows
# (reset_index keeps the previous row labels as an extra column)
game_data = (
    pd.read_csv('game_data_final.csv')
    .drop_duplicates()
    .reset_index()
)

In [None]:
def fill_tag_columns(x, max_tags=20):
    """Spread per-game tag lists across columns ``tag1`` .. ``tagN``.

    Parameters
    ----------
    x : iterable of sequences
        One sequence of tags per game (for example a Series of lists).
    max_tags : int, default 20
        Minimum number of tag columns to create. If a row holds more tags
        than this, extra columns are added so that no tag is dropped.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``x`` with a fresh 0..n-1 index. Column
        ``tag<k>`` holds the k-th tag of the row; positions past the end of
        a row's tag list are filled with ``''``.
    """
    rows = [list(row) for row in x]
    # Widen past max_tags when needed so long tag lists are not truncated
    width = max([max_tags] + [len(row) for row in rows])
    columns = ['tag{}'.format(i) for i in range(1, width + 1)]
    # Pad every row to the common width and build the frame in a single step
    # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop)
    padded = [row + [''] * (width - len(row)) for row in rows]
    return pd.DataFrame(padded, columns=columns)


In [None]:
# Games for which no tags are defined have a missing tag_list; keep those rows aside
null_tags = game_data[game_data['tag_list'].isnull()]
# Treat a missing tag string as empty, then break each comma-separated
# string of tags into a list
tags_list_form = game_data['tag_list'].fillna('').str.split(',')
game_data['tag_list'] = tags_list_form

In [None]:
# Expand each game's tag list into individual tagN columns, held in a new dataframe
temp_df = fill_tag_columns(game_data['tag_list'])

In [None]:
# Attach the tag columns to the original dataframe (rows are matched on the index).
# Columns with the same names that are already present (e.g. from an earlier
# run of this cell) are dropped first, so re-running does not duplicate them.
game_data = pd.concat(
    [game_data.drop(columns=temp_df.columns, errors='ignore'), temp_df],
    sort=False, axis=1, ignore_index=False)

In [None]:
# Create a list that contains all of the tags from the dataframe.
# A flat comprehension visits every tag once, whereas summing the lists
# builds a new, longer list for every row.
all_tag_list = [tag for tags in game_data.tag_list for tag in tags]
# Convert this list to a set to get the unique tags
tag_set = set(all_tag_list)

In [None]:
# Create a dictionary mapping each tag to the number of games that carry it
tag_dict = {}
# The tagN columns are the same for every tag, so select them only once
# (raw string: '\d' is an invalid escape sequence in a plain literal)
tag_columns = game_data.filter(regex=r'tag\d+')
for x in sorted(tag_set):
    # True for every game that has the tag in any of its tagN columns
    tag_mask = (tag_columns == x).any(axis='columns')
    tag_dict[x] = int(tag_mask.sum())

In [None]:
# Order the (tag, count) pairs from the most to the least frequent tag
tag_count_list = sorted(tag_dict.items(), key=lambda item: item[1], reverse=True)
# Games without tags contribute an empty-string entry. Remove it by value
# rather than by position, since it is not guaranteed to be the first entry.
tag_count_list = [(tag, count) for tag, count in tag_count_list if tag != '']
# Pick out the top ten tags
top_tags = [tag for tag, _ in tag_count_list]
top_ten_tags = top_tags[0:10]

In [None]:
# Create an empty dataframe with the index and column names set to top ten tags
tag_df = pd.DataFrame(columns=top_ten_tags, index=top_ten_tags)
# Number of games, used as the denominator of the ratios. Count the rows:
# the largest label of a 0-based index is one less than the number of rows.
total_num_games = len(game_data.index)

In [None]:
# Function to filter rows that contain a particular tag
def filter_tags(df, tag):
    """Return a boolean Series marking the rows of ``df`` that carry ``tag``.

    Only columns whose name contains ``tag`` followed by digits are
    inspected; a row is True when any of those columns equals ``tag``.
    """
    # Raw string: '\d' is an invalid escape sequence in a plain literal
    return (df.filter(regex=r'tag\d+') == tag).any(axis='columns')

In [None]:
# Fill tag_df with the fraction of games that carry each combination of two tags
# Compute the row mask of every tag once instead of twice per table cell
tag_masks = {tag: filter_tags(game_data, tag) for tag in tag_df.columns.union(tag_df.index)}
for x in tag_df.columns:
    for y in tag_df.index:
        # Games that carry both tags, divided by the total number of games
        tag_df.loc[y, x] = np.sum(tag_masks[x] & tag_masks[y]) / total_num_games

In [None]:
# Cast every entry of the table to a floating-point value
tag_df = tag_df.astype('float64')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Plot the values of the tag from tag_df without the values in the lower triangle since they are redundant
tag_df_matrix = tag_df.values
print(type(tag_df_matrix))
plot_mask = np.zeros_like(tag_df.values)
plot_mask[np.tril_indices_from(plot_mask, k=-1)] = True
plt.rcParams["figure.figsize"] = [12,8]
with sns.axes_style("white"):
    ax = sns.heatmap(tag_df, mask=plot_mask, square=True, vmin=0.0, vmax=1,cmap="YlGnBu", annot=True)
ax.axes.set_title('Tag Combinations', fontsize=20)
ax.collections[0].colorbar.set_label('Ratio of Games', fontsize=20)

In [None]:
# Bar chart showing how many games carry each of the ten most common tags
sns.set(style="whitegrid")
temp_series = pd.Series(dict(tag_count_list))
# Turn the tag -> count series into a two-column frame for plotting
temp_df = temp_series.rename_axis('Tag').reset_index(name='Count')
plt.rcParams["figure.figsize"] = [12,8]
ax = sns.barplot(data=temp_df.head(10), x='Tag', y='Count')
# Tilt the x tick labels so that long tag names stay readable
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_title('Number of Games Per Tag', fontsize=20)
ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Count', fontsize=20)

In [None]:
# Divide the data set in two: games whose price text contains a decimal point
# (priced games) and all remaining games (e.g. Free to Play).
# na=False keeps the mask strictly boolean when a price is missing; such rows
# end up in game_data_free.
has_decimal_price = game_data.price.str.contains(r'\.', na=False)
# .copy() makes the priced subset an independent frame, so the dtype change
# below is applied to it directly and is not a write into a slice of game_data
game_data_priced = game_data.loc[has_decimal_price].copy()
game_data_free = game_data.loc[~has_decimal_price]
# Change type of price to float; replacing the whole column (rather than
# assigning through .loc[:, 'price']) lets the column take the float dtype
game_data_priced['price'] = game_data_priced['price'].astype(float)

In [None]:
# Remove items from a column that are more than n_std (default three) standard deviations from the mean
def remove_outliers(df, col, n_std=3):
    """Keep the values of ``col`` that lie within ``n_std`` standard deviations of its mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to filter.
    col : label
        Name of a numeric column of ``df``.
    n_std : float, default 3
        Largest allowed distance from the column mean, in multiples of the
        column's standard deviation.

    Returns
    -------
    pandas.DataFrame
        A frame with the single column ``col``, restricted to the rows whose
        value is within the allowed band. Rows with a missing value are
        dropped, because the comparison is False for them.
    """
    values = df.loc[:, col]
    within_band = np.abs(values - values.mean()) <= n_std * values.std()
    return df.loc[within_band, [col]]

# Get the price of all of the games that have a particular tag
def get_tag_prices(df, tag):
    """Return the ``price`` column, as a one-column DataFrame, for the rows of ``df`` that carry ``tag``."""
    # Read from the frame that was passed in, not from a notebook-level global
    return pd.DataFrame(df.loc[filter_tags(df, tag), 'price'])

In [None]:
# Create a list of dataframes with two columns, price and Tag, one per top ten tag.
# Outliers are removed from each tag's prices first. assign() returns a new
# frame, so the Tag column is never written into a slice of another frame.
df_list = [
    remove_outliers(get_tag_prices(game_data_priced, cur_tag), 'price').assign(Tag=cur_tag)
    for cur_tag in top_ten_tags
]

# Concat the list of dataframes in to one dataframe for plotting
merged_df_price = pd.concat(df_list, keys=top_ten_tags, sort=False)
ax = sns.boxplot(data=merged_df_price, x='Tag', y='price', dodge=True)
# Rotate x tick labels slightly to make them easier to read
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Price', fontsize =20)
ax.set_title('Box Plots of Prices for Top Ten Tags', fontsize=20)

In [None]:
# Violin plots of price per tag data
ax = sns.violinplot(data=merged_df_price, x='Tag', y='price', dodge=True)
# Rotate x tick labels slightly to make them easier to read
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set_xlabel('Category', fontsize=20)
ax.set_ylabel('Price', fontsize =20)
# The title names the plot type actually drawn in this cell
ax.set_title('Violin Plots of Prices for Top Ten Tags', fontsize=20)

In [None]:
# Work on a narrower frame (description, price and the tag1..tag20 columns)
# to analyze the description given for each game
tags_for_description = ['description', 'price'] + ['tag{}'.format(x) for x in range(1,21)]
# Missing descriptions become empty strings
game_descriptions = game_data.loc[:, tags_for_description].assign(
    description=lambda frame: frame['description'].fillna(''))

In [None]:
import re

def _clean_description(text):
    """Lower-case ``text``, remove punctuation and collapse runs of whitespace."""
    # Get rid of everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Replace all whitespace with a single space and trim both ends
    return re.sub(r'\s+', ' ', text).strip()

# assign() returns a new frame instead of writing into a possibly sliced one
game_descriptions = game_descriptions.assign(
    description=game_descriptions.description.apply(_clean_description))
# Remove all entries that don't have a description. This runs after the
# cleaning so that descriptions made only of punctuation or whitespace are
# removed as well.
game_descriptions = game_descriptions.loc[game_descriptions['description'] != '']

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop = stopwords.words('english')
# Expand stop words to include common game terms that are not descriptive
stop += ['game', 'games','play','player', 'world', 'new', 'one']
# Look words up in a set: testing membership in the list would scan it for every word
stop_set = set(stop)
# Remove all stop words from description
game_descriptions['description'] = game_descriptions.description.apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_set))

In [None]:
from wordcloud import WordCloud
# Join every description into one text and build the word cloud from it
all_descriptions = ' '.join(game_descriptions['description'])
wc = WordCloud(max_words=2000, background_color="white")
wc.generate(all_descriptions)

# Show the cloud as an image without axis ticks or frame
plt.figure(figsize=(10,8))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Plot the distribution of prices for the data set
plt.rcParams["figure.figsize"] = [8,12]
f, axes = plt.subplots(2,1)
plt.subplots_adjust(hspace=0.3)
# Both panels show the same outlier-free prices, so filter them only once
prices_no_outliers = remove_outliers(game_data_priced, 'price')
# Histogram of price distribution
# NOTE(review): distplot is deprecated in seaborn >= 0.11; histplot is its
# replacement if the installed seaborn provides it
sns.distplot(a=prices_no_outliers.price, norm_hist=False, kde=False, ax=axes[0])
axes[0].set_xlabel('Price', fontsize=20)
axes[0].set_ylabel('Count', fontsize=20)
axes[0].set_title('Histogram of Price Distribution', fontsize=20)

# Violin plot of price distribution
sns.violinplot(data=prices_no_outliers, x='price', ax=axes[1])
axes[1].set_xlabel('Price', fontsize=20)
# The width of a violin is a density estimate, not a count of games
axes[1].set_ylabel('Density', fontsize=20)
axes[1].set_title('Violin Plot of Price Distribution', fontsize=20)