# Heroes of Pymoli Data Analysis

+ Only 1 of the 6 most popular items appears in the list of the 18 lowest-priced items, i.e. the items priced in the bottom 10% of item prices.  (See Popular Items and Other Analysis - Lowest Priced)
+ Males not only make up over 80% of the players of this game, they are also responsible for over 80% of the revenue.  (See Gender Demographics and Other Analysis - Gender Purchase Total %s)
+ Out of the 573 players, every player has spent under 20 dollars on items.  
+ The 20-24 age bracket has generated more revenue than any other bracket, but the 25-29 age group, on average, purchases more expensive items.  
+ The Retribution Axe not only appears in the most popular items list, it is also priced almost 2 dollars higher than the other items on that list and is the item that has generated the most revenue. 
+ Other than the Retribution Axe, every item on the most popular items list is priced below the average purchase price.  


In [57]:
#Dependencies and file read
import pandas as pd
import numpy as np
import os

# Build the relative path to the raw purchase log
data_dir, data_name = 'Resources', 'purchase_data.json'
file = os.path.join(data_dir, data_name)

# Load every purchase record into a DataFrame
pur_data = pd.read_json(path_or_buf=file)

#view the data
#pur_data.head()

## Player Count

In [58]:
# Each distinct screen name (SN) is one player, so the player count is the
# number of screen names left after duplicates are removed
unique_names = pur_data['SN'].drop_duplicates()
player_count = len(unique_names)

# One-cell summary table; the value is moved into the index so that no
# numeric row label is displayed next to it
players_df = pd.DataFrame({'Total Players': [player_count]}).set_index('Total Players')
players_df

573


## Purchasing Analysis (Total)

In [64]:
# One row per Item ID (the last purchase of each item is the row retained)
no_dup_items = pur_data.drop_duplicates(subset='Item ID', keep='last')
# Number of distinct items that were sold
total_unique = no_dup_items.shape[0]

prices = pur_data['Price']
# Every non-null price is one purchase
total_pur = prices.count()
# Total revenue is the sum of all purchase prices, rounded to cents
total_rev = round(prices.sum(), 2)
# Average purchase price is derived from the rounded revenue and the purchase count
avg_price = round(total_rev / total_pur, 2)

# Single-row summary of the whole purchase log
summary_row = {
    "Number of Unique Items": total_unique,
    'Average Purchase Price': avg_price,
    'Total Purchases': total_pur,
    'Total Revenue': total_rev,
}
pur_analysis = pd.DataFrame([summary_row])

# Show the two money columns as dollars (thousands separator on revenue)
money_formats = {'Average Purchase Price': '${:.2f}', 'Total Revenue': '${:,.2f}'}
pur_analysis.style.format(money_formats)


Unnamed: 0,Average Purchase Price,Number of Unique Items,Total Purchases,Total Revenue
0,$2.93,183,780,"$2,286.33"


## Gender Demographics

In [40]:
# Gender Demographics
#
# Count and percentage of players for each gender value found in the data
# (Male, Female, Other / Non-Disclosed).

# Keep one row per screen name so that a player who made several purchases
# is only counted once
no_dup_players = pur_data.drop_duplicates(['SN'], keep='last')

# Players per gender. The count column is named explicitly rather than going
# through reset_index() and renaming 'index'/'Gender': since pandas 2.0,
# value_counts() names its result "count" and its index after the source
# column, so the old rename left 'Gender' holding the labels and the
# percentage line tried to divide those strings by player_count.
gender_counts = no_dup_players['Gender'].value_counts().rename('# of Players').to_frame()
# Label the index so the table and later index-based merges see 'Gender'
gender_counts.index.name = 'Gender'

# Share of all players; player_count is the number of unique screen names
# computed in the Player Count cell
gender_counts['% of Players'] = gender_counts['# of Players'] / player_count * 100

# Sanity check (percentages should sum to 100):
# gender_counts['% of Players'].sum()

# Display percentages with one decimal place
gender_counts.style.format({"% of Players": "{:.1f}%"})

Unnamed: 0_level_0,# of Players,% of Players
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,465,81.2%
Female,100,17.5%
Other / Non-Disclosed,8,1.4%


## Purchasing Analysis by Gender

In [60]:
# Purchasing Analysis (Gender)
#
# For each gender: purchase count, total purchase value, average purchase
# price, and normalized total (total purchase value per unique player).

by_gender = pur_data.groupby('Gender')
# Number of purchase rows per gender
pur_count_by_gen = by_gender['Gender'].count().to_frame()
# Revenue per gender
total_pur_by_gen = by_gender['Price'].sum().to_frame()

# Put both measures side by side (matched on the Gender index) and give the
# columns their display names
pur_analysis_gen = pur_count_by_gen.join(total_pur_by_gen, how='inner')
pur_analysis_gen = pur_analysis_gen.rename(
    columns={'Gender': '# of Purchases', 'Price': 'Total Purchase Value'}
)

# Average purchase price = total purchase value / number of purchases
pur_analysis_gen['Average Purchase Price'] = (
    pur_analysis_gen['Total Purchase Value'].div(pur_analysis_gen['# of Purchases'])
)

# Bring in the per-gender unique player counts from the Gender Demographics
# table; they are only needed as the denominator of the normalized totals
pur_analysis_gen = pur_analysis_gen.join(gender_counts, how='inner')
pur_analysis_gen['Normalized Totals'] = (
    pur_analysis_gen['Total Purchase Value'].div(pur_analysis_gen['# of Players'])
)

# The helper columns borrowed from gender_counts are not part of this table
pur_analysis_gen = pur_analysis_gen.drop(['% of Players', '# of Players'], axis=1)

# Display all money columns as dollars
gender_money_formats = {
    'Total Purchase Value': '${:.2f}',
    'Average Purchase Price': '${:.2f}',
    'Normalized Totals': '${:.2f}',
}
pur_analysis_gen.style.format(gender_money_formats)

Unnamed: 0_level_0,# of Purchases,Total Purchase Value,Average Purchase Price,Normalized Totals
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,136,$382.91,$2.82,$3.83
Male,633,$1867.68,$2.95,$4.02
Other / Non-Disclosed,11,$35.74,$3.25,$4.47


## Age Demographics

In [42]:
# Purchase statistics broken into 5-year age brackets (<10, 10-14, 15-19, ...):
# purchase count, average purchase price, total purchase value, and
# normalized totals (total purchase value per unique purchaser).

# Tag every purchase row with its age bracket in a new 'age_bin' column
ages = pur_data['Age']
pur_data.loc[ages < 10, 'age_bin'] = "< 10"
for low in range(10, 40, 5):
    high = low + 4
    in_bracket = (ages >= low) & (ages <= high)
    pur_data.loc[in_bracket, 'age_bin'] = "{} - {}".format(low, high)
# NOTE(review): the label reads "> 40" but the mask also includes age 40
pur_data.loc[ages >= 40, 'age_bin'] = "> 40"
# Sanity check (both counts should match):
# pur_data[['age_bin', 'Age']].count()

by_bin = pur_data.groupby('age_bin')
# Purchases per bracket (screen names counted with repeats)
pur_count_age = by_bin['SN'].count().to_frame()
# Mean purchase price per bracket
avg_price_age = by_bin['Price'].mean().to_frame()
# Revenue per bracket
tot_pur_age = by_bin['Price'].sum().to_frame()
# Unique purchasers per bracket: one row per screen name, then count by bracket
unique_players = pur_data.drop_duplicates('SN', keep='last')
no_dup_age = unique_players.groupby('age_bin')['SN'].count().to_frame()

# Combine the four measures on the bracket index; colliding column names
# receive pandas' default _x/_y suffixes, which the rename below relies on
merge_age = pur_count_age.merge(avg_price_age, left_index=True, right_index=True)
merge_age = merge_age.merge(tot_pur_age, left_index=True, right_index=True)
merge_age = merge_age.merge(no_dup_age, left_index=True, right_index=True)
display_names = {
    "SN_x": "# of Purchases",
    "Price_x": "Average Purchase Price",
    "Price_y": "Total Purchase Value",
    "SN_y": "# of Purchasers",
}
merge_age = merge_age.rename(columns=display_names)

# Normalized total = revenue per unique purchaser in the bracket
merge_age['Normalized Totals'] = merge_age['Total Purchase Value'].div(merge_age['# of Purchasers'])

# Label the index for display
merge_age.index.name = "Age"

# Display all money columns as dollars
age_money_formats = {
    'Average Purchase Price': '${:.2f}',
    'Total Purchase Value': '${:.2f}',
    'Normalized Totals': '${:.2f}',
}
merge_age.style.format(age_money_formats)

Unnamed: 0_level_0,# of Purchases,Average Purchase Price,Total Purchase Value,# of Purchasers,Normalized Totals
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10 - 14,35,$2.77,$96.95,23,$4.22
15 - 19,133,$2.91,$386.42,100,$3.86
20 - 24,336,$2.91,$978.77,259,$3.78
25 - 29,125,$2.96,$370.33,87,$4.26
30 - 34,64,$3.08,$197.25,47,$4.20
35 - 39,42,$2.84,$119.40,27,$4.42
< 10,28,$2.98,$83.46,19,$4.39
> 40,17,$3.16,$53.75,11,$4.89


## Top Spenders

In [43]:
# Top 5 spenders in the game by total purchase value, listed with:
# SN, Purchase Count, Average Purchase Price, Total Purchase Value

# Per-player total spend, number of purchases and mean price in one pass
player_stats = pur_data.groupby('SN')['Price'].agg(['sum', 'count', 'mean'])
player_stats = player_stats.rename(columns={
    'sum': 'Total Purchase Value',
    'count': 'Purchase Count',
    'mean': 'Average Purchase Price',
})

# Biggest spenders first, then keep the first five rows
merged_top5 = player_stats.sort_values('Total Purchase Value', ascending=False).head(5)

# Display the money columns as dollars
spender_formats = {'Total Purchase Value': '${:.2f}', 'Average Purchase Price': '${:.2f}'}
merged_top5.style.format(spender_formats)

Unnamed: 0_level_0,Total Purchase Value,Purchase Count,Average Purchase Price
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Undirrala66,$17.06,5,$3.41
Saedue76,$13.56,4,$3.39
Mindimnya67,$12.74,4,$3.18
Haellysu29,$12.73,3,$4.24
Eoda93,$11.58,3,$3.86


## Most Popular Items

In [44]:
# Most popular items by purchase count, listed with:
# Item ID, Item Name, Purchase Count, Item Price, Total Purchase Value

item_groups = pur_data.groupby('Item ID')

# Purchases per item. The column is named 'Purchase Count' from the start:
# a frame whose index AND only column are both called 'Item ID' makes
# sort_values('Item ID') ambiguous (ValueError on current pandas) and forces
# later code to depend on merge-generated names such as 'Item ID_x'.
top5_items_ID = item_groups.size().rename('Purchase Count').to_frame()
# Most purchased first; six rows are kept rather than five because, per the
# original analysis, the last places are tied
top5_items_ID = top5_items_ID.sort_values('Purchase Count', ascending=False).iloc[:6]

# Revenue generated by each item
top5_items_total = item_groups['Price'].sum().rename('Total Purchase Value').to_frame()

# Attach revenue to the most popular items (index join keeps the popularity order)
top5_items = top5_items_ID.join(top5_items_total)

# One row per item (last occurrence) supplies the item's name and price
no_dup_items = pur_data.drop_duplicates(['Item ID'], keep='last')
item_details = no_dup_items.set_index('Item ID')[['Item Name', 'Price']]
item_details = item_details.rename(columns={'Price': 'Item Price'})

# Every item in top5_items comes from pur_data, so each one has a details row
top5_merge_ID = top5_items.join(item_details)
# Final column order for display; 'Item ID' remains the index
top5_merge_ID = top5_merge_ID[['Item Name', 'Purchase Count', 'Item Price', 'Total Purchase Value']]

# Display the money columns as dollars
top5_merge_ID.style.format({'Item Price': '${:.2f}', 'Total Purchase Value': '${:.2f}'})

Unnamed: 0_level_0,Item Name,Purchase Count,Item Price,Total Purchase Value
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39,"Betrayal, Whisper of Grieving Widows",11,$2.35,$25.85
84,Arcane Gem,11,$2.23,$24.53
31,Trickster,9,$2.07,$18.63
175,Woeful Adamantite Claymore,9,$1.24,$11.16
13,Serenity,9,$1.49,$13.41
34,Retribution Axe,9,$4.14,$37.26


## Most Profitable Items

In [45]:
# Most Profitable Items
#
# The 5 items with the highest total purchase value, listed with:
# Item ID, Item Name, Purchase Count, Item Price, Total Purchase Value

# Revenue per item, highest first, limited to the first five rows
revenue_by_item = pur_data.groupby('Item ID')['Price'].sum().to_frame()
top5_profit = revenue_by_item.sort_values('Price', ascending=False).head(5)

# Number of purchases of every item
pur_count_profit = pur_data.groupby('Item ID')['Item ID'].count().to_frame()

# Attach the purchase counts, then the item details from the de-duplicated
# item table (no_dup_items); colliding column names receive pandas' default
# _x/_y suffixes, which the selection and rename below rely on
top5_profit = top5_profit.merge(pur_count_profit, how='left', left_index=True, right_index=True)
top5_merge_profit = top5_profit.merge(no_dup_items, how='left', left_index=True, right_on='Item ID')

# Keep the display columns, index by Item ID and give the columns readable names
wanted_columns = ['Item ID', 'Item Name', 'Item ID_x', 'Price_y', 'Price_x']
top5_merge_profit = top5_merge_profit[wanted_columns].set_index(['Item ID'])
top5_merge_profit = top5_merge_profit.rename(columns={
    'Item ID_x': 'Purchase Count',
    'Price_y': 'Item Price',
    'Price_x': 'Total Purchase Value',
})

# Display the money columns as dollars
top5_merge_profit.style.format({'Item Price': '${:.2f}', 'Total Purchase Value': '${:.2f}'})

Unnamed: 0_level_0,Item Name,Purchase Count,Item Price,Total Purchase Value
Item ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34,Retribution Axe,9,$4.14,$37.26
115,Spectral Diamond Doomblade,7,$4.25,$29.75
32,Orenmir,6,$4.95,$29.70
103,Singed Scalpel,6,$4.87,$29.22
107,"Splitter, Foe Of Subtlety",8,$3.61,$28.88


## Other Analysis - Highest Priced Items

In [46]:
# Rank the de-duplicated item table from most to least expensive
highest_priced = no_dup_items.sort_values(by='Price', ascending=False)
# Show the 18 highest-priced items
highest_priced.loc[:, ['Item ID', 'Item Name', 'Price']].head(18)

Unnamed: 0,Item ID,Item Name,Price
657,32,Orenmir,4.95
670,177,"Winterthorn, Defender of Shifting Worlds",4.89
716,103,Singed Scalpel,4.87
336,173,Stormfury Longsword,4.83
419,42,The Decapitator,4.82
436,131,Fury,4.82
398,96,Blood-Forged Skeletal Spine,4.77
455,137,"Aetherius, Boon of the Blessed",4.75
686,46,Hopeless Ebon Dualblade,4.75
743,134,Undead Crusader,4.67


## Other Analysis - Lowest Priced

In [47]:
# Rank the de-duplicated item table from least to most expensive
lowest_priced = no_dup_items.sort_values(by='Price')
# Show the 18 lowest-priced items
lowest_priced.loc[:, ['Item ID', 'Item Name', 'Price']].head(18)

Unnamed: 0,Item ID,Item Name,Price
667,15,Soul Infused Crystal,1.03
771,25,Hero Cane,1.03
624,95,Singed Onyx Warscythe,1.03
723,69,"Frenzy, Defender of the Harvest",1.06
430,74,Yearning Crusher,1.06
720,82,Nirvana,1.11
774,123,Twilight's Carver,1.14
647,156,Soul-Forged Steel Shortsword,1.16
467,41,Orbit,1.16
756,6,Rusty Skull,1.2


## Other Analysis - Gender Purchase Total %s

In [48]:
# Re-display the per-gender purchasing table with every money column as dollars
gender_table_formats = dict.fromkeys(
    ['Total Purchase Value', 'Average Purchase Price', 'Normalized Totals'],
    '${:.2f}',
)
pur_analysis_gen.style.format(gender_table_formats)

Unnamed: 0_level_0,# of Purchases,Total Purchase Value,Average Purchase Price,Normalized Totals
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,136,$382.91,$2.82,$3.83
Male,633,$1867.68,$2.95,$4.02
Other / Non-Disclosed,11,$35.74,$3.25,$4.47


In [69]:
# Fraction of total revenue (total_rev) contributed by each gender
percent_total_gen = pur_analysis_gen['Total Purchase Value'].div(total_rev)
percent_total_gen

Gender
Female                   0.167478
Male                     0.816890
Other / Non-Disclosed    0.015632
Name: Total Purchase Value, dtype: float64