In [1]:
# Dependencies
import pandas as pd

In [2]:
# Save file path to variable
purchase_data_csv = 'Resources/purchase_data.csv'

In [3]:
# Read with Pandas
heroes_df = pd.read_csv(purchase_data_csv)
heroes_df.head()


Unnamed: 0,Purchase ID,SN,Age,Gender,Item ID,Item Name,Price
0,0,Lisim78,20,Male,108,"Extraction, Quickblade Of Trembling Hands",3.53
1,1,Lisovynya38,40,Male,143,Frenzied Scimitar,1.56
2,2,Ithergue48,24,Male,92,Final Critic,4.88
3,3,Chamassasya86,24,Male,100,Blindscythe,3.27
4,4,Iskosia90,23,Male,131,Fury,1.44


In [4]:
# Player Count: Total Number of Players
player_demo_df = heroes_df.loc[:,['SN', 'Age','Gender']]
player_demo_df = player_demo_df.drop_duplicates()
total_players = player_demo_df.count()[0]
total_players_dict = [{"Total Players":total_players}]
total_players_df = pd.DataFrame(total_players_dict)
total_players_df

Unnamed: 0,Total Players
0,576


In [5]:
# Purchasing Analysis (Total)

# Number of Unique Items
items = heroes_df['Item Name'].nunique()

# Total Number of Purchases
total_number_purchases = heroes_df['Purchase ID'].count()

# Total Revenue
total_revenue = heroes_df['Price'].sum()

# Average Purchase Price
average_purchase_price = total_revenue/total_number_purchases

# organize and format the output
purchase_analysis_list = [{"Number of Unique Items": items,
                           "Average Price": average_purchase_price,
                           "Number of Purchases": total_number_purchases,
                            "Total Revenue": total_revenue }]
purchase_analysis_df = pd.DataFrame(purchase_analysis_list)

purchase_analysis_df['Average Price'] = purchase_analysis_df['Average Price'].astype(float).map("${:,.2f}".format)
purchase_analysis_df['Total Revenue'] = purchase_analysis_df['Total Revenue'].astype(float).map("${:,.2f}".format)

purchase_analysis_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,$3.05,780,"$2,379.77"


In [6]:
# Gender Demographics
gender_summary = player_demo_df['Gender'].value_counts()

gender_percent = gender_summary/total_players*100

gender_summary_df = pd.DataFrame({"Total Count":gender_summary,
                                 "Percent of Players":gender_percent})

gender_summary_df['Percent of Players'] = gender_summary_df['Percent of Players'].astype(float).map("{0:.2f}%".format)

gender_summary_df


Unnamed: 0,Total Count,Percent of Players
Male,484,84.03%
Female,81,14.06%
Other / Non-Disclosed,11,1.91%


In [8]:
# Purchasing Analysis (Gender)
# the below, each broken by gender
gender_df = heroes_df.loc[:,['Purchase ID','SN', 'Age','Gender', 'Price']]
gender_df

#group by

#gender_group = gender_df.groupby(['Purchase ID']).count()['Gender']
#gender_group.head()

Unnamed: 0,Purchase ID,SN,Age,Gender,Price
0,0,Lisim78,20,Male,3.53
1,1,Lisovynya38,40,Male,1.56
2,2,Ithergue48,24,Male,4.88
3,3,Chamassasya86,24,Male,3.27
4,4,Iskosia90,23,Male,1.44
...,...,...,...,...,...
775,775,Aethedru70,21,Female,3.54
776,776,Iral74,21,Male,1.63
777,777,Yathecal72,20,Male,3.46
778,778,Sisur91,7,Male,4.19


In [9]:
gender_purchases_count = gender_df.groupby(["Gender"])

gender_purchases_count.head()

#gender_group.head()
#gender_purchases_count.count()

Unnamed: 0,Purchase ID,SN,Age,Gender,Price
0,0,Lisim78,20,Male,3.53
1,1,Lisovynya38,40,Male,1.56
2,2,Ithergue48,24,Male,4.88
3,3,Chamassasya86,24,Male,3.27
4,4,Iskosia90,23,Male,1.44
9,9,Chanosian48,35,Other / Non-Disclosed,3.58
15,15,Lisassa64,21,Female,2.89
18,18,Reunasu60,22,Female,4.9
22,22,Siarithria38,38,Other / Non-Disclosed,3.81
38,38,Reulae52,10,Female,4.18


In [10]:
# Purchase Count: 
gender_purchases = gender_purchases_count.count()['Purchase ID']  

print(gender_purchases)

Gender
Female                   113
Male                     652
Other / Non-Disclosed     15
Name: Purchase ID, dtype: int64


In [11]:
## Total Purchase Value
total_purchase_value = gender_df.groupby(["Gender"]).sum()['Price']

total_purchase_value


Gender
Female                    361.94
Male                     1967.64
Other / Non-Disclosed      50.19
Name: Price, dtype: float64

In [12]:
## Average Purchase Price
average_purchase_price = gender_df.groupby(['Gender']).mean()['Price']

average_purchase_price

Gender
Female                   3.203009
Male                     3.017853
Other / Non-Disclosed    3.346000
Name: Price, dtype: float64

In [16]:
## Average Purchase Total Per Person

#average_person_purchase = total_purchase_value / gender_purchases

#average_person_purchase
# Ruchi

average_total_purchase_per_person = total_purchase_value /gender_summary_df['Total Count']

#average_total_purchase_per_person.head()


In [17]:
# Add to the summary dataframe 
purchase_by_gender_dict = {'Purchase Count':gender_purchases,
                          'Average Purchase Price':average_purchase_price,
                          'Total Purchase Value': total_purchase_value,
                          'Average Total Purchase per Person': average_total_purchase_per_person}



purchase_by_gender_summary_df = pd.DataFrame(purchase_by_gender_dict)

purchase_by_gender_summary_df['Average Purchase Price'] = purchase_by_gender_summary_df['Average Purchase Price'].astype(float).map("${:,.2f}".format)
purchase_by_gender_summary_df['Total Purchase Value'] = purchase_by_gender_summary_df['Total Purchase Value'].astype(float).map("${:,.2f}".format)
purchase_by_gender_summary_df['Average Total Purchase per Person'] = purchase_by_gender_summary_df['Average Total Purchase per Person'].astype(float).map("${:,.2f}".format)

purchase_by_gender_summary_df

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Average Total Purchase per Person
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,113,$3.20,$361.94,$4.47
Male,652,$3.02,"$1,967.64",$4.07
Other / Non-Disclosed,15,$3.35,$50.19,$4.56


In [18]:
# Age Demographics
# The below, each broken into bins of 4 years (i.i < 10, 10-14, 15-19, etc.)

bins = [0,9.9,14.9,19.9,24.9,29.9,34.9,39.9, 100]  #note to self, max age is 45

group_names = ["under 10","10-14","15-19","20-24", "25-29","30-34","35-39","40+"]

heroes_df["Age Groups"] = pd.cut(heroes_df["Age"], bins, labels=group_names, include_lowest=True)
heroes_df

# TODO ... the order should be by Age Group... what happened?

age_summary = heroes_df['Age Groups'].value_counts()

age_percent = age_summary/total_players*100

age_summary_df = pd.DataFrame({"Total Count":age_summary,
                                 "Percent of Players":age_percent})

age_summary_df['Percent of Players'] = age_summary_df['Percent of Players'].astype(float).map("{0:.2f}%".format)


age_summary_df.sort_index()  # otherwise use .sort_values('column name') to sort by a column

Unnamed: 0,Total Count,Percent of Players
under 10,23,3.99%
10-14,28,4.86%
15-19,136,23.61%
20-24,365,63.37%
25-29,101,17.53%
30-34,73,12.67%
35-39,41,7.12%
40+,13,2.26%


In [20]:
#age_summary_df['Total Count']

In [21]:
# Purchase Count
age_purchase_count =  heroes_df.groupby(['Age Groups']).count()['Price']
age_purchase_count

Age Groups
under 10     23
10-14        28
15-19       136
20-24       365
25-29       101
30-34        73
35-39        41
40+          13
Name: Price, dtype: int64

In [22]:
# Average Purchase Price
age_average_purch_price = heroes_df.groupby(['Age Groups']).mean()['Price']
age_average_purch_price

# Total Purchase Value
age_total_purchase_value = heroes_df.groupby(['Age Groups']).sum()['Price']
age_total_purchase_value

## Average Purchase Total Per Person By Age Group
age_average_person_purchase = age_total_purchase_value / age_summary_df['Total Count']

under_10 = heroes_df.loc[heroes_df['Age Groups'] == "under 10",:].count()[0]
age_10_14 = heroes_df.loc[heroes_df['Age Groups'] == "10-14",:].count()[0]
age_15_19 = heroes_df.loc[heroes_df['Age Groups'] == "15-19",:].count()[0]
age_20_24 = heroes_df.loc[heroes_df['Age Groups'] == "20-24",:].count()[0]
age_25_29 = heroes_df.loc[heroes_df['Age Groups'] == "25-29",:].count()[0]
age_30_34 = heroes_df.loc[heroes_df['Age Groups'] == "30-34",:].count()[0]
age_35_39 = heroes_df.loc[heroes_df['Age Groups'] == "35-39",:].count()[0]
age_40_plus = heroes_df.loc[heroes_df['Age Groups'] == "40+",:].count()[0]


age_summary_dict = {"Purchase Count": [under_10, age_10_14, age_15_19,age_20_24,age_25_29,age_30_34,age_35_39,age_40_plus],
                   "Average Purchase Price":age_average_purch_price,
                   "Total Purchase Value": age_total_purchase_value,
                   "Average Purchase Total per Person":age_average_person_purchase}

age_summary_df = pd.DataFrame(age_summary_dict)
age_summary_df['Average Purchase Price'] = age_summary_df['Average Purchase Price'].astype(float).map("${:,.2f}".format)
age_summary_df['Total Purchase Value'] = age_summary_df['Total Purchase Value'].astype(float).map("${:,.2f}".format)
age_summary_df['Average Purchase Total per Person'] = age_summary_df['Average Purchase Total per Person'].astype(float).map("${:,.2f}".format)

age_summary_df

Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Average Purchase Total per Person
under 10,23,$3.35,$77.13,$3.35
10-14,28,$2.96,$82.78,$2.96
15-19,136,$3.04,$412.89,$3.04
20-24,365,$3.05,"$1,114.06",$3.05
25-29,101,$2.90,$293.00,$2.90
30-34,73,$2.93,$214.00,$2.93
35-39,41,$3.60,$147.67,$3.60
40+,13,$2.94,$38.24,$2.94


In [25]:
# Top Spenders
###NOTE: the solution shows all of the calculations in the same table... but the Average Purchase 
### Price for Top Spenders doesn't match with the SNs for the other values... the listing is different.

# Identify the top 5 spenders in the game by total purchase value, then list (in a table):
spenders_total_purchase_value = heroes_df.groupby("SN").sum()['Price']
spenders_total_purchase_value.head(10)

SN
Adairialis76    2.28
Adastirin33     4.48
Aeda94          4.91
Aela59          4.32
Aelaria33       1.79
Aelastirin39    7.29
Aelidru27       1.09
Aelin32         8.98
Aelly27         6.79
Aellynun67      3.74
Name: Price, dtype: float64

In [26]:
spenders_purchase_count =  heroes_df.groupby(['SN']).count()['Purchase ID']
#spenders_purchase_count.head(10)

spenders_purchase_mean =  heroes_df.groupby(['SN']).mean()['Price']
spenders_purchase_mean

SN
Adairialis76     2.280000
Adastirin33      4.480000
Aeda94           4.910000
Aela59           4.320000
Aelaria33        1.790000
                   ...   
Yathecal82       2.073333
Yathedeu43       3.010000
Yoishirrala98    4.580000
Zhisrisu83       3.945000
Zontibe81        2.676667
Name: Price, Length: 576, dtype: float64

In [36]:
# Average Purchase Price ... calculate this before resorting and indexing all of the things
spenders_average_purchase_price = spenders_total_purchase_value / spenders_purchase_count

spenders_average_purchase_price_df = pd.DataFrame({'Purchase Count':spenders_purchase_count,
                                                    'Average Purchase Price':spenders_average_purchase_price,
                                                    'Total Purchase Value': spenders_total_purchase_value})
spenders_average_purchase_price_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 576 entries, Adairialis76 to Zontibe81
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Purchase Count          576 non-null    int64  
 1   Average Purchase Price  576 non-null    float64
 2   Total Purchase Value    576 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 18.0+ KB


In [35]:
spenders_average_purchase_price_df = spenders_average_purchase_price_df.sort_values(['Average Purchase Price'], ascending=False)
#### QUESTION:  the formatting line below takes it out of table format... why?
spenders_average_purchase_price_df = spenders_average_purchase_price_df['Average Purchase Price'].astype(float).map("${:,.2f}".format)
#spenders_average_purchase_price_df = spenders_average_purchase_price_df['Total Purchase Value'].astype(float).map("${:,.2f}".format)

spenders_average_purchase_price_df.sort_values(by=['Average Purchase Price'], ascending=False)
#### sort_values('spenders_total_purchase_value') by 'Spender Total Purchase Value'
spenders_average_purchase_price_df.head()  #tail()

TypeError: unhashable type: 'list'

In [None]:
# now sort and index the things to get the top five
spenders_df = spenders_total_purchase_value.reset_index()

spenders_df = spenders_df.sort_values(['Price'], ascending=False)
spenders_df = spenders_df[['SN','Price']]
spenders_df = spenders_df.rename(columns={'Price':'Total Purchase Value'})
spenders_df['Total Purchase Value'] = spenders_df['Total Purchase Value'].astype(float).map("${:,.2f}".format)
spenders_df.head()

In [None]:
# Purchase Count

spenders_purchase_count_df = spenders_purchase_count.reset_index()

spenders_purchase_count_df = spenders_purchase_count_df.sort_values(['Purchase ID'], ascending=False)

spenders_purchase_count_df = spenders_purchase_count_df.rename(columns={'Purchase ID':'Purchase Count'})

spenders_purchase_count_df.head()


In [None]:
# Most Popular Items
# Identify the 5 most popular items by purchase count, then list (in a table):
#  Item ID
#  Item Name
#  Purchase Count
#  Item Price
#  Total Purchase Value

heroes_df.head()
most_popular_item_df = heroes_df.loc[:,['Purchase ID', 'Item ID','Item Name','Price']]

most_popular_total_purchase_value = most_popular_item_df.groupby(['Item ID']).sum()['Price']

most_popular_total_purchase_value  #179 items


In [None]:
most_popular_count = pd.DataFrame(heroes_df.groupby("Item ID")['Item Name'].count())
most_popular_count = most_popular_count.rename(columns={'Purchase ID':'Purchase Count'})

# count
# sum of price
# mean of price

most_popular_count['Total Purchase Value'] = most_popular_total_purchase_value

most_popular_count   #179 items




In [None]:
most_popular_df = most_popular_count.join(heroes_df, on='Item ID', how='left', lsuffix='_l')

most_popular_df = most_popular_df.loc[:,['Item Name','Purchase Count', 'Price','Total Purchase Value']]

most_popular_df = most_popular_df.sort_values('Total Purchase Value', ascending=False)

most_popular_df.head()

In [None]:

most_popular_df = most_popular_df.rename(columns={'Purchase ID_l':'Purchase Count',
                                                 'Price':'Item Price'})

org_most_popular_df = most_popular_df[['Item ID','Item Name','Purchase Count','Item Price','Total Purchase Value']]
org_most_popular_df.rename(index={'Item ID':'Index'})
org_most_popular_df = org_most_popular_df.sort_values('Purchase Count', ascending=False)
org_most_popular_df.head()

In [None]:
# Most Profitable Items
# Identify the 5 most profitable items by total purchase value, then list (in a table)
#  Item ID
#  Item Name
#  Purchase Count
#  Item Price
#  Total Purchase Value

org_most_popular_df = org_most_popular_df.sort_values('Total Purchase Value', ascending=False)
org_most_popular_df