### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [2]:
# Dependencies and Setup
import pandas as pd

# Location of the purchase data CSV, relative to the notebook (adjust if the file moves)
file = "Resources/purchase_data.csv"

# Load every purchase record into a DataFrame;
# skipinitialspace=True discards blanks that directly follow a delimiter
main_df = pd.read_csv(filepath_or_buffer=file, skipinitialspace=True)

# Header labels, kept as a plain list for quick inspection of the column layout
column_names = main_df.columns.tolist()
# column 0 = purchase ID, 1 = SN(in game name), 2 = age, 3 = gender, 4 = item id, 5 = item name, 6 = price
# may have to clean up header

# you want to use the original dataframe and not the new one
# this is looking at purchases not unique players so the filtered dataset would be incomplete
# counting with gender will get the purchase count
# Purchase Count	Average Purchase Price	Total Purchase Value	Avg Total Purchase per Person
# ex male_purchases = purchase_data.loc[purchase_data.Gender == 'Male', 'Gender'].count()

#testing with the males first
#average total purchase per person

#m_avg_purchase_df = male_df.groupby(["SN"])

# examples


# these are bins see link https://www.codespeedy.com/binning-or-bucketing-of-column-in-pandas-using-python/
# these will be placed in their own dataframe 
# cut based on these ages: <10, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40+
# make percentages based on this (use total players again)
#original_bins = [9,13,19,24,29,34,39,150]
# doing this for now because no open-ended (greater-than / less-than) binning option is known at the moment; subject to change
#bins = pd.IntervalIndex.from_arrays([0,9],[10,13],[15,19],[20,24],[25,29],[30,34],[35,39],[40,150])
#age_df = pd.cut(main_df['Age'],original_bins)
#age_df.head()

## Player Count

* Display the total number of players


In [5]:
# A player is identified by their in-game name (the SN column), and the same
# player can appear on several purchase rows, so count distinct SN values.
# The comparison is case sensitive; missing values (if any) count as one entry.
total_players = main_df["SN"].nunique(dropna=False)

# Wrap the figure in a one-row frame so it renders as a table
player_count_dict = {"Total Players": total_players}
player_count_df = pd.DataFrame(data=player_count_dict, index=[0])
player_count_df

Unnamed: 0,Total Players
0,576


## Purchasing Analysis (Total)

* Run basic calculations to obtain number of unique items, average price, etc.


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame


In [6]:
# Distinct items that appear in the purchase log
unique_items = main_df["Item ID"].nunique(dropna=False)

# Every row is one purchase, so the row count is the purchase count
num_of_purchases = len(main_df.index)

# Mean price over all purchases, rounded to two decimal places
average_revenue = round(main_df["Price"].mean(), 2)

# Sum of all prices; left unrounded
total_revenue = main_df["Price"].sum()

# Column order of the summary table follows the key order below
purchasing_analysis_dict = {
    "Number of Unique Items": unique_items,
    "Average Price": average_revenue,
    "Number of Purchases": num_of_purchases,
    "Total Revenue": total_revenue,
}

# Single-row summary table
purchasing_analysis_df = pd.DataFrame(data=purchasing_analysis_dict, index=[0])
purchasing_analysis_df

Unnamed: 0,Number of Unique Items,Average Price,Number of Purchases,Total Revenue
0,179,3.05,780,2379.77


## Gender Demographics

* Percentage and Count of Male Players


* Percentage and Count of Female Players


* Percentage and Count of Other / Non-Disclosed




In [8]:
# Reduce the purchase log to one row per distinct (player, gender) pair so that
# players with several purchases are only counted once
gender_demo = main_df[["SN", "Gender"]]
gender_checker = gender_demo.drop_duplicates()

# Head-count per gender: number of de-duplicated rows matching each label
male_players = (gender_checker["Gender"] == "Male").sum()
female_players = (gender_checker["Gender"] == "Female").sum()
other_players = (gender_checker["Gender"] == "Other / Non-Disclosed").sum()

# Share of all unique players (total_players comes from the Player Count cell),
# formatted as a percentage string with two decimals
percent_of_m = f"{male_players / total_players:.2%}"
percent_of_f = f"{female_players / total_players:.2%}"
percent_of_o = f"{other_players / total_players:.2%}"

# Row labels and column values, all in the same male / female / other order
gender_index = ["Male", "Female", "Other/Non-Disclosed"]
total_counts_list = [male_players, female_players, other_players]
percent_list = [percent_of_m, percent_of_f, percent_of_o]

gender_dict = {
    "Total Count": total_counts_list,
    "Percentage of Players": percent_list,
}

summary_gender_df = pd.DataFrame(data=gender_dict, index=gender_index)
summary_gender_df


Unnamed: 0,Total Count,Percentage of Players
Male,484,84.03%
Female,81,14.06%
Other/Non-Disclosed,11,1.91%



## Purchasing Analysis (Gender)

* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. by gender




* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [14]:
# Purchases (not unique players) are the unit of analysis here, so every figure
# is derived from the full main_df rather than from the de-duplicated player frame.
# Each gender's purchase rows go into their own frame so that the purchase count,
# average purchase price, total purchase value and average total purchase per
# person can be analyzed separately.

#-----------
# male
# expected values are 652, 3.02, 1967.64, 4.07
# (652 purchases made by the 484 unique male players)
# select by column label rather than position so a reordered file cannot
# silently filter on the wrong column
m_df = main_df[main_df['Gender'] == 'Male']

# calculate the summary data
m_count = m_df['Purchase ID'].count()
m_avg = round(m_df['Price'].mean(), 2)
m_sum = m_df['Price'].sum()

# average total purchase per person: add up each player's prices (group by SN),
# then take the mean of those per-player totals.
# reset_index turns the grouped result back into a regular SN / Price frame.
m_avg_purchase_df = m_df.groupby('SN')['Price'].sum().reset_index()
m_avg_purchase = round(m_avg_purchase_df['Price'].mean(), 2)

#-----------
# female
# expected values are 113, 3.20, 361.94, 4.47
f_df = main_df[main_df['Gender'] == 'Female']

f_count = f_df['Purchase ID'].count()
f_avg = round(f_df['Price'].mean(), 2)
f_sum = f_df['Price'].sum()

f_avg_purchase_df = f_df.groupby('SN')['Price'].sum().reset_index()
f_avg_purchase = round(f_avg_purchase_df['Price'].mean(), 2)
# sanity check against the expected 4.47
print(f_avg_purchase)

#-------------
# other/nondisclosed
# expected values are 15, 3.35, 50.19, 4.56
o_df = main_df[main_df['Gender'] == 'Other / Non-Disclosed']

o_count = o_df['Purchase ID'].count()
o_avg = round(o_df['Price'].mean(), 2)
o_sum = o_df['Price'].sum()

o_avg_purchase_df = o_df.groupby('SN')['Price'].sum().reset_index()
o_avg_purchase = round(o_avg_purchase_df['Price'].mean(), 2)


# expected outcome has the order female, male, other, so a new index is needed
gender_index2 = ["Female", "Male", "Other/Non-Disclosed"]
gender_analysis_dict = {"Purchase Count": [f_count, m_count, o_count],
                        "Average Purchase Price": [f_avg, m_avg, o_avg],
                        "Total Purchase Value": [f_sum, m_sum, o_sum],
                        "Avg Total Purchase per Person": [f_avg_purchase, m_avg_purchase, o_avg_purchase]}
gender_analysis_df = pd.DataFrame(gender_analysis_dict, index=gender_index2)
gender_analysis_df

4.47


Unnamed: 0,Purchase Count,Average Purchase Price,Total Purchase Value,Avg Total Purchase per Person
Female,113,3.2,361.94,4.47
Male,652,3.02,1967.64,4.07
Other/Non-Disclosed,15,3.35,50.19,4.56


## Age Demographics

* Establish bins for ages


* Categorize the existing players using the age bins. Hint: use pd.cut()


* Calculate the numbers and percentages by age group


* Create a summary data frame to hold the results


* Optional: round the percentage column to two decimal points


* Display Age Demographics Table


In [None]:
# these are bins see link https://www.codespeedy.com/binning-or-bucketing-of-column-in-pandas-using-python/
# these will be placed in their own dataframe 
# cut based on these ages: <10, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40+
# make percentages based on this (use total players again)


## Purchasing Analysis (Age)

* Bin the purchase_data data frame by age


* Run basic calculations to obtain purchase count, avg. purchase price, avg. purchase total per person etc. in the table below


* Create a summary data frame to hold the results


* Optional: give the displayed data cleaner formatting


* Display the summary data frame

In [None]:
# just like gender, use the original data frame
# bin it by age to move the rows around into a new data frame 
# calculate accordingly 

## Top Spenders

* Run basic calculations to obtain the results in the table below


* Create a summary data frame to hold the results


* Sort the total purchase value column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [None]:
# count how many times a repeating sn reappears
# place those names into a list 
# start a new dataframe with those list and the necessary columns
# make a column holding these value for purchase count 
# average and total those rows values 
# place those values into the new data frame


## Most Popular Items

* Retrieve the Item ID, Item Name, and Item Price columns


* Group by Item ID and Item Name. Perform calculations to obtain purchase count, average item price, and total purchase value


* Create a summary data frame to hold the results


* Sort the purchase count column in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the summary data frame



In [None]:
# intended output is the top 5 items, taken from the main data set
# same as with player names: take the item names, count them and place the new values in a new dataframe

## Most Profitable Items

* Sort the above table by total purchase value in descending order


* Optional: give the displayed data cleaner formatting


* Display a preview of the data frame



In [None]:
# sort the new dataframe by total purchase value 
# display the new data frame