### Anonymise data

To protect the privacy of individual players, I am anonymising the data set. To do this I shift every timestamp by a fixed offset, create new unique player and tournament identifiers, and replace the buy-in level with a fixed value.

Changes have been made in such a way as to have no impact on the conclusions drawn by any analysis of the data.



In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import copy

In [2]:
# Load the tournament summaries that are about to be anonymised.
# The path is relative to the notebook's working directory.
ts=pd.read_csv('Tournament_summary.csv')

### Anonymise tournament summaries

In [3]:
# Overwrite every stake-related column with a single fixed value so that all
# rows carry the same buy-in, prize and rake figures.
ts = ts.assign(
    total_buyin=51,
    prize_buyin=50,
    rake_buyin=1,
    first_place_prize=100,
)

# These two columns are not kept in the anonymised table.
ts = ts.drop(columns=['day_of_week', 'time_zone'])

# Replace the tournament ids with a running number 1..len(ts).
ts['tournament_id'] = range(1, len(ts) + 1)

In [4]:
# Turn the raw date_time values into pandas Timestamps so they can be
# compared and sorted chronologically.
ts['date_time'] = ts['date_time'].apply(pd.Timestamp)

# Keep only the rows after the cut-off and put them in time order.
ts = (
    ts.loc[ts['date_time'] > pd.Timestamp('?????')]  #filter time deleted
      .sort_values(by='date_time')
)

# Renumber so the ids follow the chronological order of the remaining rows.
ts['tournament_id'] = range(1, len(ts) + 1)

In [5]:
# Fixed offset that the next cell subtracts from every timestamp.
# The real day/second values were removed before publishing, so the ?????
# placeholders must be filled in before this cell can run.
time_change=timedelta(days=?????,  seconds=?????)#days and seconds deleted

In [6]:
# Shift every timestamp by subtracting the fixed offset.
# Not idempotent: running this cell a second time shifts the times again.
ts['date_time']=ts['date_time']-time_change

In [7]:
# Use the shifted timestamp as the row index (the later resample relies on it).
ts = ts.set_index('date_time')

### Change names

In [9]:
# Sorted array of every distinct id that appears as a winner or a runner-up.
all_players = np.unique([*ts.first_place_id, *ts.second_place_id])

In [10]:
# Number of distinct players, i.e. how many fake names are needed.
len(all_players)

14483

In [11]:
# Import and initialise the fake-name generator.
# No seed is set here, so the generated names differ from run to run.
from faker import Faker
fake = Faker()

In [12]:
#create a pool of distinct fake names, one for every real player
# A set discards duplicate names as they are drawn, so the pool only has to be
# de-duplicated once instead of re-running np.unique over the whole list
# after every additional draw.
unique_names = set()
while len(unique_names) < len(all_players):
    unique_names.add(fake.name())

# np.unique returns the names in sorted order; the result is a plain list
# holding exactly len(all_players) distinct names.
funny_names2 = list(np.unique(list(unique_names)))


In [13]:
#name converter dictionary: real player id -> fake name
# all_players and funny_names2 are both produced by np.unique and are therefore
# sorted. Pairing them position by position would hand the k-th real id
# (alphabetically) the k-th fake name, so the alphabetical rank of every real
# id would still be readable from the published names. Shuffling the fake
# names first makes the pairing random.
# The shuffle is deliberately unseeded: the mapping should not be reproducible.
assert len(funny_names2) >= len(all_players), "not enough fake names for all players"
conv=zip(all_players, np.random.default_rng().permutation(funny_names2))
conv_dict=dict(conv)


In [14]:
# Swap each winner id for its fake name. The lookup is strict, so an id that
# is missing from conv_dict raises KeyError; re-running the cell on already
# replaced names is therefore not supported.
ts['first_place_id'] = list(map(conv_dict.__getitem__, ts['first_place_id']))

In [15]:
# Swap each runner-up id for its fake name. The lookup is strict, so an id
# that is missing from conv_dict raises KeyError.
ts['second_place_id'] = list(map(conv_dict.__getitem__, ts['second_place_id']))

### Player summary dataframe

In [17]:
## Build the per-player results table

# one row per distinct player name found in either finishing position
player_df = pd.DataFrame(
    {'player_id': np.unique([*ts.first_place_id, *ts.second_place_id])}
)

player_df.head()

Unnamed: 0,player_id
0,Aaron Barnes
1,Aaron Bell Jr.
2,Aaron Castillo
3,Aaron Cruz
4,Aaron Diaz


In [18]:
def get_game_count(player_id, ts=ts):
    """Count the tournaments in which a player finished first or second.

    Parameters
    ----------
    player_id
        Value compared against the ``first_place_id`` and ``second_place_id``
        columns.
    ts : DataFrame
        Table with ``first_place_id`` and ``second_place_id`` columns.
        Defaults to the module-level ``ts`` as it was when this cell was run.

    Returns
    -------
    Integer number of first-place rows plus second-place rows.
    """
    # Series.sum() counts the True values in compiled code, whereas the builtin
    # sum() walks the full-length Series one Python object at a time.
    first_places = (ts.first_place_id == player_id).sum()
    second_places = (ts.second_place_id == player_id).sum()
    return first_places + second_places

def get_total_profit(player_id, ts=ts):
    """Return a player's net result over all their tournaments.

    The result is the sum of ``first_place_prize`` for the tournaments the
    player won, minus the sum of ``total_buyin`` for every tournament the
    player finished first or second in, rounded to two decimals.

    Parameters
    ----------
    player_id
        Value compared against the ``first_place_id`` and ``second_place_id``
        columns.
    ts : DataFrame
        Tournament table; defaults to the module-level ``ts`` as it was when
        this cell was run.
    """
    is_winner = ts.first_place_id == player_id
    is_runner_up = ts.second_place_id == player_id
    played = ts[is_winner | is_runner_up]

    # the boolean factor zeroes the prize for games the player did not win
    prize_money = sum((played.first_place_id == player_id) * played.first_place_prize)
    buyins_paid = sum(played.total_buyin)
    return round(prize_money - buyins_paid, 2)

def get_avg_roi(player_id, ts=ts):
    """Return a player's average per-game return on investment, in percent.

    For every tournament the player finished first or second in, the payout
    ratio is ``first_place_prize / total_buyin`` when the player won and 0
    otherwise. The mean of those ratios, minus 1, times 100, is returned
    rounded to two decimals.

    The ROI is averaged per game; it is meant to be read together with the
    average stake and the game count when judging skill level.

    Parameters
    ----------
    player_id
        Value compared against the ``first_place_id`` and ``second_place_id``
        columns.
    ts : DataFrame
        Tournament table; defaults to the module-level ``ts`` as it was when
        this cell was run.
    """
    played = ts[ts.first_place_id.eq(player_id) | ts.second_place_id.eq(player_id)]
    won = played.first_place_id == player_id
    payout_ratio = (won * played.first_place_prize) / played.total_buyin
    mean_ratio = payout_ratio.mean()
    return round((mean_ratio - 1) * 100, 2)

def get_avg_stake(player_id, ts=ts):
    """Return the mean ``total_buyin`` over a player's tournaments.

    Only tournaments the player finished first or second in are included.
    The mean is rounded to two decimals.

    Parameters
    ----------
    player_id
        Value compared against the ``first_place_id`` and ``second_place_id``
        columns.
    ts : DataFrame
        Tournament table; defaults to the module-level ``ts`` as it was when
        this cell was run.
    """
    took_part = (ts.first_place_id == player_id) | (ts.second_place_id == player_id)
    return round(ts.loc[took_part, 'total_buyin'].mean(), 2)

In [19]:
#build player results dataframe
#slow cell: every helper builds a boolean mask over the whole of ts for each player
player_df = player_df.assign(**{
    'game_count': player_df.player_id.apply(get_game_count),
    'total_profit': player_df.player_id.apply(get_total_profit),
    'avg_roi_%': player_df.player_id.apply(get_avg_roi),
    'avg_stake': player_df.player_id.apply(get_avg_stake),
})

### Hourly traffic dataframe

In [20]:
def get_named_day_of_week(day):
    """Return the short lowercase name of a weekday number.

    Parameters
    ----------
    day : int
        Weekday number as returned by ``weekday()``: 0 is Monday, 6 is Sunday.

    Returns
    -------
    str
        One of 'mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun'.

    Raises
    ------
    KeyError
        If ``day`` is not one of 0..6.
    """
    weekday_dict={0: 'mon', 1: 'tue', 2: 'wed', 3: 'thur',
                  4: 'fri', 5:'sat' , 6: 'sun'}

    return weekday_dict[day]

In [21]:
#count the tournaments falling in each one-hour bucket of the date_time index
# NOTE(review): newer pandas releases appear to deprecate the 'H' alias in
# favour of 'h' - confirm against the installed version before changing it.
hourly_traffic = (
    ts.first_place_id
      .resample('H')
      .count()
      .to_frame(name='game_count')
)

#label every hourly bucket with its weekday name
hourly_traffic['day_of_week'] = [
    get_named_day_of_week(stamp.weekday()) for stamp in hourly_traffic.index
]

In [30]:
# Preview the anonymised tournament table (first five rows).
ts.head()

Unnamed: 0_level_0,tournament_id,total_buyin,prize_buyin,rake_buyin,first_place_id,second_place_id,first_place_prize,finishing_level
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-11-04 16:00:01,1,51,50,1,Mark Hunter,Juan Avery,100,1
2015-11-04 16:07:49,2,51,50,1,Michelle Wiley,Dana Brown,100,1
2015-11-04 16:12:36,3,51,50,1,Dana Brown,Richard Myers,100,2
2015-11-04 16:21:04,4,51,50,1,Dana Brown,Mary Campbell,100,1
2015-11-04 16:21:54,5,51,50,1,Jesse Myers,Jonathon Hernandez,100,3


In [23]:
# Preview the hourly game counts (first five rows).
hourly_traffic.head()

Unnamed: 0_level_0,game_count,day_of_week
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-11-04 16:00:00,20,wed
2015-11-04 17:00:00,15,wed
2015-11-04 18:00:00,14,wed
2015-11-04 19:00:00,27,wed
2015-11-04 20:00:00,36,wed


In [24]:
# Preview the per-player summary (first five rows).
player_df.head()

Unnamed: 0,player_id,game_count,total_profit,avg_roi_%,avg_stake
0,Aaron Barnes,1,49,96.08,51.0
1,Aaron Bell Jr.,3,-53,-34.64,51.0
2,Aaron Castillo,2,-102,-100.0,51.0
3,Aaron Cruz,1,-51,-100.0,51.0
4,Aaron Diaz,23,-573,-48.85,51.0


### Save dataframes

In [31]:
#save tables to csv
# player_df is written without its integer index; ts and hourly_traffic keep
# their date_time index as the first column.
player_df.to_csv('Player_summary.csv', index=False)
# NOTE(review): this writes to the same relative path that the load cell at the
# top of the notebook reads ('Tournament_summary.csv'), so the input file is
# replaced by the anonymised table. A second Run All would then start from the
# already-anonymised file - confirm this is intended, or keep the raw file
# under a different name.
ts.to_csv('Tournament_summary.csv')
hourly_traffic.to_csv('hourly_traffic.csv')