# 01-Exploratory
* The purpose of this notebook is to explore and work through a rough strategy to clean and sort the data. 
* No need for a full ETL pipeline at this notebook.
* Once a clear and clean strategy is in place, we will recreate it in the ETL notebook. 
* Below is a Picture of what the database structure should look like.
---
![playerDB](../DB/PlayerDB.png)

### Dependencies

In [87]:
# Import dependencies
import pandas as pd
# For data base creation
from sqlalchemy import create_engine
# from config import db_password
import os
import os.path, sys
import glob
import csv
import chardet

### <a name='tournaments'></a>Tournament Data

In [165]:
file = '../../atp-world-tour-tennis-data/csv/1_tournaments/'

# assign path
path, dirs, files = next(os.walk(file))
file_count = len(files)
# create empty list
dataframes_list = []
 
# append datasets to the list
for i in range(file_count):
    with open(file+files[i], 'rb') as rawdata:
        result = chardet.detect(rawdata.read(100000))
    temp_df = pd.read_csv(file+files[i], encoding=result['encoding'], header=None)
    dataframes_list.append(temp_df)

df = pd.concat(dataframes_list)

print(len(df))
df.head()

4865


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1877-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1877.07.09,1877,7.0,...,spencer-gore,gi91,,,,,,,,
1,1878-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1878.07.08,1878,7.0,...,frank-hadow,hg50,,,,,,,,
2,1879-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1879.07.07,1879,7.0,...,john-hartley,hg35,,,,,,,,
3,1880-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1880.07.05,1880,7.0,...,john-hartley,hg35,,,,,,,,
4,1881-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1881.07.02,1881,7.0,...,william-renshaw,rg71,,,,,,,,


In [166]:
df.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30]

In [167]:
df.drop(columns=[1,4,5,8,9,10,12,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30], inplace=True)
df.head()

Unnamed: 0,0,2,3,6,7,11,13,14,22
0,1877-540,Grand Slam,Wimbledon,"London, Great Britain",1877.07.09,32,Outdoor,Grass,gi91
1,1878-540,Grand Slam,Wimbledon,"London, Great Britain",1878.07.08,64,Outdoor,Grass,hg50
2,1879-540,Grand Slam,Wimbledon,"London, Great Britain",1879.07.07,64,Outdoor,Grass,hg35
3,1880-540,Grand Slam,Wimbledon,"London, Great Britain",1880.07.05,64,Outdoor,Grass,hg35
4,1881-540,Grand Slam,Wimbledon,"London, Great Britain",1881.07.02,64,Outdoor,Grass,rg71


In [169]:

df = df.reindex(columns=[0,2,3,7,6,11,13,14,22])
df.head()
header_df = pd.read_csv("../temp/Tourney_header.csv")
header_df.head()


Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id


In [171]:
df.columns = header_df.columns
print(len(df.columns))
df.head()

9


Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam,Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam,Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam,Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam,Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam,Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


In [172]:
df.dtypes

Turney_id     object
Name          object
temp          object
Date          object
Location      object
Draw_size      int64
Conditions    object
Surface       object
Winner_id     object
dtype: object

In [173]:
#Combinding coloumns 2,3
df["Name"] = df["Name"]+"-"+ df["temp"]
df.head()

Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam-Wimbledon,Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam-Wimbledon,Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam-Wimbledon,Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam-Wimbledon,Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam-Wimbledon,Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


In [176]:
#drop Temp column
df = df.drop(columns =['temp'])
df.head()


Unnamed: 0,Turney_id,Name,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam-Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam-Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam-Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam-Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam-Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


### <a name="match-score-data"></a> Match Score Data

In [88]:
# finds informations and puts it in a df
file = '../../atp-world-tour-tennis-data/csv/2_match_scores/'

# assign path
path, dirs, files = next(os.walk(file))
file_count = len(files)
# create empty list
dataframes_list = []
 
# append datasets to the list
for i in range(file_count):
    temp_df = pd.read_csv(file+files[i], encoding='utf-8', header=None)
    dataframes_list.append(temp_df)

df_W = pd.concat(dataframes_list)
df_L = pd.concat(dataframes_list)
print(len(df_W))
df_W.head()
df_L.head()

223087


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,61 62 64,3.0,0.0,18.0,7.0,0.0,0.0,1877-540-NULL-4-1-gi91-mu62,
1,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,62 65 62,3.0,0.0,18.0,9.0,0.0,0.0,1877-540-NULL-3-1-gi91-hg42,
2,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,63 62 56 61,3.0,1.0,23.0,12.0,0.0,0.0,1877-540-NULL-2-3-gi91-lh23,
3,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,63 63 65,3.0,0.0,18.0,11.0,0.0,0.0,1877-540-NULL-2-2-hg42-mu66,
4,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,65 56 64 61,3.0,1.0,23.0,16.0,0.0,0.0,1877-540-NULL-2-1-mu62-e994,


In [22]:
df.iloc[0].tolist()

['1877-540',
 1,
 'Wimbledon',
 'wimbledon',
 '/en/scores/archive/wimbledon/540/1877/results',
 '1877.07.09',
 1877.0,
 7.0,
 9.0,
 '1877.07.19',
 1877.0,
 7.0,
 19.0,
 'USD',
 0.0,
 nan,
 'Finals',
 1,
 1,
 'Spencer Gore',
 'gi91',
 'spencer-gore',
 'William Marshall',
 'mu62',
 'william-marshall',
 nan,
 nan,
 '61 62 64',
 3.0,
 0.0,
 18.0,
 7.0,
 0.0,
 0.0,
 '1877-540-NULL-4-1-gi91-mu62',
 nan]

In [89]:
# dropping un needed winner stats
df_W.drop(columns=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,23,24,25,26,28,29,30,31,32,33], inplace=True)
df_W.head()

Unnamed: 0,0,16,20,27,34,35
0,1877-540,Finals,gi91,61 62 64,1877-540-NULL-4-1-gi91-mu62,
1,1877-540,Semi-Finals,gi91,62 65 62,1877-540-NULL-3-1-gi91-hg42,
2,1877-540,Quarter-Finals,gi91,63 62 56 61,1877-540-NULL-2-3-gi91-lh23,
3,1877-540,Quarter-Finals,hg42,63 63 65,1877-540-NULL-2-2-hg42-mu66,
4,1877-540,Quarter-Finals,mu62,65 56 64 61,1877-540-NULL-2-1-mu62-e994,


In [90]:
# Dropping un neededd Loser stats
df_L.drop(columns = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,20,21,22,24,25,26,28,29,30,31,32,33,35], inplace=True)
df_L.head()

Unnamed: 0,0,16,23,27,34
0,1877-540,Finals,mu62,61 62 64,1877-540-NULL-4-1-gi91-mu62
1,1877-540,Semi-Finals,hg42,62 65 62,1877-540-NULL-3-1-gi91-hg42
2,1877-540,Quarter-Finals,lh23,63 62 56 61,1877-540-NULL-2-3-gi91-lh23
3,1877-540,Quarter-Finals,mu66,63 63 65,1877-540-NULL-2-2-hg42-mu66
4,1877-540,Quarter-Finals,e994,65 56 64 61,1877-540-NULL-2-1-mu62-e994


In [91]:
# reorder columns to match headers
df_W =df_W.reindex(columns = [34,0,20,16,27,5])
df_L=df_L.reindex(columns=[34,0,23,16,27,5])
df_W.head()
df_L.head()

Unnamed: 0,34,0,23,16,27,5
0,1877-540-NULL-4-1-gi91-mu62,1877-540,mu62,Finals,61 62 64,
1,1877-540-NULL-3-1-gi91-hg42,1877-540,hg42,Semi-Finals,62 65 62,
2,1877-540-NULL-2-3-gi91-lh23,1877-540,lh23,Quarter-Finals,63 62 56 61,
3,1877-540-NULL-2-2-hg42-mu66,1877-540,mu66,Quarter-Finals,63 63 65,
4,1877-540-NULL-2-1-mu62-e994,1877-540,e994,Quarter-Finals,65 56 64 61,


In [92]:
# setting winner&Loser headers
df_W_Header = pd.read_csv("../temp/Match_score_header_W.csv")
df_L_Header = pd.read_csv("../temp/Match_score_header_L.csv")
print(df_W_Header)

Empty DataFrame
Columns: [Match_id, Tourney_id, Player_id, Round, Score, Player_result]
Index: []


In [93]:
# connecting winner&loser headers
df_W.columns = df_W_Header.columns
df_L.columns = df_L_Header.columns
df_W.head()
df_L.head()

Unnamed: 0,Match_id,Tourney_id,Player_id,Round,Score,Player_result
0,1877-540-NULL-4-1-gi91-mu62,1877-540,mu62,Finals,61 62 64,
1,1877-540-NULL-3-1-gi91-hg42,1877-540,hg42,Semi-Finals,62 65 62,
2,1877-540-NULL-2-3-gi91-lh23,1877-540,lh23,Quarter-Finals,63 62 56 61,
3,1877-540-NULL-2-2-hg42-mu66,1877-540,mu66,Quarter-Finals,63 63 65,
4,1877-540-NULL-2-1-mu62-e994,1877-540,e994,Quarter-Finals,65 56 64 61,


In [94]:
# Setting player_results
df_W['Player_result'] = pd.Series('Won', index=df_L.index)
df_L['Player_result'] = pd.Series('Lost', index=df_L.index)
df_W.head()
df_L.head()

Unnamed: 0,Match_id,Tourney_id,Player_id,Round,Score,Player_result
0,1877-540-NULL-4-1-gi91-mu62,1877-540,mu62,Finals,61 62 64,Lost
1,1877-540-NULL-3-1-gi91-hg42,1877-540,hg42,Semi-Finals,62 65 62,Lost
2,1877-540-NULL-2-3-gi91-lh23,1877-540,lh23,Quarter-Finals,63 62 56 61,Lost
3,1877-540-NULL-2-2-hg42-mu66,1877-540,mu66,Quarter-Finals,63 63 65,Lost
4,1877-540-NULL-2-1-mu62-e994,1877-540,e994,Quarter-Finals,65 56 64 61,Lost


In [1]:
num = len(df)
all_char = []
for i in df['Player_id']:
    char = df['Player_id']
    all_char.append(char)
lengths = [len(s) for s in all_char]
print(len(all_char))
print(sum(lengths))
print(sum(lengths) / len(all_char))

NameError: name 'df' is not defined

### Match Stats Data

## Working code