# 01-Exploratory
* The purpose of this notebook is to explore and work through a rough strategy to clean and sort the data. 
* There is no need for a full ETL pipeline in this notebook.
* Once a clear and clean strategy is in place, we will recreate it in the ETL notebook. 
* Below is a picture of what the database structure should look like.
---
![playerDB](../DB/PlayerDB.png)

### Dependencies

In [164]:
# Import dependencies
import pandas as pd
# For data base creation
from sqlalchemy import create_engine
# from config import db_password
import os
import os.path, sys
import glob
import csv
import chardet

### <a name='tournaments'></a>Tournament Data

In [165]:
file = '../../atp-world-tour-tennis-data/csv/1_tournaments/'

# List the files that sit directly inside the tournaments folder.
# os.walk yields names in arbitrary, filesystem-dependent order, so sort them
# to keep the row order of the combined frame reproducible between runs.
path, dirs, files = next(os.walk(file))
files = sorted(files)
file_count = len(files)

# Read every file into its own frame before combining them.
dataframes_list = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    # Detect the encoding per file from its first 100,000 bytes instead of
    # assuming a single encoding for the whole folder.
    with open(file_path, 'rb') as rawdata:
        result = chardet.detect(rawdata.read(100000))
    # header=None: the first line is treated as data, so columns get integer labels.
    temp_df = pd.read_csv(file_path, encoding=result['encoding'], header=None)
    dataframes_list.append(temp_df)

# Stack all files into one frame; each file keeps its own row labels,
# so the index of the result is not unique.
df = pd.concat(dataframes_list)

print(len(df))
df.head()

4865


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1877-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1877.07.09,1877,7.0,...,spencer-gore,gi91,,,,,,,,
1,1878-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1878.07.08,1878,7.0,...,frank-hadow,hg50,,,,,,,,
2,1879-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1879.07.07,1879,7.0,...,john-hartley,hg35,,,,,,,,
3,1880-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1880.07.05,1880,7.0,...,john-hartley,hg35,,,,,,,,
4,1881-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1881.07.02,1881,7.0,...,william-renshaw,rg71,,,,,,,,


In [166]:
# Show every column label as a plain list; the labels are integers because
# the files were read with header=None.
df.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30]

In [167]:
# Keep the nine columns used in the rest of this section.
# Selecting the wanted labels (rather than dropping the other 22 in place)
# returns a new frame and avoids the KeyError that an in-place drop raises
# when the cell is run a second time.
df = df.loc[:, [0, 2, 3, 6, 7, 11, 13, 14, 22]]
df.head()

Unnamed: 0,0,2,3,6,7,11,13,14,22
0,1877-540,Grand Slam,Wimbledon,"London, Great Britain",1877.07.09,32,Outdoor,Grass,gi91
1,1878-540,Grand Slam,Wimbledon,"London, Great Britain",1878.07.08,64,Outdoor,Grass,hg50
2,1879-540,Grand Slam,Wimbledon,"London, Great Britain",1879.07.07,64,Outdoor,Grass,hg35
3,1880-540,Grand Slam,Wimbledon,"London, Great Britain",1880.07.05,64,Outdoor,Grass,hg35
4,1881-540,Grand Slam,Wimbledon,"London, Great Britain",1881.07.02,64,Outdoor,Grass,rg71


In [169]:

# Reorder the columns so that column 7 (the date) comes before column 6
# (the location), matching the order of the names in the header file.
df = df.reindex(columns=[0,2,3,7,6,11,13,14,22])

# The header file holds only the column names for the tournament table.
# (A bare df.head() used to sit between these two steps; a notebook only
# displays the last expression of a cell, so it was a no-op and was removed.)
header_df = pd.read_csv("../temp/Tourney_header.csv")
header_df.head()


Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id


In [171]:
# Label the reordered columns with the names read from the header file,
# then report how many columns the frame has.
df.columns = header_df.columns
print(df.shape[1])
df.head()

9


Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam,Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam,Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam,Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam,Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam,Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


In [172]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Turney_id     object
Name          object
temp          object
Date          object
Location      object
Draw_size      int64
Conditions    object
Surface       object
Winner_id     object
dtype: object

In [173]:
# Combine the "Name" and "temp" columns into a single "Name" value,
# joined by a hyphen (e.g. "Grand Slam-Wimbledon").
df["Name"] = df["Name"].str.cat(df["temp"], sep="-")
df.head()

Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam-Wimbledon,Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam-Wimbledon,Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam-Wimbledon,Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam-Wimbledon,Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam-Wimbledon,Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


In [176]:
# Remove the "temp" column; its text is already part of "Name".
df = df.drop('temp', axis=1)
df.head()


Unnamed: 0,Turney_id,Name,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam-Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam-Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam-Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam-Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam-Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


### <a name="match-score-data"></a> Match Score Data

### Match Stats Data

## Working code