# 01-Exploratory
* The purpose of this notebook is to explore and work through a rough strategy to clean and sort the data. 
* No need for a full ETL pipeline at this notebook.
* Once a clear and clean strategy is in place, we will recreate it in the ETL notebook. 
* Below is a Picture of what the database structure should look like.
---
![playerDB](../DB/PlayerDB.png)

### Dependencies

In [1]:
# Import dependencies
import pandas as pd
# For data base creation
from sqlalchemy import create_engine
# from config import db_password
import os
import os.path, sys
import glob
import csv
import chardet
import re

### <a name='tournaments'></a>Tournament Data

In [165]:
file = '../../atp-world-tour-tennis-data/csv/1_tournaments/'

# assign path
path, dirs, files = next(os.walk(file))
file_count = len(files)
# create empty list
dataframes_list = []
 
# append datasets to the list
for i in range(file_count):
    with open(file+files[i], 'rb') as rawdata:
        result = chardet.detect(rawdata.read(100000))
    temp_df = pd.read_csv(file+files[i], encoding=result['encoding'], header=None)
    dataframes_list.append(temp_df)

df = pd.concat(dataframes_list)

print(len(df))
df.head()

4865


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1877-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1877.07.09,1877,7.0,...,spencer-gore,gi91,,,,,,,,
1,1878-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1878.07.08,1878,7.0,...,frank-hadow,hg50,,,,,,,,
2,1879-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1879.07.07,1879,7.0,...,john-hartley,hg35,,,,,,,,
3,1880-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1880.07.05,1880,7.0,...,john-hartley,hg35,,,,,,,,
4,1881-540,1,Grand Slam,Wimbledon,540.0,wimbledon,"London, Great Britain",1881.07.02,1881,7.0,...,william-renshaw,rg71,,,,,,,,


In [166]:
df.columns.tolist()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30]

In [167]:
df.drop(columns=[1,4,5,8,9,10,12,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30], inplace=True)
df.head()

Unnamed: 0,0,2,3,6,7,11,13,14,22
0,1877-540,Grand Slam,Wimbledon,"London, Great Britain",1877.07.09,32,Outdoor,Grass,gi91
1,1878-540,Grand Slam,Wimbledon,"London, Great Britain",1878.07.08,64,Outdoor,Grass,hg50
2,1879-540,Grand Slam,Wimbledon,"London, Great Britain",1879.07.07,64,Outdoor,Grass,hg35
3,1880-540,Grand Slam,Wimbledon,"London, Great Britain",1880.07.05,64,Outdoor,Grass,hg35
4,1881-540,Grand Slam,Wimbledon,"London, Great Britain",1881.07.02,64,Outdoor,Grass,rg71


In [169]:

df = df.reindex(columns=[0,2,3,7,6,11,13,14,22])
df.head()
header_df = pd.read_csv("../temp/Tourney_header.csv")
header_df.head()


Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id


In [171]:
df.columns = header_df.columns
print(len(df.columns))
df.head()

9


Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam,Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam,Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam,Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam,Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam,Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


In [172]:
df.dtypes

Turney_id     object
Name          object
temp          object
Date          object
Location      object
Draw_size      int64
Conditions    object
Surface       object
Winner_id     object
dtype: object

In [173]:
#Combinding coloumns 2,3
df["Name"] = df["Name"]+"-"+ df["temp"]
df.head()

Unnamed: 0,Turney_id,Name,temp,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam-Wimbledon,Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam-Wimbledon,Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam-Wimbledon,Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam-Wimbledon,Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam-Wimbledon,Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


In [176]:
#drop Temp column
df = df.drop(columns =['temp'])
df.head()


Unnamed: 0,Turney_id,Name,Date,Location,Draw_size,Conditions,Surface,Winner_id
0,1877-540,Grand Slam-Wimbledon,1877.07.09,"London, Great Britain",32,Outdoor,Grass,gi91
1,1878-540,Grand Slam-Wimbledon,1878.07.08,"London, Great Britain",64,Outdoor,Grass,hg50
2,1879-540,Grand Slam-Wimbledon,1879.07.07,"London, Great Britain",64,Outdoor,Grass,hg35
3,1880-540,Grand Slam-Wimbledon,1880.07.05,"London, Great Britain",64,Outdoor,Grass,hg35
4,1881-540,Grand Slam-Wimbledon,1881.07.02,"London, Great Britain",64,Outdoor,Grass,rg71


### <a name="match-score-data"></a> Match Score Data

In [88]:
# finds informations and puts it in a df
file = '../../atp-world-tour-tennis-data/csv/2_match_scores/'

# assign path
path, dirs, files = next(os.walk(file))
file_count = len(files)
# create empty list
dataframes_list = []
 
# append datasets to the list
for i in range(file_count):
    temp_df = pd.read_csv(file+files[i], encoding='utf-8', header=None)
    dataframes_list.append(temp_df)

df_W = pd.concat(dataframes_list)
df_L = pd.concat(dataframes_list)
print(len(df_W))
df_W.head()
df_L.head()

223087


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,61 62 64,3.0,0.0,18.0,7.0,0.0,0.0,1877-540-NULL-4-1-gi91-mu62,
1,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,62 65 62,3.0,0.0,18.0,9.0,0.0,0.0,1877-540-NULL-3-1-gi91-hg42,
2,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,63 62 56 61,3.0,1.0,23.0,12.0,0.0,0.0,1877-540-NULL-2-3-gi91-lh23,
3,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,63 63 65,3.0,0.0,18.0,11.0,0.0,0.0,1877-540-NULL-2-2-hg42-mu66,
4,1877-540,1,Wimbledon,wimbledon,/en/scores/archive/wimbledon/540/1877/results,1877.07.09,1877.0,7.0,9.0,1877.07.19,...,,65 56 64 61,3.0,1.0,23.0,16.0,0.0,0.0,1877-540-NULL-2-1-mu62-e994,


In [22]:
df.iloc[0].tolist()

['1877-540',
 1,
 'Wimbledon',
 'wimbledon',
 '/en/scores/archive/wimbledon/540/1877/results',
 '1877.07.09',
 1877.0,
 7.0,
 9.0,
 '1877.07.19',
 1877.0,
 7.0,
 19.0,
 'USD',
 0.0,
 nan,
 'Finals',
 1,
 1,
 'Spencer Gore',
 'gi91',
 'spencer-gore',
 'William Marshall',
 'mu62',
 'william-marshall',
 nan,
 nan,
 '61 62 64',
 3.0,
 0.0,
 18.0,
 7.0,
 0.0,
 0.0,
 '1877-540-NULL-4-1-gi91-mu62',
 nan]

In [89]:
# dropping un needed winner stats
df_W.drop(columns=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,23,24,25,26,28,29,30,31,32,33], inplace=True)
df_W.head()

Unnamed: 0,0,16,20,27,34,35
0,1877-540,Finals,gi91,61 62 64,1877-540-NULL-4-1-gi91-mu62,
1,1877-540,Semi-Finals,gi91,62 65 62,1877-540-NULL-3-1-gi91-hg42,
2,1877-540,Quarter-Finals,gi91,63 62 56 61,1877-540-NULL-2-3-gi91-lh23,
3,1877-540,Quarter-Finals,hg42,63 63 65,1877-540-NULL-2-2-hg42-mu66,
4,1877-540,Quarter-Finals,mu62,65 56 64 61,1877-540-NULL-2-1-mu62-e994,


In [90]:
# Dropping un neededd Loser stats
df_L.drop(columns = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,20,21,22,24,25,26,28,29,30,31,32,33,35], inplace=True)
df_L.head()

Unnamed: 0,0,16,23,27,34
0,1877-540,Finals,mu62,61 62 64,1877-540-NULL-4-1-gi91-mu62
1,1877-540,Semi-Finals,hg42,62 65 62,1877-540-NULL-3-1-gi91-hg42
2,1877-540,Quarter-Finals,lh23,63 62 56 61,1877-540-NULL-2-3-gi91-lh23
3,1877-540,Quarter-Finals,mu66,63 63 65,1877-540-NULL-2-2-hg42-mu66
4,1877-540,Quarter-Finals,e994,65 56 64 61,1877-540-NULL-2-1-mu62-e994


In [91]:
# reorder columns to match headers
df_W =df_W.reindex(columns = [34,0,20,16,27,5])
df_L=df_L.reindex(columns=[34,0,23,16,27,5])
df_W.head()
df_L.head()

Unnamed: 0,34,0,23,16,27,5
0,1877-540-NULL-4-1-gi91-mu62,1877-540,mu62,Finals,61 62 64,
1,1877-540-NULL-3-1-gi91-hg42,1877-540,hg42,Semi-Finals,62 65 62,
2,1877-540-NULL-2-3-gi91-lh23,1877-540,lh23,Quarter-Finals,63 62 56 61,
3,1877-540-NULL-2-2-hg42-mu66,1877-540,mu66,Quarter-Finals,63 63 65,
4,1877-540-NULL-2-1-mu62-e994,1877-540,e994,Quarter-Finals,65 56 64 61,


In [92]:
# setting winner&Loser headers
df_W_Header = pd.read_csv("../temp/Match_score_header_W.csv")
df_L_Header = pd.read_csv("../temp/Match_score_header_L.csv")
print(df_W_Header)

Empty DataFrame
Columns: [Match_id, Tourney_id, Player_id, Round, Score, Player_result]
Index: []


In [93]:
# connecting winner&loser headers
df_W.columns = df_W_Header.columns
df_L.columns = df_L_Header.columns
df_W.head()
df_L.head()

Unnamed: 0,Match_id,Tourney_id,Player_id,Round,Score,Player_result
0,1877-540-NULL-4-1-gi91-mu62,1877-540,mu62,Finals,61 62 64,
1,1877-540-NULL-3-1-gi91-hg42,1877-540,hg42,Semi-Finals,62 65 62,
2,1877-540-NULL-2-3-gi91-lh23,1877-540,lh23,Quarter-Finals,63 62 56 61,
3,1877-540-NULL-2-2-hg42-mu66,1877-540,mu66,Quarter-Finals,63 63 65,
4,1877-540-NULL-2-1-mu62-e994,1877-540,e994,Quarter-Finals,65 56 64 61,


In [94]:
# Setting player_results
df_W['Player_result'] = pd.Series('Won', index=df_L.index)
df_L['Player_result'] = pd.Series('Lost', index=df_L.index)
df_W.head()
df_L.head()

Unnamed: 0,Match_id,Tourney_id,Player_id,Round,Score,Player_result
0,1877-540-NULL-4-1-gi91-mu62,1877-540,mu62,Finals,61 62 64,Lost
1,1877-540-NULL-3-1-gi91-hg42,1877-540,hg42,Semi-Finals,62 65 62,Lost
2,1877-540-NULL-2-3-gi91-lh23,1877-540,lh23,Quarter-Finals,63 62 56 61,Lost
3,1877-540-NULL-2-2-hg42-mu66,1877-540,mu66,Quarter-Finals,63 63 65,Lost
4,1877-540-NULL-2-1-mu62-e994,1877-540,e994,Quarter-Finals,65 56 64 61,Lost


In [1]:
num = len(df)
all_char = []
for i in df['Player_id']:
    char = df['Player_id']
    all_char.append(char)
lengths = [len(s) for s in all_char]
print(len(all_char))
print(sum(lengths))
print(sum(lengths) / len(all_char))

NameError: name 'df' is not defined

### Match Stats Data

In [38]:
pd.options.display.max_columns = 59
# finds informations and puts it in a df
file = '../../atp-world-tour-tennis-data/csv/3_match_stats/'

# assign path
path, dirs, files = next(os.walk(file))
file_count = len(files)
# create empty list
dataframes_list = []
 
# append datasets to the list
for i in range(file_count):
    temp_df = pd.read_csv(file+files[i], encoding='utf-8', header=None)
    dataframes_list.append(temp_df)

df_W = pd.concat(dataframes_list)
df_L = pd.concat(dataframes_list)
print(len(df_W))
df_W.head()
df_L.head()

113898


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
0,1991-7308-MS001-5-1-k181-s351,adelaide,/en/scores/1991/7308/MS001/match-stats?isLive=...,01:29:00,89.0,k181,269.0,1.0,2.0,44.0,65.0,31.0,44.0,12.0,21.0,2.0,5.0,12.0,182.0,16.0,47.0,21.0,36.0,4.0,7.0,12.0,43.0,65.0,37.0,83.0,80.0,148.0,s351,238.0,12.0,5.0,47.0,83.0,31.0,47.0,15.0,36.0,3.0,7.0,12.0,158.0,13.0,44.0,9.0,21.0,3.0,5.0,12.0,46.0,83.0,22.0,65.0,68.0,148.0
1,1991-7308-MS003-4-2-s351-c243,adelaide,/en/scores/1991/7308/MS003/match-stats?isLive=...,01:44:00,104.0,s351,272.0,7.0,4.0,37.0,80.0,26.0,37.0,27.0,43.0,5.0,6.0,11.0,155.0,11.0,40.0,13.0,31.0,2.0,3.0,11.0,53.0,80.0,24.0,71.0,77.0,151.0,c243,268.0,2.0,2.0,40.0,71.0,29.0,40.0,18.0,31.0,1.0,3.0,11.0,93.0,11.0,37.0,16.0,43.0,1.0,6.0,11.0,47.0,71.0,27.0,80.0,74.0,151.0
2,1991-7308-MS002-4-1-k181-l206,adelaide,/en/scores/1991/7308/MS002/match-stats?isLive=...,01:20:00,80.0,k181,281.0,3.0,0.0,0.0,65.0,29.0,46.0,12.0,19.0,0.0,2.0,11.0,187.0,12.0,36.0,20.0,33.0,4.0,7.0,11.0,41.0,65.0,32.0,69.0,73.0,134.0,l206,222.0,1.0,0.0,36.0,69.0,24.0,36.0,13.0,33.0,3.0,7.0,11.0,192.0,17.0,46.0,7.0,19.0,2.0,2.0,11.0,37.0,69.0,24.0,65.0,61.0,134.0
3,1991-7308-MS007-3-4-c243-s367,adelaide,/en/scores/1991/7308/MS007/match-stats?isLive=...,01:20:00,80.0,c243,306.0,5.0,2.0,41.0,61.0,31.0,41.0,14.0,20.0,0.0,1.0,11.0,161.0,6.0,34.0,15.0,27.0,2.0,3.0,10.0,45.0,61.0,21.0,61.0,66.0,122.0,s367,258.0,1.0,5.0,34.0,61.0,28.0,34.0,12.0,27.0,1.0,3.0,10.0,163.0,10.0,41.0,6.0,20.0,1.0,1.0,11.0,40.0,61.0,16.0,61.0,56.0,122.0
4,1991-7308-MS006-3-3-s351-a031,adelaide,/en/scores/1991/7308/MS006/match-stats?isLive=...,01:30:00,90.0,s351,287.0,5.0,2.0,45.0,74.0,34.0,45.0,18.0,29.0,2.0,4.0,14.0,148.0,18.0,57.0,12.0,24.0,3.0,7.0,13.0,52.0,74.0,30.0,81.0,82.0,155.0,a031,264.0,2.0,2.0,57.0,81.0,39.0,57.0,12.0,24.0,4.0,7.0,13.0,126.0,11.0,45.0,11.0,29.0,2.0,4.0,14.0,51.0,81.0,22.0,74.0,73.0,155.0


In [39]:
df_W.drop(columns=[1,4,6,11,12,18,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58], inplace=True)
df_L.drop(columns=[1,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,38,39,45], inplace=True)
df_W[0] = pd.Series(df_W[0].astype(str).str.split('-',2).str[:2].str.join('-'))
df_W[0] = pd.Series(df_W[0].astype(str) + '-' + df_W[5].astype(str) + '-' + df_W[32].astype(str))
df_L[0] = pd.Series(df_L[0].astype(str).str.split('-',2).str[:2].str.join('-'))
df_L[0] = pd.Series(df_L[0].astype(str) + '-' + df_L[5].astype(str) + '-' + df_L[32].astype(str))


year_pattern = re.compile(r'(?:20(20|21|22|23))')
df_L[0] = df_L[0].apply(lambda x: x.split('-')[0] +'-' if year_pattern.search(str(x)) else x)
df_L[2] = df_L[2].apply(lambda x: x.split('/')[6] if year_pattern.search(str(x)) else '')
df_L.loc[df_L[0].str.contains('2022-'), 0] = df_L.loc[df_L[0].str.contains('2022-'), 0].astype(str) + df_L.loc[df_L[0].str.contains('2022-'), 2].astype(str) + '-' + df_L.loc[df_L[0].str.contains('2022-'), 5].astype(str) + '-' + df_L.loc[df_L[0].str.contains('2022-'), 32].astype(str)
df_L.loc[df_L[0].str.contains('2021-'), 0] = df_L.loc[df_L[0].str.contains('2021-'), 0].astype(str) + df_L.loc[df_L[0].str.contains('2021-'), 2].astype(str) + '-' + df_L.loc[df_L[0].str.contains('2021-'), 5].astype(str) + '-' + df_L.loc[df_L[0].str.contains('2021-'), 32].astype(str)
df_L.loc[df_L[0].str.contains('2020-'), 0] = df_L.loc[df_L[0].str.contains('2020-'), 0].astype(str) + df_L.loc[df_L[0].str.contains('2020-'), 2].astype(str) + '-' + df_L.loc[df_L[0].str.contains('2020-'), 5].astype(str) + '-' + df_L.loc[df_L[0].str.contains('2020-'), 32].astype(str)
df_W[0] = df_W[0].apply(lambda x: x.split('-')[0] +'-' if year_pattern.search(str(x)) else x)
df_W[2] = df_W[2].apply(lambda x: x.split('/')[6] if year_pattern.search(str(x)) else '')
df_W.loc[df_W[0].str.contains('2022-'), 0] = df_W.loc[df_W[0].str.contains('2022-'), 0].astype(str) + df_W.loc[df_W[0].str.contains('2022-'), 2].astype(str) + '-' + df_W.loc[df_L[0].str.contains('2022-'), 5].astype(str) + '-' + df_W.loc[df_W[0].str.contains('2022-'), 32].astype(str)
df_W.loc[df_W[0].str.contains('2021-'), 0] = df_W.loc[df_W[0].str.contains('2021-'), 0].astype(str) + df_W.loc[df_W[0].str.contains('2021-'), 2].astype(str) + '-' + df_W.loc[df_L[0].str.contains('2021-'), 5].astype(str) + '-' + df_W.loc[df_W[0].str.contains('2021-'), 32].astype(str)
df_W.loc[df_W[0].str.contains('2020-'), 0] = df_W.loc[df_W[0].str.contains('2020-'), 0].astype(str) + df_W.loc[df_W[0].str.contains('2020-'), 2].astype(str) + '-' + df_W.loc[df_L[0].str.contains('2020-'), 5].astype(str) + '-' + df_W.loc[df_W[0].str.contains('2020-'), 32].astype(str)
df_W.drop(columns=[32,2], inplace=True)
df_L.drop(columns=[5,2], inplace=True)
df_L

  df_W[0] = pd.Series(df_W[0].astype(str).str.split('-',2).str[:2].str.join('-'))
  df_L[0] = pd.Series(df_L[0].astype(str).str.split('-',2).str[:2].str.join('-'))


Unnamed: 0,0,3,32,34,35,36,37,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58
0,1991-7308-k181-s351,01:29:00,s351,12.0,5.0,47.0,83.0,15.0,36.0,3.0,7.0,12.0,13.0,44.0,9.0,21.0,3.0,5.0,12.0,46.0,83.0,22.0,65.0,68.0,148.0
1,1991-7308-s351-c243,01:44:00,c243,2.0,2.0,40.0,71.0,18.0,31.0,1.0,3.0,11.0,11.0,37.0,16.0,43.0,1.0,6.0,11.0,47.0,71.0,27.0,80.0,74.0,151.0
2,1991-7308-k181-l206,01:20:00,l206,1.0,0.0,36.0,69.0,13.0,33.0,3.0,7.0,11.0,17.0,46.0,7.0,19.0,2.0,2.0,11.0,37.0,69.0,24.0,65.0,61.0,134.0
3,1991-7308-c243-s367,01:20:00,s367,1.0,5.0,34.0,61.0,12.0,27.0,1.0,3.0,10.0,10.0,41.0,6.0,20.0,1.0,1.0,11.0,40.0,61.0,16.0,61.0,56.0,122.0
4,1991-7308-s351-a031,01:30:00,a031,2.0,2.0,57.0,81.0,12.0,24.0,4.0,7.0,13.0,11.0,45.0,11.0,29.0,2.0,4.0,14.0,51.0,81.0,22.0,74.0,73.0,155.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3929,2022-605-rh16-fb98,02:12:00,fb98,,,,,,,,,,,,,,,,,,,,,,
3930,2022-605-d643-te51,01:38:00,te51,,,,,,,,,,,,,,,,,,,,,,
3931,2022-605-re44-mm58,02:31:00,mm58,,,,,,,,,,,,,,,,,,,,,,
3932,2022-605-fb98-n409,01:37:00,n409,,,,,,,,,,,,,,,,,,,,,,


In [40]:
df_W =df_W.reindex(columns = [0,5,3,7,8,9,10,13,14,15,16,26,27,19,20,21,22,23,24,25,17,28,29,30,31])
df_L =df_L.reindex(columns=[0,32,3,34,35,36,37,40,41,42,43,53,54,46,47,48,49,50,51,52,44,55,56,57,58])
df_W_Header = pd.read_csv('../temp/Match_stats_header.csv')
df_W.columns = df_W_Header.columns
df_L.columns = df_W_Header.columns
df = pd.concat([df_W, df_L])
df.dropna(axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [49]:
df.iloc[104000:104010]


Unnamed: 0,Match_id,Player_id,Duration,Aces,Double_faults,First_serves_in,First_serves_total,Second_serve_points_won,Second_serve_points_total,Break_points_saved,Break_points_serve_total,Service_points_won,Service_points_total,First_serves_return_won,First_serves_return_total,Second_serves_return_won,Second_serves_return_total,Break_points_converted,Break_point_return_total,Service_games_played,Returns_games_played,Return_points_won,Return_points_total,Total_points_won,Total_points_total
104000,2019-605-z355-n409,z355,01:24:00,11.0,2.0,34.0,49.0,7.0,15.0,0.0,0.0,37.0,49.0,14.0,37.0,13.0,23.0,3.0,4.0,9.0,9.0,27.0,60.0,64.0,109.0
104001,2019-605-te51-mm58,te51,01:42:00,5.0,0.0,44.0,73.0,12.0,29.0,0.0,0.0,51.0,73.0,15.0,53.0,11.0,24.0,1.0,4.0,11.0,11.0,26.0,77.0,77.0,150.0
104002,2019-605-d643-bk40,d643,01:04:00,4.0,1.0,24.0,40.0,12.0,16.0,0.0,1.0,30.0,40.0,18.0,34.0,10.0,14.0,5.0,7.0,7.0,8.0,28.0,48.0,58.0,88.0
104003,2019-605-tb69-f324,tb69,01:40:00,1.0,1.0,51.0,81.0,19.0,30.0,3.0,4.0,52.0,81.0,14.0,47.0,11.0,21.0,3.0,5.0,12.0,12.0,25.0,68.0,77.0,149.0
104004,2020-451-re44-mw02,re44,01:27:00,3.0,0.0,44.0,58.0,6.0,14.0,1.0,2.0,40.0,58.0,22.0,48.0,7.0,14.0,3.0,6.0,10.0,10.0,29.0,62.0,69.0,120.0
104005,2020-451-mw02-w367,mw02,02:01:00,0.0,0.0,56.0,74.0,12.0,18.0,0.0,1.0,57.0,74.0,16.0,61.0,15.0,32.0,2.0,11.0,15.0,15.0,31.0,93.0,88.0,167.0
104006,2020-451-re44-ki95,re44,00:53:00,10.0,1.0,26.0,43.0,10.0,17.0,0.0,0.0,34.0,43.0,16.0,30.0,10.0,20.0,4.0,6.0,8.0,8.0,26.0,50.0,60.0,93.0
104007,2020-451-w367-bh09,w367,01:24:00,5.0,0.0,29.0,58.0,16.0,29.0,2.0,2.0,43.0,58.0,3.0,25.0,14.0,24.0,2.0,4.0,9.0,10.0,17.0,49.0,60.0,107.0
104008,2020-451-re44-h996,re44,01:41:00,4.0,1.0,40.0,72.0,15.0,32.0,5.0,6.0,44.0,72.0,12.0,40.0,24.0,34.0,4.0,8.0,10.0,9.0,36.0,74.0,80.0,146.0
104009,2020-451-ki95-f724,ki95,00:56:00,2.0,0.0,30.0,39.0,6.0,9.0,3.0,4.0,28.0,39.0,18.0,32.0,13.0,16.0,6.0,12.0,7.0,7.0,31.0,48.0,59.0,87.0


## Working code