# Thomas Partey Statistical Profile

In [36]:
# Goals of this project

# Data Cleaning Part
    #1. Create and clean a database of the CDMs from the top 5 leagues.
    #2. The database should include the defensive, passing, and misc stats.
    #3. Also obtain the previous year's match-by-match data for Thomas Partey.
    #4. Final data should be as follows:
        #a. cdm_database.csv
        #b. thomas_partey.csv

# Things to plot/find/visualize
    #1. Scatter Plots:
        #a. Weights vs Total of All CDMs
        #b. Pure Passing Scatter (Accuracy vs Location of Passes)
        #c. Pure Defensive Scatter
        
    #2. Bar Plots:
        #a. Extremely Basic Stats
        #b. Year by Year Improvement of Partey (Defensive + Passing Stats)
        #c. Consistency Evaluation (Box/Violin Plots)
        
    #3. Line Plots:
        #a. Consistency Evaluation (Alternate)

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

%matplotlib notebook

## Data Cleaning

In [38]:
# Shared configuration for the data-cleaning cells below.

# Folder names handed to get_stats(); one folder per category of stat files.
DEF_STATS = 'def_stats'
PASS_STATS = 'pass_stats'
MISC_STATS = 'misc_stats'

# League identifiers (not referenced by the cells visible in this section).
LIGUE1 = 'france'
BUNDESLIGA = 'germany'
EPL = 'england'
LALIGA = 'spain'
SERIEA = 'italy'

# Base name of the CDM database: '<name>.txt' is read, '<name>.csv' is written.
CDM_DB = 'cdm_database'

# Column labels, grouped by stat category. create_dataframe() concatenates the
# defensive, passing and misc frames in that order and then assigns COLUMNS,
# so the order here must match the column order of the combined frame.
_DEF_COLUMNS = [
    'Goals',
    'Dribbled Past',
    'Tackles',
    'Errors Leading to Shots',
    'Interceptions',
    'Blocked Shots',
]

_PASS_COLUMNS = [
    'Accurate Pass %',
    'Accurate Passes in Own Half',
    'Accurate Passes in Opposition Half',
    'Accurate Passes in Final Third',
    'Key Passes',
    'Accurate Long Ball %',
]

_MISC_COLUMNS = [
    'Ground Duels %Won',
    'Aerial Duels %Won',
    'Minutes Played',
    'Was Fouled',
    'Dispossessed',
    'Matches Started',
]

COLUMNS = _DEF_COLUMNS + _PASS_COLUMNS + _MISC_COLUMNS

In [39]:
#Functions that gather the statistics of all CDMs into one dataframe

def get_stats(stat_name):
    """Load every stat file in a folder as a list of DataFrames.

    Parameters
    ----------
    stat_name : str
        Path of the folder that holds the stat files.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, read with ``pd.read_table`` (tab-separated by
        default) without a header row, so columns are labelled 0, 1, 2, ...
        Frames are ordered by file name.

    Notes
    -----
    ``os.listdir`` returns entries in arbitrary order, while
    ``create_dataframe`` combines the i-th frame of each folder. Sorting the
    file names makes that pairing deterministic across folders and machines.
    Hidden entries and sub-directories (for example ``.ipynb_checkpoints``)
    are skipped because they are not stat files.
    """
    stat_files = sorted(
        name
        for name in os.listdir(stat_name)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(stat_name, name))
    )
    return [
        pd.read_table(os.path.join(stat_name, name), encoding="ISO-8859-1", header=None)
        for name in stat_files
    ]

def create_dataframe(defensive, passing, misc):
    """Combine per-file defensive, passing and misc frames into one stats table.

    Parameters
    ----------
    defensive, passing, misc : list of pandas.DataFrame
        Frames as returned by ``get_stats``. The i-th frame of each list is
        combined with the i-th frame of the other two. Columns 0, 1 and 2 of
        every frame are used as the row key (column 2 becomes ``'Name'``);
        the remaining columns of the three frames together must number
        ``len(COLUMNS)``.

    Returns
    -------
    pandas.DataFrame
        All files stacked vertically, indexed by ``'Name'``, with the columns
        labelled by ``COLUMNS``.

    Raises
    ------
    ValueError
        If the three lists differ in length (previously this surfaced as an
        ``IndexError`` or silently ignored the surplus frames), or if they are
        empty (raised by ``pd.concat``).
    """
    if not (len(defensive) == len(passing) == len(misc)):
        raise ValueError(
            'defensive, passing and misc must contain the same number of frames '
            '(got %d, %d and %d)' % (len(defensive), len(passing), len(misc))
        )

    key_columns = [0, 1, 2]
    per_file_stats = []
    for def_raw, pas_raw, mis_raw in zip(defensive, passing, misc):
        # Trim the three frames to the shortest one before combining them.
        n = min(len(def_raw), len(pas_raw), len(mis_raw))
        keyed = [
            frame.iloc[:n].set_index(key_columns)
            for frame in (def_raw, pas_raw, mis_raw)
        ]

        # Side-by-side concat aligns rows on the (0, 1, 2) key.
        file_stats = pd.concat(keyed, axis='columns')
        file_stats.columns = COLUMNS
        per_file_stats.append(file_stats)

    combined = pd.concat(per_file_stats)
    # Keep only the third key level and expose it as the 'Name' index.
    return combined.droplevel([0, 1]).rename_axis('Name')

def fifa_df(positions=None):
    """Load the CDM database text file as a de-duplicated player table.

    Reads ``CDM_DB + '.txt'`` with ``pd.read_table`` (tab-separated by
    default, no header row, ISO-8859-1 encoded). Columns 2, 3, 4 and 6 are
    dropped and the four remaining columns are labelled ``Name``,
    ``Nationality``, ``Age`` and ``Club``, so the file is expected to have
    exactly eight columns.

    Parameters
    ----------
    positions : iterable of str, optional
        When given, only rows whose column 4 value is in ``positions`` are
        kept; the filter runs before that column is dropped. The default
        ``None`` keeps every row.
        NOTE(review): column 4 presumably holds the position string, e.g.
        ``['CM', 'CDM', 'CMCDM', 'CDMCM']`` - confirm against the data file.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (Name, Nationality, Age, Club) combination. The
        row labels of the source file are kept, so the index can have gaps
        where duplicates were removed.
    """
    column_names = ['Name', 'Nationality', 'Age', 'Club']

    players = pd.read_table(CDM_DB + '.txt', encoding="ISO-8859-1", header=None)
    if positions is not None:
        players = players[players[4].isin(positions)]

    players = players.drop(columns=[2, 3, 4, 6])
    players.columns = column_names
    return players.drop_duplicates()

In [40]:
# Load the raw frames, one list per stat category (defensive, passing, misc).
def_stats, pass_stats, misc_stats = (
    get_stats(folder) for folder in (DEF_STATS, PASS_STATS, MISC_STATS)
)

# Merge the three categories into one table indexed by player name,
# and load the player list.
all_data = create_dataframe(def_stats, pass_stats, misc_stats)
cdm_df = fifa_df()

# Combine the player list with the midfielder stats; the inner join keeps
# only the names present in both tables.
cdm_stats = cdm_df.join(all_data, how = 'inner', on = 'Name')
cdm_stats.to_csv(CDM_DB + '.csv')

# CDM database finalized - preview the first rows.
cdm_stats.head()

Unnamed: 0,Name,Nationality,Age,Club,Goals,Dribbled Past,Tackles,Errors Leading to Shots,Interceptions,Blocked Shots,...,Accurate Passes in Opposition Half,Accurate Passes in Final Third,Key Passes,Accurate Long Ball %,Ground Duels %Won,Aerial Duels %Won,Minutes Played,Was Fouled,Dispossessed,Matches Started
0,N'Golo Kanté,France,28,Chelsea,4.0,45.0,74.0,1.0,44.0,6.0,...,1164.0,653.0,46.0,73.0,43.0,40.0,3095.0,13.0,54.0,36.0
1,Sergio Busquets,Spain,30,FC Barcelona,0.0,48.0,90.0,1.0,54.0,8.0,...,1173.0,468.0,15.0,79.0,57.0,50.0,2719.0,54.0,25.0,30.0
2,Casemiro,Brazil,27,Real Madrid,3.0,56.0,86.0,2.0,41.0,12.0,...,669.0,223.0,14.0,75.0,55.0,62.0,2316.0,49.0,17.0,27.0
4,Ivan Rakitic,Croatia,31,FC Barcelona,3.0,25.0,30.0,0.0,51.0,8.0,...,1365.0,621.0,17.0,77.0,55.0,65.0,2643.0,53.0,23.0,29.0
5,Fernandinho,Brazil,34,Manchester City,1.0,21.0,57.0,0.0,41.0,4.0,...,1090.0,465.0,23.0,59.0,56.0,54.0,2381.0,26.0,13.0,27.0


In [41]:
# Players ranked by aerial-duel win percentage, highest first.
cdm_stats.sort_values('Aerial Duels %Won', ascending=False)

Unnamed: 0,Name,Nationality,Age,Club,Goals,Dribbled Past,Tackles,Errors Leading to Shots,Interceptions,Blocked Shots,...,Accurate Passes in Opposition Half,Accurate Passes in Final Third,Key Passes,Accurate Long Ball %,Ground Duels %Won,Aerial Duels %Won,Minutes Played,Was Fouled,Dispossessed,Matches Started
117,Jack Wilshere,England,27,West Ham,0.0,4.0,7.0,0.0,7.0,1.0,...,115.0,46.0,5.0,79.0,64.0,100.0,395.0,12.0,7.0,4.0
517,Steven Davis,Northern Ireland,34,Rangers,0.0,0.0,2.0,0.0,1.0,0.0,...,38.0,17.0,3.0,80.0,50.0,100.0,97.0,0.0,0.0,1.0
24,Corentin Tolisso,France,24,FC Bayern,1.0,2.0,0.0,0.0,0.0,1.0,...,23.0,9.0,2.0,67.0,33.0,100.0,52.0,1.0,0.0,2.0
476,Federico Viviani,Italy,27,Frosinone,0.0,5.0,6.0,0.0,8.0,3.0,...,88.0,34.0,3.0,60.0,50.0,80.0,341.0,5.0,4.0,5.0
346,Sanjin Prcic,Bosnia Herzegovina,25,RC Strasbourg,0.0,6.0,18.0,0.0,11.0,7.0,...,131.0,56.0,1.0,66.0,63.0,76.0,533.0,10.0,3.0,6.0
448,Philip Billing,Denmark,22,Huddersfield,2.0,24.0,74.0,0.0,53.0,16.0,...,454.0,218.0,23.0,59.0,54.0,75.0,2190.0,30.0,29.0,25.0
511,Dennis Aogo,Germany,32,VfB Stuttgart,0.0,9.0,15.0,0.0,8.0,1.0,...,186.0,56.0,15.0,55.0,50.0,73.0,1027.0,6.0,7.0,15.0
38,William Carvalho,Portugal,27,Real Betis,0.0,13.0,45.0,0.0,12.0,0.0,...,963.0,390.0,18.0,74.0,59.0,70.0,2386.0,35.0,35.0,26.0
25,Adrien Rabiot,France,24,Paris,2.0,3.0,35.0,0.0,10.0,2.0,...,582.0,215.0,9.0,82.0,60.0,69.0,1064.0,8.0,9.0,12.0
465,Lucas Torró,Spain,24,Frankfurt,0.0,6.0,14.0,0.0,14.0,1.0,...,99.0,34.0,2.0,44.0,55.0,68.0,560.0,7.0,5.0,8.0


In [60]:
# Matches started per player. The column is named explicitly instead of via
# the positional COLUMNS[17], so the selection cannot silently change if
# COLUMNS is reordered or extended.
cdm_stats[['Name', 'Club', 'Matches Started']]

Unnamed: 0,Name,Club,Matches Started
0,N'Golo Kanté,Chelsea,36.0
1,Sergio Busquets,FC Barcelona,30.0
2,Casemiro,Real Madrid,27.0
4,Ivan Rakitic,FC Barcelona,29.0
5,Fernandinho,Manchester City,27.0
7,Marco Verratti,Paris,23.0
8,Miralem Pjanic,Juventus,27.0
9,Fabinho,Liverpool,21.0
12,Arturo Vidal,FC Barcelona,22.0
13,Axel Witsel,Dortmund,33.0
