In [63]:
# Importing Dependencies
import pandas as pd
import regex as re
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime


In [64]:
# Converting CSV files into Pandas dataframes
df_player = pd.read_csv('Resources/players.csv')
df_player_att = pd.read_csv('Resources/players_attributes.csv')
df_fifa_players = pd.read_csv('Resources/fifa_players.csv')

In [65]:
# Defining function to extract the last name from dataframes to have a common element for merging. 
def extract_last_name(name):
    # Use regex to find the last word in the string, which is assumed to be the last name
    last_name = re.search(r'(\w+)$', name)
    
    # Return the last name if a match is found
    if last_name:
        return last_name.group(1)
    else:
        return None

In [66]:
df_fifa_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17954 entries, 0 to 17953
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   name                           17954 non-null  object 
 1   full_name                      17954 non-null  object 
 2   birth_date                     17954 non-null  object 
 3   age                            17954 non-null  int64  
 4   height_cm                      17954 non-null  float64
 5   weight_kgs                     17954 non-null  float64
 6   positions                      17954 non-null  object 
 7   nationality                    17954 non-null  object 
 8   overall_rating                 17954 non-null  int64  
 9   potential                      17954 non-null  int64  
 10  value_euro                     17699 non-null  float64
 11  wage_euro                      17708 non-null  float64
 12  preferred_foot                 17954 non-null 

In [67]:
# This Df is a complete set with progressional growth over the years of each player. 
df_player_comp = pd.merge(df_player,df_player_att, how='left', on='player_api_id') 

# Getting distinct values based on latest date of recorded statistics of each player. 
df_player_comp['date'] = pd.to_datetime(df_player_comp['date'])
latest_indices = df_player_comp.groupby('player_name')['date'].idxmax()
latest_value = df_player_comp.loc[latest_indices]

latest_value['birthday'] = pd.to_datetime(latest_value['birthday'])
# latest_value['birthday'] = latest_value['birthday'].dt.date
latest_value['birthday'] = latest_value['birthday'].dt.strftime('%m/%d/%Y')
                                          

In [68]:
latest_value.head()

Unnamed: 0,id_x,player_api_id,player_name,player_fifa_api_id_x,birthday,height,weight,id_y,player_fifa_api_id_y,date,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,505942,Aaron Appindangoye,218353,02/29/1992,182.88,187,1,218353,2016-02-18,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
5,2,155782,Aaron Cresswell,189615,12/15/1989,170.18,146,6,189615,2016-04-21,...,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
38,3,162549,Aaron Doran,186170,05/13/1991,170.18,163,39,186170,2016-01-07,...,68.0,61.0,23.0,22.0,24.0,16.0,11.0,12.0,9.0,13.0
64,4,30572,Aaron Galindo,140161,05/08/1982,182.88,198,65,140161,2016-04-21,...,54.0,37.0,72.0,71.0,68.0,15.0,12.0,13.0,12.0,11.0
87,5,23780,Aaron Hughes,17725,11/08/1979,182.88,154,88,17725,2015-12-24,...,41.0,45.0,75.0,73.0,71.0,8.0,6.0,16.0,12.0,11.0


In [69]:
# Using function to extract last name using Regex. This way we have a common element in both datasets and can use that to merge. 
latest_value['last_name'] = latest_value['player_name'].apply(extract_last_name)

In [70]:
# Using function to extract last name using Regex. This way we have a common element in both datasets and can use that to merge. 
df_fifa_players['last_name'] = df_fifa_players['name'].apply(extract_last_name)

In [71]:
# Renaming Column for consistency
df_fifa_players.rename(columns= {'birth_date' : 'birthday'}, inplace=True)

In [72]:
df_fifa_players.head()

Unnamed: 0,name,full_name,birthday,age,height_cm,weight_kgs,positions,nationality,overall_rating,potential,...,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,last_name
0,L. Messi,Lionel Andrés Messi Cuccittini,6/24/1987,31,170.18,72.1,"CF,RW,ST",Argentina,94,94,...,48,22,94,94,75,96,33,28,26,Messi
1,C. Eriksen,Christian Dannemann Eriksen,2/14/1992,27,154.94,76.2,"CAM,RM,CM",Denmark,88,89,...,46,56,84,91,67,88,59,57,22,Eriksen
2,P. Pogba,Paul Pogba,3/15/1993,25,190.5,83.9,"CM,CAM",France,88,91,...,78,64,82,88,82,87,63,67,67,Pogba
3,L. Insigne,Lorenzo Insigne,6/4/1991,27,162.56,59.0,"LW,ST",Italy,88,88,...,34,26,83,87,61,83,51,24,22,Insigne
4,K. Koulibaly,Kalidou Koulibaly,6/20/1991,27,187.96,88.9,CB,Senegal,88,91,...,87,88,24,49,33,80,91,88,87,Koulibaly


In [73]:
# combining dataframes to cummalate all relevant columns 
df_combined = pd.merge(latest_value, df_fifa_players, how= 'inner', on=['last_name','birthday'] )
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Data columns (total 99 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   id_x                           468 non-null    int64         
 1   player_api_id                  468 non-null    int64         
 2   player_name                    468 non-null    object        
 3   player_fifa_api_id_x           468 non-null    int64         
 4   birthday                       468 non-null    object        
 5   height                         468 non-null    float64       
 6   weight                         468 non-null    int64         
 7   id_y                           468 non-null    int64         
 8   player_fifa_api_id_y           468 non-null    int64         
 9   date                           468 non-null    datetime64[ns]
 10  overall_rating_x               468 non-null    float64       
 11  potential_x        

In [74]:
# Dropped duplicate values from dataframe
df_combined.drop_duplicates(subset='player_name', inplace=True)


In [75]:
df_combined.columns

Index(['id_x', 'player_api_id', 'player_name', 'player_fifa_api_id_x',
       'birthday', 'height', 'weight', 'id_y', 'player_fifa_api_id_y', 'date',
       'overall_rating_x', 'potential_x', 'preferred_foot_x',
       'attacking_work_rate', 'defensive_work_rate', 'crossing_x',
       'finishing_x', 'heading_accuracy_x', 'short_passing_x', 'volleys_x',
       'dribbling_x', 'curve_x', 'free_kick_accuracy', 'long_passing_x',
       'ball_control_x', 'acceleration_x', 'sprint_speed_x', 'agility_x',
       'reactions_x', 'balance_x', 'shot_power_x', 'jumping_x', 'stamina_x',
       'strength_x', 'long_shots_x', 'aggression_x', 'interceptions_x',
       'positioning_x', 'vision_x', 'penalties_x', 'marking_x',
       'standing_tackle_x', 'sliding_tackle_x', 'gk_diving', 'gk_handling',
       'gk_kicking', 'gk_positioning', 'gk_reflexes', 'last_name', 'name',
       'full_name', 'age', 'height_cm', 'weight_kgs', 'positions',
       'nationality', 'overall_rating_y', 'potential_y', 'value_eur

In [76]:
# Dropping Duplicate and irrelevant columns
df_combined.drop(columns=['player_fifa_api_id_x','weight', 'id_y', 'player_fifa_api_id_y','date','attacking_work_rate', 'defensive_work_rate','last_name', 'name',
       'full_name','height_cm','overall_rating_y', 'potential_y','preferred_foot_y',
       'international_reputation(1-5)', 'weak_foot(1-5)', 'skill_moves(1-5)',
       'body_type', 'release_clause_euro', 'national_team', 'national_rating',
       'national_team_position', 'national_jersey_number', 'crossing_y',
       'finishing_y', 'heading_accuracy_y', 'short_passing_y', 'volleys_y',
       'dribbling_y', 'curve_y', 'freekick_accuracy', 'long_passing_y',
       'ball_control_y', 'acceleration_y', 'sprint_speed_y', 'agility_y',
       'reactions_y', 'balance_y', 'shot_power_y', 'jumping_y', 'stamina_y',
       'strength_y', 'long_shots_y', 'aggression_y', 'interceptions_y',
       'positioning_y', 'vision_y', 'penalties_y', 'composure', 'marking_y',
       'standing_tackle_y', 'sliding_tackle_y','gk_diving',
       'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'], inplace=True)

In [77]:
# Removed Suffixes ('_x' and '_y') created by the merges 
df_combined.rename(columns=lambda x: x.replace('_x', '').replace('_y', ''), inplace=True)
df_combined.columns

Index(['id', 'player_api_id', 'player_name', 'birthday', 'height',
       'overall_rating', 'potential', 'preferred_foot', 'crossing',
       'finishing', 'heading_accuracy', 'short_passing', 'volleys',
       'dribbling', 'curve', 'free_kick_accuracy', 'long_passing',
       'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions',
       'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
       'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
       'marking', 'standing_tackle', 'sliding_tackle', 'age', 'weight_kgs',
       'positions', 'nationality', 'value_euro', 'wage_euro'],
      dtype='object')

In [78]:
# Removed Null values within the dataframe.
df_cleaned = df_combined.dropna(subset=['wage_euro','value_euro','volleys','curve','agility','balance','jumping'])


In [79]:
# final check for null values
df_cleaned.isnull().sum()

id                    0
player_api_id         0
player_name           0
birthday              0
height                0
overall_rating        0
potential             0
preferred_foot        0
crossing              0
finishing             0
heading_accuracy      0
short_passing         0
volleys               0
dribbling             0
curve                 0
free_kick_accuracy    0
long_passing          0
ball_control          0
acceleration          0
sprint_speed          0
agility               0
reactions             0
balance               0
shot_power            0
jumping               0
stamina               0
strength              0
long_shots            0
aggression            0
interceptions         0
positioning           0
vision                0
penalties             0
marking               0
standing_tackle       0
sliding_tackle        0
age                   0
weight_kgs            0
positions             0
nationality           0
value_euro            0
wage_euro       

In [80]:
# Creating Dataframe with player info. 
df_player_info = df_cleaned[['id','player_name','birthday','height','weight_kgs',
       'overall_rating','potential','nationality','value_euro','wage_euro']]

# Creating age column as per birthday. 
current_date = pd.Timestamp(datetime.now())
df_player_info['birthday'] = pd.to_datetime(df_player_info['birthday'])
df_player_info['age'] = current_date.year - df_player_info['birthday'].dt.year

# Filtering column to have players most preferred position. 
df_player_info['position'] = df_cleaned['positions'].str.extract(r'^([^,]+)')
df_player_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 462 entries, 0 to 467
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              462 non-null    int64         
 1   player_name     462 non-null    object        
 2   birthday        462 non-null    datetime64[ns]
 3   height          462 non-null    float64       
 4   weight_kgs      462 non-null    float64       
 5   overall_rating  462 non-null    float64       
 6   potential       462 non-null    float64       
 7   nationality     462 non-null    object        
 8   value_euro      462 non-null    float64       
 9   wage_euro       462 non-null    float64       
 10  age             462 non-null    int32         
 11  position        462 non-null    object        
dtypes: datetime64[ns](1), float64(6), int32(1), int64(1), object(3)
memory usage: 45.1+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_info['birthday'] = pd.to_datetime(df_player_info['birthday'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_info['age'] = current_date.year - df_player_info['birthday'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_info['position'] = df_cleaned['positions'].s

In [81]:
# Converting Birthday Colum to datetime format. 
df_cleaned['birthday'] = df_cleaned['birthday'].astype('datetime64[ns]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['birthday'] = df_cleaned['birthday'].astype('datetime64[ns]')


In [82]:
# Filtering Dataframe to contain information of Defensive players
df_defensive = df_cleaned[df_cleaned['positions'].str.contains('LB|RB|CB|LWB|RWB')]

# Filtering Dataframe to keep columns relevant to Defenders
df_defensive = df_defensive[['id','aggression', 'interceptions', 
                             'positioning','marking','long_shots', 'standing_tackle','short_passing','long_passing', 'sliding_tackle']]
df_defensive.head()

Unnamed: 0,id,aggression,interceptions,positioning,marking,long_shots,standing_tackle,short_passing,long_passing,sliding_tackle
0,2,68.0,67.0,60.0,76.0,62.0,75.0,71.0,68.0,78.0
1,10,73.0,67.0,65.0,67.0,61.0,68.0,70.0,53.0,65.0
4,58,80.0,65.0,30.0,61.0,39.0,63.0,41.0,43.0,57.0
5,97,62.0,77.0,57.0,60.0,33.0,64.0,59.0,59.0,64.0
8,148,82.0,81.0,24.0,81.0,58.0,84.0,58.0,64.0,80.0


In [83]:
# Filter Dataframe to contain information of Attacking Players
df_attackers = df_cleaned[df_cleaned['positions'].str.contains('CM|CAM|RM|LM|ST|CF|\bLW\b|\bRW\b')]

# Filtering Dataframe to keep columns relevant to Attackers
df_attackers = df_attackers[['id','crossing','finishing', 'heading_accuracy', 'short_passing', 'volleys',
       'dribbling', 'curve', 'free_kick_accuracy', 'long_passing',
       'ball_control','shot_power', 'long_shots']]
df_attackers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 2 to 467
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  263 non-null    int64  
 1   crossing            263 non-null    float64
 2   finishing           263 non-null    float64
 3   heading_accuracy    263 non-null    float64
 4   short_passing       263 non-null    float64
 5   volleys             263 non-null    float64
 6   dribbling           263 non-null    float64
 7   curve               263 non-null    float64
 8   free_kick_accuracy  263 non-null    float64
 9   long_passing        263 non-null    float64
 10  ball_control        263 non-null    float64
 11  shot_power          263 non-null    float64
 12  long_shots          263 non-null    float64
dtypes: float64(12), int64(1)
memory usage: 28.8 KB


In [84]:
#Filtering Data to contain information of physical attributes
df_physical = df_cleaned[['id','reactions','balance','jumping', 'stamina', 'strength','aggression']]

df_physical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 462 entries, 0 to 467
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          462 non-null    int64  
 1   reactions   462 non-null    float64
 2   balance     462 non-null    float64
 3   jumping     462 non-null    float64
 4   stamina     462 non-null    float64
 5   strength    462 non-null    float64
 6   aggression  462 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 28.9 KB


In [85]:
# Filter the DataFrame to contain data for goal keepers only
df_gk = df_cleaned[df_cleaned['positions'].str.contains('GK')]

# Filtering Dataframe to keep columns relevant to Goalkeepers
df_gk = df_gk[['id', 'agility','positioning', 'reactions', 'jumping', 'strength',
       'balance','stamina']]
df_gk.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29 entries, 22 to 458
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           29 non-null     int64  
 1   agility      29 non-null     float64
 2   positioning  29 non-null     float64
 3   reactions    29 non-null     float64
 4   jumping      29 non-null     float64
 5   strength     29 non-null     float64
 6   balance      29 non-null     float64
 7   stamina      29 non-null     float64
dtypes: float64(7), int64(1)
memory usage: 2.0 KB


In [86]:
column_toplot = df_player_info[['age', 'overall_rating', 'potential','value_euro' ,'wage_euro']]

sns.set_style("whitegrid")
sns.set_context("talk")
palette = sns.color_palette("coolwarm", as_cmap=True)
sns.pairplot(data=column_toplot, 
             hue="value_euro",
             kind='scatter',  
             diag_kind='auto',  
             height=2, 
             aspect=1.2,
            )
plt.show()

  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)


In [None]:
#exporting Dataframes to Csv
df_cleaned.to_csv('Resources/all_player_data.csv', index=False)
df_player_info.to_csv('Resources/player_info.csv', index=False)
df_gk.to_csv('Resources/goalkeeper_data.csv', index=False)
df_attackers.to_csv('Resources/attackers_data.csv', index=False)
df_defensive.to_csv('Resources/defensive_data.csv', index=False)
df_physical.to_csv('Resources/physical_data.csv', index=False)
