### Data Merging

In [11]:
import pandas as pd

# Root folder of the raw player/team CSV exports. Every table lives in
# "<root>\<table_name>\<table_name>.csv". All backslashes are escaped: an
# unescaped "\h" or "\p" is an invalid escape sequence (SyntaxWarning on
# Python 3.12+, planned to become an error).
PLAYER_DATA_DIR = "C:\\Users\\hp\\Desktop\\player dataset\\player"

# Sentiment / tweets data
TWEETS_CSV = "C:\\Users\\hp\\Desktop\\Info Datasets\\tweets_premier_league_footballers_cleaned.csv"


def load_player_table(table_name, **read_csv_kwargs):
    """Read one raw table from PLAYER_DATA_DIR into a DataFrame.

    Parameters
    ----------
    table_name : str
        Name shared by the table's folder and its CSV file.
    **read_csv_kwargs
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = f"{PLAYER_DATA_DIR}\\{table_name}\\{table_name}.csv"
    return pd.read_csv(csv_path, **read_csv_kwargs)


# Player-level tables
injuries_df = load_player_table("player_injuries")
latest_market_value_df = load_player_table("player_latest_market_value")
market_value_df = load_player_table("player_market_value")
national_perf_df = load_player_table("player_national_performances")
performances_df = load_player_table("player_performances")
profiles_df = load_player_table("player_profiles", low_memory=False)
teammates_df = load_player_table("player_teammates_played_with")

# Team-level and transfer tables
team_children_df = load_player_table("team_children")
team_comp_season_df = load_player_table("team_competitions_seasons")
team_details_df = load_player_table("team_details")
transfer_history_df = load_player_table("transfer_history")

tweets_df = pd.read_csv(TWEETS_CSV, encoding='latin1')


In [12]:
# Give the join key one nullable-integer dtype across every table that has it.
# Values that cannot be parsed as numbers become <NA> (errors='coerce').
frames_with_player_key = (
    profiles_df,
    latest_market_value_df,
    market_value_df,
    performances_df,
    national_perf_df,
    injuries_df,
    tweets_df,
)
for frame in frames_with_player_key:
    if 'player_id' not in frame.columns:
        continue
    numeric_ids = pd.to_numeric(frame['player_id'], errors='coerce')
    frame['player_id'] = numeric_ids.astype('Int64')

In [13]:
# Start the master table from a copy of the profiles table; the following
# cells merge the other tables into merged_df one at a time.
merged_df = profiles_df.copy()
print(f"Initial profiles_df shape: {merged_df.shape}")

Initial profiles_df shape: (92671, 34)


In [14]:
# Lookup name used later to match tweets: keep the text before the first '(',
# trim surrounding whitespace and upper-case it.
name_before_paren = profiles_df['player_name'].str.split('(', n=1).str[0]
profiles_df['clean_player_name'] = name_before_paren.str.strip().str.upper()

In [15]:
# Attach the latest market value to each profile. player_name is dropped from
# the right-hand table first so it is not duplicated with a suffix. The inner
# join keeps only players present in both tables (the recorded shapes go from
# 92671 to 69441 rows).
latest_value_cols = latest_market_value_df.drop(columns=['player_name'], errors='ignore')
merged_df = pd.merge(merged_df, latest_value_cols, on='player_id', how='inner')

In [16]:
# (rows, columns) of merged_df after joining the latest market values.
merged_df.shape

(69441, 36)

In [17]:
# Attach the market value table, again without its player_name column.
# NOTE(review): the recorded shapes jump from 69441 to 901429 rows at this
# step, so market_value_df presumably holds several rows per player_id and
# every later merge on player_id alone multiplies rows further — confirm this
# fan-out is intended or aggregate per player first.
market_value_cols = market_value_df.drop(columns=['player_name'], errors='ignore')
merged_df = pd.merge(merged_df, market_value_cols, on='player_id', how='inner')

In [18]:
# (rows, columns) of merged_df after joining the market value table.
merged_df.shape

(901429, 38)

In [None]:
# Merge club performances first, then national-team performances, each with an
# inner join on player_id (players missing from either table are dropped).
# NOTE(review): merged_df already repeats player_id at this point; if these
# tables also have several rows per player, each join pairs every existing row
# with every performance row of that player — verify before modelling.
for stats_df in (performances_df, national_perf_df):
    merged_df = merged_df.merge(stats_df, on='player_id', how='inner')

In [None]:
# Attach injury records; the inner join keeps only players that appear in
# injuries_df.
merged_df = pd.merge(merged_df, injuries_df, on='player_id', how='inner')

In [None]:
# Lookup from cleaned (upper-cased) profile name to player_id.
# NOTE(review): if two different players share the same cleaned name, both ids
# receive the same averaged tweet scores — confirm names are unique enough.
player_name_to_id = profiles_df[['clean_player_name', 'player_id']].drop_duplicates()

# Average every numeric tweet column per upper-cased player name.
tweets_df['player_name_upper'] = tweets_df['player_name'].str.upper()
tweets_agg = (
    tweets_df
    .drop(columns=['text', 'player_name'], errors='ignore')
    .groupby('player_name_upper')
    .mean(numeric_only=True)
    .reset_index()
)

tweets_with_id = tweets_agg.merge(player_name_to_id, left_on='player_name_upper', right_on='clean_player_name', how='left')

# Drop redundant player name columns before final merge
tweets_with_id = tweets_with_id.drop(columns=['player_name_upper', 'clean_player_name'], errors='ignore')

# Names without a matching profile leave the left join with a null player_id.
# pandas matches null keys against each other when merging, so remove them
# rather than let them pair with any null-id rows in merged_df.
tweets_with_id = tweets_with_id.dropna(subset=['player_id'])

# Merge sentiment/tweets data
merged_df = merged_df.merge(
    tweets_with_id,
    on='player_id',
    how='inner'
)
print(f"After tweets_with_id merge: {merged_df.shape}")

# Written relative to the current working directory.
merged_df.to_csv("master_file.csv", index=False)

### Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from datetime import datetime

In [5]:
# Every backslash is escaped: an unescaped "\m" is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The resulting path is unchanged.
# NOTE(review): the merging section writes "master_file.csv" relative to the
# working directory — confirm this absolute path points at that same file.
df = pd.read_csv("C:\\Users\\hp\\Desktop\\player dataset\\master_file.csv")

  df = pd.read_csv("C:\\Users\\hp\\Desktop\\player dataset\master_file.csv")


In [6]:
# (rows, columns) of the loaded master file.
df.shape

(75731, 74)

In [7]:
# Parse day-month-year strings into datetimes; values that do not match the
# format become NaT instead of raising (errors='coerce').
birth_dates_raw = df['date_of_birth']
df['date_of_birth'] = pd.to_datetime(birth_dates_raw, errors='coerce', format='%d-%m-%Y')


def calculate_age(born, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    born : datetime-like or null
        Date of birth; must expose ``year``, ``month`` and ``day`` when not null.
    today : datetime-like, optional
        Reference date the age is measured at. Defaults to ``datetime.today()``;
        pass a fixed date to make the derived feature reproducible.

    Returns
    -------
    int or float
        Whole years between ``born`` and ``today``, or ``np.nan`` when ``born``
        is null (None, NaN or NaT).
    """
    if pd.isnull(born):
        return np.nan
    if today is None:
        today = datetime.today()
    # Subtract one year when this year's birthday has not been reached yet
    # (tuple comparison on (month, day); the bool acts as 0 or 1).
    birthday_still_ahead = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_still_ahead

# Columns removed once age has been derived: identifiers, URLs, the player
# name and the raw birth date itself.
cols_to_drop = [
    'player_id',
    'player_slug',
    'player_name',
    'player_image_url',
    'date_of_birth',
    'social_media_url',
]

# New feature: age computed element-wise from the parsed birth date.
df['age'] = df['date_of_birth'].apply(calculate_age)

# errors='ignore' tolerates any listed column that is not present.
df = df.drop(columns=cols_to_drop, errors='ignore')

print("Feature Engineering (Age) completed.")

Feature Engineering (Age) completed.


In [8]:
# Preview the first rows after adding age and dropping the listed columns.
df.head()

Unnamed: 0,name_in_home_country,place_of_birth,country_of_birth,height,citizenship,is_eu,position,main_position,foot,current_club_id,...,career_state,season_name_y,injury_reason,from_date,end_date,days_missed,games_missed,vader_polarity,tb_polarity,age
0,Joel Robles Blázquez,Getafe,Spain,194,Spain,True,Goalkeeper,Goalkeeper,right,1465,...,FORMER_NATIONAL_PLAYER,24/25,unknown injury,09-03-2025,12-04-2025,35,4,0.165895,0.053446,35
1,Joel Robles Blázquez,Getafe,Spain,194,Spain,True,Goalkeeper,Goalkeeper,right,1465,...,FORMER_NATIONAL_PLAYER,24/25,unknown injury,09-03-2025,12-04-2025,35,4,0.165895,0.053446,35
2,Joel Robles Blázquez,Getafe,Spain,194,Spain,True,Goalkeeper,Goalkeeper,right,1465,...,FORMER_NATIONAL_PLAYER,24/25,unknown injury,09-03-2025,12-04-2025,35,4,0.165895,0.053446,35
3,Joel Robles Blázquez,Getafe,Spain,194,Spain,True,Goalkeeper,Goalkeeper,right,1465,...,FORMER_NATIONAL_PLAYER,24/25,unknown injury,09-03-2025,12-04-2025,35,4,0.165895,0.053446,35
4,Joel Robles Blázquez,Getafe,Spain,194,Spain,True,Goalkeeper,Goalkeeper,right,1465,...,FORMER_NATIONAL_PLAYER,24/25,unknown injury,09-03-2025,12-04-2025,35,4,0.165895,0.053446,35


In [9]:
# Candidate feature columns per encoding strategy; any candidate that is not a
# column of df is silently left out.
available_columns = set(df.columns)

numeric_features = [
    name
    for name in ('height', 'matches', 'goals_x', 'assists', 'minutes_played', 'age', 'value_x')
    if name in available_columns
]

nominal_features = [
    name
    for name in ('position', 'foot', 'citizenship', 'current_club_name')
    if name in available_columns
]

# NOTE(review): competition_name is encoded as ordinal — confirm its
# categories really have a meaningful order.
ordinal_features = [
    name
    for name in ('competition_name',)
    if name in available_columns
]

In [10]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal columns: fill gaps with the literal 'missing', then one-hot encode.
# Categories unseen during fit are ignored at transform time.
# Note: use sparse=False for sklearn versions < 1.2
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Ordinal columns: fill gaps with the most frequent value, then map categories
# to integer codes; categories unseen during fit are encoded as -1.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [11]:
# Route each feature group to its transformer; the block names 'num', 'nom'
# and 'ord' are relied on when feature names are rebuilt later.
column_blocks = [
    ('num', numeric_transformer, numeric_features),
    ('nom', nominal_transformer, nominal_features),
    ('ord', ordinal_transformer, ordinal_features),
]

# remainder='drop': columns not listed in any block are discarded.
preprocessor = ColumnTransformer(transformers=column_blocks, remainder='drop')

# Single-step pipeline wrapping the preprocessor.
model_pipeline = Pipeline([('preprocessor', preprocessor)])

In [12]:
# Fit the preprocessing pipeline on df and transform df in the same call.
# NOTE(review): the pipeline is fitted on the whole frame; if a train/test
# split happens later, confirm fitting should not be restricted to the train part.
X_processed = model_pipeline.fit_transform(df)

def get_feature_names(column_transformer):
    """Rebuild the output column names of a fitted column transformer.

    Walks ``column_transformer.transformers_`` in order. Blocks named 'num' and
    'ord' contribute their input column names unchanged; the 'nom' block
    contributes the names produced by the 'onehot' step of its pipeline.
    Blocks with any other name are skipped.

    Returns
    -------
    list
        Feature names in the order the blocks appear.
    """
    same_name_blocks = ('num', 'ord')
    collected = []

    for block_name, fitted_block, input_columns in column_transformer.transformers_:
        if block_name in same_name_blocks:
            # One output column per input column, names carried over as-is.
            collected.extend(input_columns)
            continue
        if block_name == 'nom':
            # One-hot encoding expands each column; ask the encoder for the names.
            encoder = fitted_block.named_steps['onehot']
            collected.extend(encoder.get_feature_names_out(input_columns))

    return collected

# Label the transformed matrix with readable column names when they can be
# recovered from the fitted preprocessor.
try:
    feature_names = get_feature_names(preprocessor)
    df_clean = pd.DataFrame(X_processed, columns=feature_names)
except AttributeError as exc:
    # Best-effort fallback: keep going with positional column labels, but say
    # so — otherwise the saved CSV silently loses its column names.
    print(f"Could not recover feature names ({exc!r}); using positional column labels.")
    df_clean = pd.DataFrame(X_processed)

# Save the cleaned preprocessed dataset
output_filename = 'cleaned_dataset.csv'
df_clean.to_csv(output_filename, index=False)

print(f"Successfully cleaned and saved dataset to {output_filename}")

Successfully cleaned and saved dataset to cleaned_dataset.csv


In [13]:
# Preview the first rows of the preprocessed feature table.
df_clean.head()

Unnamed: 0,height,matches,goals_x,assists,minutes_played,age,value_x,position_Attack - Centre-Forward,position_Goalkeeper,position_Midfield - Defensive Midfield,foot_right,citizenship_Italy Brazil,citizenship_Jamaica England,citizenship_Montserrat England,citizenship_Spain,current_club_name_CR Flamengo,current_club_name_Chelmsford City,current_club_name_GD Estoril Praia,current_club_name_West Ham United U21,competition_name
0,2.90256,-0.845596,-0.32069,-0.504084,0.181561,0.38391,-0.445744,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,26.0
1,2.90256,-0.737622,-0.32069,-0.504084,0.181561,0.38391,-0.445744,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,26.0
2,2.90256,-0.899583,-0.32069,-0.504084,0.181561,0.38391,-0.445744,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,26.0
3,2.90256,-0.845596,-0.32069,-0.504084,8.239162,0.38391,-0.445744,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,26.0
4,2.90256,-0.737622,-0.32069,-0.504084,8.239162,0.38391,-0.445744,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,26.0


In [14]:
# (rows, columns) of the preprocessed feature table.
df_clean.shape

(75731, 20)

In [15]:
# Column dtypes and non-null counts of the preprocessed feature table.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75731 entries, 0 to 75730
Data columns (total 20 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   height                                  75731 non-null  float64
 1   matches                                 75731 non-null  float64
 2   goals_x                                 75731 non-null  float64
 3   assists                                 75731 non-null  float64
 4   minutes_played                          75731 non-null  float64
 5   age                                     75731 non-null  float64
 6   value_x                                 75731 non-null  float64
 7   position_Attack - Centre-Forward        75731 non-null  float64
 8   position_Goalkeeper                     75731 non-null  float64
 9   position_Midfield - Defensive Midfield  75731 non-null  float64
 10  foot_right                              75731 non-null  fl