In [None]:
!pip install chardet
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import chardet
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

path = "/content/drive/MyDrive/datasets_2/"

def detect_encoding(file_path, num_bytes=50000):
    """Guess a file's text encoding from its leading bytes.

    Args:
        file_path: Path of the file to inspect.
        num_bytes: How many leading bytes are read and handed to ``chardet.detect``.

    Returns:
        The encoding name reported by chardet, except that ``'utf-8'`` is returned
        when chardet reports plain ASCII or reports no encoding at all.
    """
    with open(file_path, 'rb') as f:
        guess = chardet.detect(f.read(num_bytes))['encoding']
    # Only a prefix of the file is sampled, so an "ascii" verdict says nothing about
    # the bytes that follow. UTF-8 decodes every ASCII byte identically, so widening
    # the guess is lossless and avoids a decode failure on a later non-ASCII byte.
    # A None guess (chardet could not decide) gets the same default.
    if guess is None or guess.lower() == 'ascii':
        return 'utf-8'
    return guess

def safe_load_csv(file_path, chunksize=250000):
    """Load a CSV into one DataFrame using a detected encoding, with a Latin-1 fallback.

    The encoding comes from ``detect_encoding`` and is printed. If reading with it
    fails for an encoding- or parsing-related reason, the file is read again as
    ISO-8859-1. Any other error (missing file, interrupt, ...) propagates unchanged.

    Args:
        file_path: Path of the CSV file.
        chunksize: Number of rows per chunk passed to ``pd.read_csv``.

    Returns:
        A DataFrame built by concatenating all chunks with a fresh integer index.
        Rows pandas considers malformed are skipped (``on_bad_lines='skip'``).
    """
    def _read(encoding):
        # Read chunk by chunk and stitch the pieces back into a single frame.
        chunks = pd.read_csv(
            file_path,
            encoding=encoding,
            chunksize=chunksize,
            low_memory=False,
            on_bad_lines='skip',
        )
        return pd.concat(chunks, ignore_index=True)

    encoding = detect_encoding(file_path)
    print("Encoding:", encoding)
    try:
        return _read(encoding)
    except (UnicodeError, LookupError, pd.errors.ParserError) as exc:
        # UnicodeError: bytes that do not decode; LookupError: an encoding name
        # Python has no codec for; ParserError: structure garbled by a wrong guess.
        # A bare except would also swallow KeyboardInterrupt and retry silently.
        print(f"Could not read with {encoding!r} ({type(exc).__name__}); retrying as ISO-8859-1")
        return _read('ISO-8859-1')

def clean_dataframe(df):
    """Drop duplicate rows and fill missing values, returning a new DataFrame.

    int64/float64 columns are filled with their column median; object columns are
    filled with their most frequent value. An object column with no non-missing
    values is left untouched because it has no mode to fill with.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A de-duplicated copy of ``df`` with the fills described above applied.
    """
    # Work on an explicit copy: assigning into the frame returned by
    # drop_duplicates() is what triggers pandas' SettingWithCopyWarning.
    df = df.drop_duplicates().copy()

    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        if not df[col].isna().any():
            continue
        modes = df[col].mode()
        # mode() is empty when every value is missing; indexing it would raise.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    return df

def safe_ohe(df, columns):
    """One-hot encode the given columns and return the widened frame.

    Args:
        df: Source DataFrame.
        columns: Names of the columns to encode. When empty, ``df`` itself is
            returned unchanged.

    Returns:
        A DataFrame made of ``df`` without ``columns`` followed by the dense
        indicator columns, named via ``get_feature_names_out``. Both parts get a
        fresh 0..n-1 index, so the original index of ``df`` is not kept.
    """
    if len(columns) == 0:
        return df

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    indicator_values = encoder.fit_transform(df[columns])
    indicator_frame = pd.DataFrame(
        indicator_values,
        columns=encoder.get_feature_names_out(columns),
    )

    # Both halves are re-indexed so the column-wise concat aligns row by position.
    untouched = df.drop(columns, axis=1).reset_index(drop=True)
    return pd.concat([untouched, indicator_frame.reset_index(drop=True)], axis=1)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input CSVs in the Drive folder, keyed by a short dataset name.
files = dict(
    performances=path + "player_performances.csv",
    profiles=path + "player_profiles.csv",
    tweets=path + "tweets_premier_league_footballers.csv",
    injuries=path + "player_injuries.csv",
    market=path + "player_latest_market_value.csv",
)

# CLEAN FILES
# Each input is loaded, cleaned and written back to the same folder as "<name>_s.csv".
cleaned_files = {}
for name, file_path in files.items():
    out_path = f"{path}{name}_s.csv"
    df = clean_dataframe(safe_load_csv(file_path))
    df.to_csv(out_path, index=False)
    cleaned_files[name] = out_path

# MERGE TWEETS WITH PROFILES (name match)
# player_id is attached to the tweets through a lower-cased, stripped player_name.
profiles_df = safe_load_csv(cleaned_files["profiles"])
tweets_df = safe_load_csv(cleaned_files["tweets"])

for frame in (profiles_df, tweets_df):
    frame["name_clean"] = frame["player_name"].str.lower().str.strip()

id_lookup = profiles_df[["player_id", "name_clean"]]
tweets_merged = pd.merge(tweets_df, id_lookup, how="left", on="name_clean")
tweets_merged = tweets_merged.drop(columns=["name_clean"])

# Same path the cleaning loop used for tweets, so this overwrites that file
# with the version that carries player_id.
tweets_path = path + "tweets_s.csv"
tweets_merged.to_csv(tweets_path, index=False)
cleaned_files["tweets"] = tweets_path

# MASTER FILE 1
# Start from the first dataset in merge_order and left-join the others on player_id.
# NOTE(review): every join is a plain left merge on player_id; if a dataset has
# several rows per player, each join presumably multiplies that player's rows.
# Confirm the resulting row count is intended.
merge_order = ["profiles", "performances", "injuries", "market", "tweets"]
master1 = None

for name in merge_order:
    df = safe_load_csv(cleaned_files[name])
    if master1 is None:
        master1 = df
    else:
        master1 = master1.merge(df, on="player_id", how="left")

master1 = master1.drop_duplicates()
master1_path = path + "master_file1_cleaned.csv"
master1.to_csv(master1_path, index=False)

master1_path


Encoding: utf-8
Encoding: utf-8
Encoding: ascii


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[num_cols] = df[num_cols].fillna(df[num_cols].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])


Encoding: ascii


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[num_cols] = df[num_cols].fillna(df[num_cols].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])


Encoding: ascii
Encoding: utf-8
Encoding: ascii
Encoding: utf-8
Encoding: utf-8
Encoding: ascii
Encoding: ascii
Encoding: ascii


'/content/drive/MyDrive/datasets_2/master_file1_cleaned.csv'

In [None]:
# Reload the merged master file from Drive and report its (rows, columns) shape.
master1 = safe_load_csv(master1_path)
print("Master1 loaded:", master1.shape)


Encoding: utf-8
Master1 loaded: (3749436, 70)


In [None]:
# Drop object columns with more than 200 distinct values, then draw a seeded
# random sample of at most 50,000 rows from what is left.
cat_cols = master1.select_dtypes(include=['object']).columns

cols_to_drop = [col for col in cat_cols if master1[col].nunique() > 200]

print("Dropping high-cardinality columns:")
print(cols_to_drop)

master1_reduced = master1.drop(columns=cols_to_drop)
print("Reduced Master1 shape:", master1_reduced.shape)
# sample() raises ValueError when n exceeds the number of rows (without
# replacement), so cap the request at the frame's length.
master1_small = master1_reduced.sample(n=min(50000, len(master1_reduced)), random_state=42)
print("Sampled dataset shape:", master1_small.shape)



Dropping high-cardinality columns:
['player_slug', 'player_name_x', 'player_image_url', 'name_in_home_country', 'date_of_birth', 'place_of_birth', 'country_of_birth', 'citizenship', 'current_club_name', 'joined', 'social_media_url', 'player_agent_name', 'date_of_last_contract_extension', 'on_loan_from_club_name', 'second_club_url', 'second_club_name', 'date_of_death', 'competition_id', 'competition_name', 'team_name', 'injury_reason', 'from_date', 'end_date', 'date_unix']
Reduced Master1 shape: (3749436, 46)
Sampled dataset shape: (50000, 46)


In [None]:
# Split the sample's object columns by distinct-value count: columns with fewer
# than 50 values go to low_card (one-hot encoded below), all others to high_card.
cat_cols = master1_small.select_dtypes(include=['object']).columns

low_card, high_card = [], []
for col in cat_cols:
    target = low_card if master1_small[col].nunique() < 50 else high_card
    target.append(col)

print("Low-cardinality:", low_card)
print("High-cardinality:", high_card)
master2 = safe_ohe(master1_small, low_card)
print("After OHE:", master2.shape)



Low-cardinality: ['position', 'main_position', 'foot', 'outfitter', 'contract_option', 'contract_there_expires', 'third_club_url', 'third_club_name', 'fourth_club_url', 'fourth_club_name', 'season_name_y']
High-cardinality: ['contract_expires', 'season_name_x']
After OHE: (50000, 161)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every high_card column with integer codes from a LabelEncoder fitted on
# that column alone; values are cast to str before fitting.
for col in high_card:
    as_text = master2[col].astype(str)
    le = LabelEncoder().fit(as_text)
    master2[col] = le.transform(as_text)

print("After Label Encoding:", master2.shape)


After Label Encoding: (50000, 161)


In [None]:
# Write the encoded frame to the Drive folder (without the index column) and
# print the destination path.
master2_path = path + "master_file2_preprocessed_small.csv"
master2.to_csv(master2_path, index=False)

print("Master File 2 saved:", master2_path)


Master File 2 saved: /content/drive/MyDrive/datasets_2/master_file2_preprocessed_small.csv


In [None]:
# Full-dataset encoding: reload the master file, one-hot encode object columns
# with fewer than 50 distinct values, label-encode the remaining object columns,
# and write the result to Drive.
# NOTE(review): no object columns are dropped in this cell, so columns such as
# URLs and names end up label-encoded. Confirm that is intended for the full file.
master1 = safe_load_csv(master1_path)

cat_cols = master1.select_dtypes(include=['object']).columns
low_card = [c for c in cat_cols if master1[c].nunique() < 50]
high_card = [c for c in cat_cols if c not in low_card]

print("Low-cardinality:", low_card)
print("High-cardinality:", high_card)

master2 = safe_ohe(master1, low_card)

# Each column gets its own encoder; values are cast to str before fitting.
for col in high_card:
    le = LabelEncoder()
    master2[col] = le.fit_transform(master2[col].astype(str))

master2_path = path + "master_file2_preprocessed.csv"
master2.to_csv(master2_path, index=False)
master2_path


Encoding: utf-8
Low-cardinality: ['position', 'main_position', 'foot', 'outfitter', 'contract_option', 'contract_there_expires', 'third_club_url', 'third_club_name', 'fourth_club_url', 'fourth_club_name', 'season_name_y']
High-cardinality: ['player_slug', 'player_name_x', 'player_image_url', 'name_in_home_country', 'date_of_birth', 'place_of_birth', 'country_of_birth', 'citizenship', 'current_club_name', 'joined', 'contract_expires', 'social_media_url', 'player_agent_name', 'date_of_last_contract_extension', 'on_loan_from_club_name', 'second_club_url', 'second_club_name', 'date_of_death', 'season_name_x', 'competition_id', 'competition_name', 'team_name', 'injury_reason', 'from_date', 'end_date', 'date_unix']
