In [None]:

# 1. Upload ZIP file containing CSV or Excel files
import os
import shutil
import zipfile

import numpy as np
import pandas as pd
from google.colab import files

uploaded = files.upload()
zip_name = list(uploaded.keys())[0]
print("Uploaded ZIP:", zip_name)

# 2. Extract ZIP to a folder

extract_path = "/content/extracted_csv"
# Start from an empty folder: without this, files extracted by an earlier run
# of this cell stay on disk and are loaded again together with the new upload.
if os.path.exists(extract_path):
    shutil.rmtree(extract_path)
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("\nFiles extracted to:", extract_path)

# 3. Find both CSV + Excel files
# Walk sub-folders as well, because an archive made from a folder keeps that
# folder as its top-level entry. Names are stored relative to extract_path so
# that os.path.join(extract_path, name) still resolves to the file.
all_files = []
for folder, subfolders, names in os.walk(extract_path):
    # macOS archives carry a "__MACOSX" metadata folder; do not descend into it.
    subfolders[:] = [d for d in subfolders if d != "__MACOSX"]
    for name in names:
        all_files.append(os.path.relpath(os.path.join(folder, name), extract_path))
# os.walk order is filesystem dependent; sort for a reproducible processing order.
all_files.sort()

# Compare extensions case-insensitively so "DATA.CSV" / "Book.XLSX" are found too.
csv_files = [f for f in all_files if f.lower().endswith(".csv")]
excel_files = [f for f in all_files if f.lower().endswith((".xlsx", ".xls"))]

print("\nCSV Found:", csv_files)
print("Excel Found:", excel_files)

# Raw frames and cleaned frames, both keyed by the file name found above.
dfs = {}
dfs_cleaned = {}

# 4. IQR Outlier Removal
def remove_outliers_iqr(df):
    """Drop rows whose numeric values lie outside the 1.5 * IQR fences.

    Numeric columns are handled one after another; the quartiles of each
    column are computed on the rows that survived the columns before it.

    Missing values are not treated as outliers. A NaN compares False against
    both fences, so filtering on the comparison alone would drop every row
    with a missing numeric value, and a column that is entirely NaN would
    empty the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall inside the fences of every numeric column.
    """
    num_cols = df.select_dtypes(include=np.number).columns
    for col in num_cols:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        # Keep in-range values and missing values; only real outliers go.
        keep = values.between(lower, upper) | values.isna()
        df = df[keep]
    return df

# 5. Handle datetime + feature engineering
def process_datetime(df):
    """Parse date-like text columns and add year/month/day/weekday features.

    Only columns that already hold datetimes, or text/object columns in which
    every value parses as a date, are converted. Numeric columns are skipped
    on purpose: ``pd.to_datetime`` accepts numbers and reads them as offsets
    from the Unix epoch, which would silently turn ids or measurements into
    timestamps. Columns with no values at all are skipped too, since they
    would only produce all-missing feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is processed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each detected date column is converted to
        datetime and followed by ``<col>_year``, ``<col>_month``,
        ``<col>_day`` and ``<col>_weekday`` columns.
    """
    df = df.copy()
    # Snapshot the labels: feature columns are added while iterating.
    for col in list(df.columns):
        series = df[col]
        if pd.api.types.is_datetime64_any_dtype(series):
            parsed = series
        elif pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            if not series.notna().any():
                continue
            try:
                parsed = pd.to_datetime(series)
            except (ValueError, TypeError, OverflowError):
                # Not a date column; leave it exactly as it was.
                continue
            if not pd.api.types.is_datetime64_any_dtype(parsed):
                continue
        else:
            continue

        df[col] = parsed
        df[f"{col}_year"] = parsed.dt.year
        df[f"{col}_month"] = parsed.dt.month
        df[f"{col}_day"] = parsed.dt.day
        df[f"{col}_weekday"] = parsed.dt.weekday
    return df

# 6. Load all files + clean them
def load_file(file_path):
    """Read one tabular file into a DataFrame.

    Paths ending in ``.csv`` (compared case-insensitively, so ``DATA.CSV``
    is recognised as well) are read with ``pd.read_csv``; every other path
    is handed to ``pd.read_excel``.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    if file_path.lower().endswith(".csv"):
        return pd.read_csv(file_path)
    return pd.read_excel(file_path)

all_input_files = csv_files + excel_files

if len(all_input_files) == 0:
    raise Exception("❌ No CSV or Excel files found inside ZIP!")

for file in all_input_files:
    file_path = os.path.join(extract_path, file)

    raw_df = load_file(file_path)
    # dfs keeps the frame exactly as loaded.
    dfs[file] = raw_df

    # datetime processing
    # Work on a copy so the raw frame stored in dfs is not altered by the
    # column conversions and added feature columns.
    df = process_datetime(raw_df.copy())

    # outlier cleaning
    cleaned = remove_outliers_iqr(df)

    dfs_cleaned[file] = cleaned
    print("Loaded & Cleaned:", file)

# 7. Show sample output
print("\nExample cleaned first 5 rows:")
first_df = next(iter(dfs_cleaned.values()))
display(first_df.head())


Saving u-2 outliers removed excel.zip to u-2 outliers removed excel.zip
Uploaded ZIP: u-2 outliers removed excel.zip

Files extracted to: /content/extracted_csv

CSV Found: []
Excel Found: ['cleaned_player_profiles (2) (1).xlsx', 'cleaned_player_market_value (1) (1).xlsx', 'player_performances_cleaned_partial (1).xlsx', 'cleaned_tweets_premier_league_footballers_final (2).xlsx', 'player_injuries_cleaned_final (2) (1).xlsx']


  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])


Loaded & Cleaned: cleaned_player_profiles (2) (1).xlsx
Loaded & Cleaned: cleaned_player_market_value (1) (1).xlsx


  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])


Loaded & Cleaned: player_performances_cleaned_partial (1).xlsx


  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])


Loaded & Cleaned: cleaned_tweets_premier_league_footballers_final (2).xlsx


  df[col] = pd.to_datetime(df[col])
  df[col] = pd.to_datetime(df[col])


Loaded & Cleaned: player_injuries_cleaned_final (2) (1).xlsx

Example cleaned first 5 rows:


Unnamed: 0,player_id,player_slug,player_name,player_image_url,name_in_home_country,date_of_birth,place_of_birth,country_of_birth,height,citizenship,...,fourth_club_url_day,fourth_club_url_weekday,fourth_club_name_year,fourth_club_name_month,fourth_club_name_day,fourth_club_name_weekday,date_of_death_year,date_of_death_month,date_of_death_day,date_of_death_weekday


In [None]:

# 1. Upload ZIP
from google.colab import files
import zipfile, os, shutil
import pandas as pd
import numpy as np

uploaded = files.upload()
zip_name = list(uploaded)[0]  # name of the first uploaded file
print("Uploaded ZIP:", zip_name)

# 2. Extract ZIP
# The target folder is removed first, so it only ever holds this upload.
extract_dir = "/content/extracted_csv"
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_name, 'r') as archive:
    archive.extractall(extract_dir)

print("Files extracted to:", extract_dir)

# 3. Identify CSV & Excel Files
# Collect every extracted file (sub-folders included) as a pair of
# (lower-cased file name, full path), then split the list by extension.
extracted = [
    (name.lower(), os.path.join(folder, name))
    for folder, _subfolders, names in os.walk(extract_dir)
    for name in names
]
csv_files = [path for lowered, path in extracted if lowered.endswith(".csv")]
excel_files = [
    path for lowered, path in extracted if lowered.endswith((".xlsx", ".xls"))
]

print("\nCSV Found:", csv_files)
print("Excel Found:", excel_files)

# 4. Outlier Detection + Capping (IQR)
def handle_outliers_iqr(df, factor=3):
    """Cap values of int64/float64 columns at the ``factor`` * IQR fences.

    For each such column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; values beyond a fence are replaced by that fence.
    No rows are removed.

    Columns whose IQR is zero (or undefined because the column has no values)
    are left unchanged. With a zero IQR both fences equal the quartile value,
    so capping would overwrite every different value and turn the column
    into a constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a capped copy is returned.
    factor : float, default 3
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns capped.
    """
    capped = df.copy()
    numeric_cols = capped.select_dtypes(include=['int64', 'float64']).columns

    for col in numeric_cols:
        Q1 = capped[col].quantile(0.25)
        Q3 = capped[col].quantile(0.75)
        IQR = Q3 - Q1

        # "not > 0" is also true for NaN, i.e. for columns without any value.
        if not IQR > 0:
            continue

        lower = Q1 - factor * IQR
        upper = Q3 + factor * IQR

        capped[col] = capped[col].clip(lower, upper)

    return capped

# 5. Date Feature Engineering
def date_feature_engineering(df):
    """Convert columns named like dates to datetime and derive features.

    A column is a candidate when its name contains "date" (case-insensitive).
    A candidate is converted only when that is safe:

    * numeric columns are skipped, because ``pd.to_datetime`` would read the
      numbers as offsets from the Unix epoch;
    * a column that holds values of which none parses as a date is skipped,
      because ``errors="coerce"`` would replace all of its content with NaT
      (the substring test also matches names such as ``update_note``).

    For every converted column the features ``<base>_year``, ``<base>_month``,
    ``<base>_day``, ``<base>_weekday``, ``<base>_is_weekend`` and
    ``<base>_days_from_start`` are added, where ``<base>`` is the column name
    with "date" removed and surrounding underscores stripped. If nothing is
    left (the column is called exactly "date"), the column name itself is
    used as the prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is processed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with converted date columns and the derived features.
    """
    df = df.copy()

    # Snapshot the labels: feature columns are added while iterating.
    for col in list(df.columns):
        # Column labels read from a spreadsheet are not always strings.
        name = str(col)
        if "date" not in name.lower():
            continue

        source = df[col]
        if pd.api.types.is_numeric_dtype(source):
            continue

        try:
            parsed = pd.to_datetime(source, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            continue

        # Skip columns that failed conversion
        if not pd.api.types.is_datetime64_any_dtype(parsed):
            continue
        # The column had content but none of it is a date: keep it as it was.
        if source.notna().any() and not parsed.notna().any():
            continue

        df[col] = parsed

        base = name.replace("date", "").strip("_") or name

        # YEAR
        df[f"{base}_year"] = parsed.dt.year
        # MONTH
        df[f"{base}_month"] = parsed.dt.month
        # DAY
        df[f"{base}_day"] = parsed.dt.day
        # DAY OF WEEK
        df[f"{base}_weekday"] = parsed.dt.weekday
        # IS WEEKEND (weekday 5 and 6)
        df[f"{base}_is_weekend"] = parsed.dt.weekday >= 5

        # NORMALIZED: DAYS SINCE MIN DATE
        df[f"{base}_days_from_start"] = (parsed - parsed.min()).dt.days

    return df


# 6. Load → Clean → Feature Engineering
# Cleaned frames, keyed by file name.
dfs_cleaned = {}

def load_and_clean(path):
    """Load one CSV or Excel file and run the cleaning steps on it.

    The file is read, passed through ``date_feature_engineering`` and then
    through ``handle_outliers_iqr``.

    Parameters
    ----------
    path : str
        Path of the file. Paths ending in ``.csv`` (case-insensitive) are
        read as CSV; anything else is read as an Excel workbook.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    print("\nCleaning:", os.path.basename(path))

    # Load file
    if path.lower().endswith(".csv"):
        df = pd.read_csv(path)
    else:
        # No explicit engine: the file search also accepts ".xls", which
        # openpyxl cannot open. pandas picks a suitable reader per format.
        df = pd.read_excel(path)

    # Date normalization + feature engineering
    df = date_feature_engineering(df)

    # Outlier handling
    df = handle_outliers_iqr(df)

    return df


# Load & clean all files
for file in csv_files + excel_files:
    cleaned_df = load_and_clean(file)
    dfs_cleaned[os.path.basename(file)] = cleaned_df
    print("→ Loaded & Cleaned:", os.path.basename(file))

# 7. Save Cleaned Files
cleaned_dir = "/content/cleaned_output"
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)
os.makedirs(cleaned_dir, exist_ok=True)

# Output names already written in this run (lower-cased), so that e.g.
# "a.csv" and "a.xlsx" do not both end up in "a.csv".
used_names = set()

for filename, df in dfs_cleaned.items():
    # Swap only the real extension, whatever its letter case; replacing the
    # text ".xls"/".xlsx" anywhere in the name would miss "Book.XLSX" and
    # could alter the middle of a file name.
    stem = os.path.splitext(filename)[0]
    csv_name = f"{stem}.csv"
    counter = 1
    while csv_name.lower() in used_names:
        csv_name = f"{stem}_{counter}.csv"
        counter += 1
    used_names.add(csv_name.lower())

    save_path = os.path.join(cleaned_dir, csv_name)
    df.to_csv(save_path, index=False)
    print("Saved:", save_path)


# 8. Create ZIP of Cleaned Output
cleaned_zip = "cleaned_files.zip"
shutil.make_archive("cleaned_files", 'zip', cleaned_dir)

print("\nZIP created:", cleaned_zip)

files.download(cleaned_zip)


Saving u-2 outliers removed excel.zip to u-2 outliers removed excel (1).zip
Uploaded ZIP: u-2 outliers removed excel (1).zip
Files extracted to: /content/extracted_csv

CSV Found: []
Excel Found: ['/content/extracted_csv/cleaned_player_profiles (2) (1).xlsx', '/content/extracted_csv/cleaned_player_market_value (1) (1).xlsx', '/content/extracted_csv/player_performances_cleaned_partial (1).xlsx', '/content/extracted_csv/cleaned_tweets_premier_league_footballers_final (2).xlsx', '/content/extracted_csv/player_injuries_cleaned_final (2) (1).xlsx']

Cleaning: cleaned_player_profiles (2) (1).xlsx
→ Loaded & Cleaned: cleaned_player_profiles (2) (1).xlsx

Cleaning: cleaned_player_market_value (1) (1).xlsx
→ Loaded & Cleaned: cleaned_player_market_value (1) (1).xlsx

Cleaning: player_performances_cleaned_partial (1).xlsx
→ Loaded & Cleaned: player_performances_cleaned_partial (1).xlsx

Cleaning: cleaned_tweets_premier_league_footballers_final (2).xlsx
