In [None]:
import glob
import io
import os
import shutil
import zipfile
from functools import reduce

import pandas as pd
from google.colab import files

# 1. Upload the ZIP archive through the Colab file picker
uploaded = files.upload()

# The returned mapping can be empty (for example when the upload dialog is
# dismissed); fail with a clear message instead of an IndexError.
if not uploaded:
    raise FileNotFoundError("No file was uploaded - re-run the cell and select the ZIP archive.")

# Only the first uploaded file is used; any further uploads are ignored.
zip_name = next(iter(uploaded))
print("Uploaded ZIP:", zip_name)

# 2. Start from an empty output folder
base_dir = "/content/data"

# Anything left over from an earlier run is deleted first so stale CSVs
# cannot be picked up by the later steps.
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)

os.makedirs(base_dir)

# 3. Extract ZIP into a fresh scratch folder
extract_dir = "/content/extracted"
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir)

# Open the archive from the bytes returned by files.upload() instead of
# reopening it by name from the working directory. The upload log prints
# "Saving <name> to <local name>", i.e. the file on disk is not guaranteed
# to carry the same name as the dictionary key (NOTE(review): e.g. when the
# same archive is uploaded twice - confirm against the installed
# google.colab version). The in-memory bytes are always the fresh upload.
with zipfile.ZipFile(io.BytesIO(uploaded[zip_name]), 'r') as z:
    z.extractall(extract_dir)

print("Zip extracted to:", extract_dir)

# 4. Collect every CSV below the extraction folder, at any depth
# ("**" together with recursive=True also matches files in sub-folders).
csv_paths = glob.glob(f"{extract_dir}/**/*.csv", recursive=True)

print("\nFound CSV files:")
for csv_path in csv_paths:
    print(" -", csv_path)

# 5. Copy the required CSVs to /content/data under canonical names
# Maps each required table name to the path of its flattened copy
# (None until a matching CSV has been found).
required_files = {
    "player_profiles": None,
    "player_performances": None,
    "player_market_value": None,
    "player_injuries": None,
    "tweets_premier_league_footballers": None
}

# Keys whose current copy came from a file named exactly "<key>.csv".
exact_matches = set()

# Sorted so that the choice between competing files does not depend on the
# order in which glob happens to return them.
for file_path in sorted(csv_paths):
    # splitext removes only the extension; str.replace would also delete a
    # ".csv" that occurs in the middle of the file name.
    name = os.path.splitext(os.path.basename(file_path))[0]
    for key in required_files:
        # Substring match so names such as "<key> (1).csv" are still accepted.
        if key not in name:
            continue
        if key in exact_matches:
            # An exactly named file is never overridden by a look-alike.
            print(f" Ignoring {file_path}: {key}.csv already matched exactly")
            continue
        if required_files[key] is not None:
            print(f" Replacing earlier match for {key}.csv with {file_path}")
        new_path = f"{base_dir}/{key}.csv"
        shutil.copy(file_path, new_path)
        required_files[key] = new_path
        if name == key:
            exact_matches.add(key)

# 6. Stop early if any required table was not found
# A key still mapped to None means no CSV matched it in step 5.
missing = [key for key in required_files if required_files[key] is None]

if missing:
    print("\n Missing required CSV files:", missing)
    raise FileNotFoundError("Fix ZIP structure or missing file.")

print("\n All required CSV files found and flattened!")

# 7. LOAD THE FLATTENED CSV FILES
# One DataFrame per required table, keyed by table name.
# latin1 can decode any byte sequence, so reading never fails on encoding.
# NOTE(review): if the files are really UTF-8, non-ASCII text will be
# mis-decoded - confirm the source encoding.
dfs = {}
for table, csv_file in required_files.items():
    frame = pd.read_csv(csv_file, encoding="latin1", low_memory=False)
    dfs[table] = frame
    print(f"Loaded: {table}.csv → shape = {frame.shape}")

# 8. CLEAN FILES
# For every table: drop rows that are exact duplicates, then drop rows in
# which every column is missing.
cleaned = {}
for table, frame in dfs.items():
    cleaned[table] = frame.drop_duplicates().dropna(how="all")
    print(f"{table}: cleaned shape = {cleaned[table].shape}")

# 9. MERGE DATA USING player_id
merge_key = "player_id"

# Tables joined into the master table, in merge order.
# NOTE(review): tweets_premier_league_footballers is loaded and cleaned above
# but is not part of this merge - confirm that is intended.
merge_names = [
    "player_profiles",
    "player_performances",
    "player_market_value",
    "player_injuries"
]

# Fail with a message that names the offending table instead of the bare
# KeyError pandas raises from inside merge.
for table in merge_names:
    if merge_key not in cleaned[table].columns:
        raise KeyError(f"'{merge_key}' column not found in {table}.csv")

merge_list = [cleaned[table] for table in merge_names]

# When more than one table repeats the key, an outer merge pairs every row
# of one table with every row of the other for the same player, so the row
# count multiplies per player. Surface that before the expensive merge runs.
repeated = [table for table in merge_names
            if cleaned[table][merge_key].duplicated().any()]
if len(repeated) > 1:
    print(f"\n Warning: {merge_key} is repeated in {repeated}; "
          "the merge creates one row per combination of their rows for each player.")

# Chain the outer joins left to right across all tables.
master = reduce(lambda left, right: pd.merge(left, right, on=merge_key, how="outer"), merge_list)

print("\nMerged master table shape:", master.shape)

# 10. DROP COLUMNS THAT ARE MOSTLY EMPTY
# A column is kept only when its fraction of missing values is below the
# threshold.
threshold = 0.6
missing_share = master.isna().mean()
cols_to_keep = [col for col in master.columns if missing_share[col] < threshold]
master = master[cols_to_keep]

print("After dropping high-missing columns:", master.shape)

# 11. ONE-HOT ENCODING
# Only object-dtype columns with fewer than 40 distinct values are expanded;
# other object columns are passed through unchanged. dummy_na=True adds an
# indicator column for missing values to every encoded column.
categorical = master.select_dtypes(include=["object"]).columns
distinct_counts = master[categorical].nunique()
safe = [col for col in categorical if distinct_counts[col] < 40]

master_encoded = pd.get_dummies(master, columns=safe, dummy_na=True)

print("Final encoded shape:", master_encoded.shape)

# 12. WRITE THE RESULT TO DISK
# index=False keeps the DataFrame row index out of the CSV.
output_path = "/content/master_preprocessed.csv"
master_encoded.to_csv(output_path, index=False)
print("\n FINAL FILE SAVED: master_preprocessed.csv")

Saving archive.zip to archive.zip
Uploaded ZIP: archive.zip
Zip extracted to: /content/extracted

Found CSV files:
 - /content/extracted/tweets_premier_league_footballers (1).csv
 - /content/extracted/player_performances/player_performances.csv
 - /content/extracted/player_profiles/player_profiles.csv
 - /content/extracted/player_injuries/player_injuries.csv
 - /content/extracted/player_market_value/player_market_value.csv

 All required CSV files found and flattened!
Loaded: player_profiles.csv → shape = (92671, 34)
Loaded: player_performances.csv → shape = (1878719, 20)
Loaded: player_market_value.csv → shape = (901429, 3)
Loaded: player_injuries.csv → shape = (143195, 7)
Loaded: tweets_premier_league_footballers.csv → shape = (167841, 9)
player_profiles: cleaned shape = (92671, 34)
player_performances: cleaned shape = (1878719, 20)
player_market_value: cleaned shape = (901429, 3)
player_injuries: cleaned shape = (143084, 7)
tweets_premier_league_footballers: cleaned shape = (161169,