In [2]:
import pandas as pd
import numpy as np

# Load your data
df = pd.read_csv('/content/merged_final.csv')

# -------------------------
# 1. PREPROCESS DATETIME COLUMN
# -------------------------

# Datetime columns are auto-detected by name: every column whose name contains
# "date" (case-insensitive) is converted. This is a substring heuristic, so
# review the printed list and edit `datetime_cols` by hand if it picks up a
# non-date column or misses one.
datetime_cols = [col for col in df.columns if 'date' in col.lower()]

# errors='coerce' turns every unparseable value into NaT. Track how many
# originally non-null values were lost per column so the coercion is visible
# instead of silent.
coerced_counts = {}
for col in datetime_cols:
    parsed = pd.to_datetime(df[col], errors='coerce')
    coerced_counts[col] = int((parsed.isna() & df[col].notna()).sum())
    df[col] = parsed

print("Datetime Columns Found:", datetime_cols)
print(df[datetime_cols].head())

# Only report columns that actually lost values; nothing extra is printed when
# every non-null value parsed cleanly.
lossy_cols = {col: lost for col, lost in coerced_counts.items() if lost}
if lossy_cols:
    print("Non-null values coerced to NaT per column:", lossy_cols)


# -------------------------
# 2. IDENTIFY OUTLIERS (IQR METHOD)
# -------------------------

# Only int64/float64 columns are screened. A value is counted as an outlier
# when it lies more than 1.5 * IQR below the 25th percentile or above the
# 75th percentile of its column.
numeric_cols = df.select_dtypes(include=['float64','int64']).columns

outliers = {}

for col in numeric_cols:
    values = df[col]
    q_low, q_high = values.quantile(0.25), values.quantile(0.75)
    spread = q_high - q_low
    fence_low = q_low - 1.5 * spread
    fence_high = q_high + 1.5 * spread

    # Count the True entries of the mask directly; comparisons against NaN
    # are False, so missing values are never counted.
    beyond_fences = (values < fence_low) | (values > fence_high)
    outliers[col] = int(beyond_fences.sum())

print("\nOutlier Count per Column:")
for name, n_outliers in outliers.items():
    print(f"{name}: {n_outliers}")


# -------------------------
# 3. EXTRACT COLUMN TYPES INTO LISTS
# -------------------------

# Numerical = int64/float64 columns; categorical = object-dtype columns.
numeric_index = df.select_dtypes(include=['int64','float64']).columns
object_index = df.select_dtypes(include=['object']).columns
numerical_cols = numeric_index.tolist()
categorical_cols = object_index.tolist()

# Ordinal columns are not derived from dtypes here: fill this list in by hand
# with the columns you know to be ordered categories.
ordinal_cols = []  # Add manually if needed

for label, names in (
    ("\nNumerical Columns:\n", numerical_cols),
    ("\nCategorical Columns:\n", categorical_cols),
    ("\nOrdinal Columns:\n", ordinal_cols),
):
    print(label, names)


Datetime Columns Found: ['date_of_birth', 'date_of_last_contract_extension', 'date_of_death', 'date_unix_x', 'date_unix_y', 'from_date', 'end_date']
  date_of_birth date_of_last_contract_extension date_of_death date_unix_x  \
0    1990-06-17                      2025-03-17           NaT  2025-05-29   
1    1990-06-17                      2025-03-17           NaT  2025-05-29   
2    1990-06-17                      2025-03-17           NaT  2025-05-29   
3    1990-06-17                      2025-03-17           NaT  2025-05-29   
4    1990-06-17                      2025-03-17           NaT  2025-05-29   

  date_unix_y  from_date   end_date  
0  2010-08-26 2025-09-03 2025-12-04  
1  2010-08-26 2025-09-03 2025-12-04  
2  2010-08-26 2025-09-03 2025-12-04  
3  2010-08-26 2025-09-03 2025-12-04  
4  2010-08-26 2025-09-03 2025-12-04  

Outlier Count per Column:
player_id: 0
height: 0
current_club_id: 0
outfitter: 0
player_agent_id: 0
contract_option: 0
on_loan_from_club_id: 0
on_loan_from_clu

  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')
