# ScoreSight - Part 4: Encoding and Feature Selection

**Author:** Prathamesh Fuke  
**Branch:** Prathamesh_Fuke  
**Date:** October 28, 2025

## Objective
Prepare data for machine learning models:
- Encode categorical variables
- Select relevant features for each prediction task
- Handle multicollinearity
- Scale numerical features (deferred to the model-training phase to avoid data leakage; see Section 4)
- Create final datasets for modeling

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Never truncate the column listing when a DataFrame is displayed
pd.options.display.max_columns = None
print("✓ Libraries imported")

In [None]:
# Read the three feature-engineered CSV files from the working directory
print("Loading feature-engineered datasets...")
match_data = pd.read_csv('data_features_match.csv')
player_data = pd.read_csv('data_features_player.csv')
league_data = pd.read_csv('data_features_league.csv')

# Report the (rows, columns) shape of each loaded frame
print(
    f"✓ Match data: {match_data.shape}",
    f"✓ Player data: {player_data.shape}",
    f"✓ League data: {league_data.shape}",
    sep="\n",
)

## 2. Encode Categorical Variables

### 2.1 Encode Match Data

In [None]:
print("=" * 80, "ENCODING MATCH DATA", "=" * 80, sep="\n")

# Encode on a copy so the loaded match_data frame stays unmodified
match_encoded = match_data.copy()

# Columns with object dtype are the ones treated as categorical
categorical_cols = match_encoded.select_dtypes(include='object').columns.tolist()
print(
    f"\nCategorical columns found: {len(categorical_cols)}",
    categorical_cols,
    sep="\n",
)

In [None]:
# Label-encode every object-dtype column of match_encoded.
#
# The column list is derived from match_encoded here instead of being read
# from the shared `categorical_cols` variable: that name is rebound to the
# player and league columns in later sections, so re-running this cell after
# them would look up columns that belong to a different frame.
categorical_cols_match = match_encoded.select_dtypes(include=['object']).columns.tolist()

# Fitted encoders, keyed by the name of the source column
label_encoders_match = {}

for col in categorical_cols_match:
    le = LabelEncoder()
    # astype(str) hands the encoder a uniform string column.
    # NOTE(review): missing values presumably become the literal 'nan' and get
    # their own integer code — confirm that is the intended treatment.
    match_encoded[col + '_encoded'] = le.fit_transform(match_encoded[col].astype(str))
    label_encoders_match[col] = le
    print(f"✓ Encoded '{col}' → '{col}_encoded' ({len(le.classes_)} unique values)")

print(f"\nTotal encoders created: {len(label_encoders_match)}")

### 2.2 Encode Player Data

In [None]:
print("=" * 80, "ENCODING PLAYER DATA", "=" * 80, sep="\n")

# Encode on a copy so the loaded player_data frame stays unmodified
player_encoded = player_data.copy()

# Columns with object dtype are the ones treated as categorical
categorical_cols = player_encoded.select_dtypes(include='object').columns.tolist()
print(
    f"\nCategorical columns found: {len(categorical_cols)}",
    categorical_cols,
    sep="\n",
)

In [None]:
# Label-encode every object-dtype column of player_encoded.
#
# The column list is derived from player_encoded here instead of being read
# from the shared `categorical_cols` variable: that name is also bound to the
# match and league columns in other sections, so its contents depend on which
# cell ran last.
categorical_cols_player = player_encoded.select_dtypes(include=['object']).columns.tolist()

# Fitted encoders, keyed by the name of the source column
label_encoders_player = {}

for col in categorical_cols_player:
    le = LabelEncoder()
    # astype(str) hands the encoder a uniform string column.
    # NOTE(review): missing values presumably become the literal 'nan' and get
    # their own integer code — confirm that is the intended treatment.
    player_encoded[col + '_encoded'] = le.fit_transform(player_encoded[col].astype(str))
    label_encoders_player[col] = le
    print(f"✓ Encoded '{col}' → '{col}_encoded' ({len(le.classes_)} unique values)")

print(f"\nTotal encoders created: {len(label_encoders_player)}")

### 2.3 Encode League Data

In [None]:
print("=" * 80, "ENCODING LEAGUE DATA", "=" * 80, sep="\n")

# Encode on a copy so the loaded league_data frame stays unmodified
league_encoded = league_data.copy()

# Columns with object dtype are the ones treated as categorical
categorical_cols = league_encoded.select_dtypes(include='object').columns.tolist()
print(
    f"\nCategorical columns found: {len(categorical_cols)}",
    categorical_cols,
    sep="\n",
)

In [None]:
# Label-encode every object-dtype column of league_encoded.
#
# The column list is derived from league_encoded here instead of being read
# from the shared `categorical_cols` variable: that name is also bound to the
# match and player columns in earlier sections, so its contents depend on
# which cell ran last.
categorical_cols_league = league_encoded.select_dtypes(include=['object']).columns.tolist()

# Fitted encoders, keyed by the name of the source column
label_encoders_league = {}

for col in categorical_cols_league:
    le = LabelEncoder()
    # astype(str) hands the encoder a uniform string column.
    # NOTE(review): missing values presumably become the literal 'nan' and get
    # their own integer code — confirm that is the intended treatment.
    league_encoded[col + '_encoded'] = le.fit_transform(league_encoded[col].astype(str))
    label_encoders_league[col] = le
    print(f"✓ Encoded '{col}' → '{col}_encoded' ({len(le.classes_)} unique values)")

print(f"\nTotal encoders created: {len(label_encoders_league)}")

## 3. Feature Selection

### 3.1 Select Features for Match Outcome Prediction

In [None]:
print("=" * 80, "FEATURE SELECTION FOR MATCH OUTCOME PREDICTION", "=" * 80, sep="\n")

# Numbered listing of every column currently in match_encoded
print(f"\nAvailable columns in match data: {len(match_encoded.columns)}")
print("\nColumn list:")
for position, column_name in enumerate(match_encoded.columns, start=1):
    print(f"{position:3d}. {column_name}")

In [None]:
# Keep only the numeric-dtype columns of match_encoded as modeling candidates
numeric_cols_match = match_encoded.select_dtypes(include='number').columns.tolist()
print(
    f"\nNumeric columns available: {len(numeric_cols_match)}",
    numeric_cols_match,
    sep="\n",
)

### 3.2 Select Features for Top Scorer Prediction

In [None]:
print("=" * 80, "FEATURE SELECTION FOR TOP SCORER PREDICTION", "=" * 80, sep="\n")

# Numbered listing of every column currently in player_encoded
print(f"\nAvailable columns in player data: {len(player_encoded.columns)}")
print("\nColumn list:")
for position, column_name in enumerate(player_encoded.columns, start=1):
    print(f"{position:3d}. {column_name}")

In [None]:
# Keep only the numeric-dtype columns of player_encoded as modeling candidates
numeric_cols_player = player_encoded.select_dtypes(include='number').columns.tolist()
print(
    f"\nNumeric columns available: {len(numeric_cols_player)}",
    numeric_cols_player,
    sep="\n",
)

### 3.3 Select Features for Points Tally Prediction

In [None]:
print("=" * 80, "FEATURE SELECTION FOR POINTS TALLY PREDICTION", "=" * 80, sep="\n")

# Numbered listing of every column currently in league_encoded
print(f"\nAvailable columns in league data: {len(league_encoded.columns)}")
print("\nColumn list:")
for position, column_name in enumerate(league_encoded.columns, start=1):
    print(f"{position:3d}. {column_name}")

In [None]:
# Keep only the numeric-dtype columns of league_encoded as modeling candidates
numeric_cols_league = league_encoded.select_dtypes(include='number').columns.tolist()
print(
    f"\nNumeric columns available: {len(numeric_cols_league)}",
    numeric_cols_league,
    sep="\n",
)

## 4. Feature Scaling

In [None]:
print("=" * 80, "FEATURE SCALING", "=" * 80, sep="\n")

# No scaler is fitted in this notebook: the data is saved unscaled and the
# scaling step is left to the train/test split in the model-training phase.
print(
    "\nFeature scaling will be applied during model training phase",
    "This ensures proper train/test separation and prevents data leakage",
    sep="\n",
)

## 5. Create Final Modeling Datasets

In [None]:
print("=" * 80, "CREATING FINAL MODELING DATASETS", "=" * 80, sep="\n")

# Each final frame is an independent copy restricted to the numeric columns
# selected for that task.

# Match prediction
match_final = match_encoded.loc[:, numeric_cols_match].copy()
print(f"\n✓ Match prediction dataset: {match_final.shape}")

# Top scorer prediction
player_final = player_encoded.loc[:, numeric_cols_player].copy()
print(f"✓ Top scorer prediction dataset: {player_final.shape}")

# Points tally prediction
league_final = league_encoded.loc[:, numeric_cols_league].copy()
print(f"✓ Points tally prediction dataset: {league_final.shape}")

## 6. Data Summary

In [None]:
print("=" * 80, "FINAL DATA SUMMARY", "=" * 80, sep="\n")

# One (heading, frame) pair per prediction task, reported in a fixed order
summary_sections = (
    ("1. Match Outcome Prediction", match_final),
    ("2. Top Scorer Prediction", player_final),
    ("3. Points Tally Prediction", league_final),
)

for heading, frame in summary_sections:
    n_rows, n_cols = frame.shape
    print(f"\n{heading}:")
    print(f"   - Samples: {n_rows:,}")
    print(f"   - Features: {n_cols}")
    # Total count of null cells across the whole frame
    print(f"   - Missing values: {frame.isna().sum().sum()}")

## 7. Save Processed Data

In [None]:
print("\nSaving final modeling datasets...")

# Encoded frames keep every column (original values plus *_encoded codes)
encoded_outputs = (
    (match_encoded, 'data_encoded_match.csv'),
    (player_encoded, 'data_encoded_player.csv'),
    (league_encoded, 'data_encoded_league.csv'),
)
for frame, csv_name in encoded_outputs:
    frame.to_csv(csv_name, index=False)
print("✓ Encoded datasets saved")

# Final modeling frames hold the numeric columns only
final_outputs = (
    (match_final, 'data_final_match_prediction.csv'),
    (player_final, 'data_final_top_scorer.csv'),
    (league_final, 'data_final_points_tally.csv'),
)
for frame, csv_name in final_outputs:
    frame.to_csv(csv_name, index=False)
print("✓ Final modeling datasets saved")

print(
    "\n" + "=" * 80,
    "NOTEBOOK 04 COMPLETED - Ready for Visualization and Modeling",
    "=" * 80,
    sep="\n",
)