<a href="https://colab.research.google.com/github/rajendran-official/AI_ML_COURSE_ICT/blob/Third-Intermediate-Assessment---Supervised-Learning/Third_Intermediate_Assessment_Supervised_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries & Load Data

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')





In [14]:
# Read the training set, the test set and the submission template from the
# current working directory, in that order.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('trainfile.csv', 'testfile.csv', 'sample_submission.csv')
)

In [15]:
# Quick sanity check on what was loaded: (rows, columns) for each split.
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape:", frame.shape)
print("Data loaded successfully!")

Train shape: (54808, 14)
Test shape: (23490, 13)
Data loaded successfully!


Feature Engineering

In [22]:
# Feature engineering helper shared by the train and test frames.
def add_features(df, dept_mean=None):
    """Return a copy of ``df`` with six engineered columns appended.

    The input frame is not modified. Added columns:

    * ``kpi_and_award``      - ``KPIs_met >80%`` * ``awards_won?``
    * ``rating_x_kpi``       - ``previous_year_rating`` * ``KPIs_met >80%``
      (NaN wherever ``previous_year_rating`` is NaN at call time)
    * ``score_vs_dept_mean`` - ``avg_training_score`` / (department mean + 1)
    * ``age_per_service``    - ``age`` / (``length_of_service`` + 1)
    * ``training_effort``    - ``no_of_trainings`` * ``avg_training_score``
    * ``score_bin``          - ``avg_training_score`` cut into the intervals
      [0, 50], (50, 60], (60, 70], (70, 80], (80, 90], (90, 100], labelled
      0..5; scores outside 0..100 become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the source columns named above plus ``department``.
    dept_mean : dict or pandas.Series, optional
        Mapping ``department -> mean avg_training_score``. When omitted, the
        means are computed from ``df`` itself (the original behaviour). Pass
        the training frame's
        ``groupby('department')['avg_training_score'].mean()`` when
        transforming a test frame so both splits share the same reference
        statistics. Departments absent from the mapping yield NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns.
    """
    df = df.copy()

    df['kpi_and_award'] = df['KPIs_met >80%'] * df['awards_won?']

    df['rating_x_kpi'] = df['previous_year_rating'] * df['KPIs_met >80%']

    if dept_mean is None:
        # Per-row department mean derived from this frame only.
        dept_mean_per_row = df.groupby('department')['avg_training_score'].transform('mean')
    else:
        # Caller-supplied reference statistics (e.g. fitted on the train split).
        dept_mean_per_row = df['department'].map(dept_mean)
    # The +1 in the denominator guards against division by zero.
    df['score_vs_dept_mean'] = df['avg_training_score'] / (dept_mean_per_row + 1)

    # +1 so that employees with zero years of service do not divide by zero.
    df['age_per_service'] = df['age'] / (df['length_of_service'] + 1)

    df['training_effort'] = df['no_of_trainings'] * df['avg_training_score']

    bins = [0, 50, 60, 70, 80, 90, 100]
    labels = [0, 1, 2, 3, 4, 5]
    # include_lowest=True makes the first interval closed on the left, so a
    # score of exactly 0 lands in bin 0 instead of becoming NaN.
    df['score_bin'] = pd.cut(df['avg_training_score'], bins=bins, labels=labels, include_lowest=True)

    return df

print("Adding features...")
# Run the same transformation over both splits and rebind the names to the
# returned frames. Re-running this cell applies add_features() again to the
# already-augmented frames.
train_df, test_df = add_features(train_df), add_features(test_df)
print("Features added!")

Adding features...
Features added!


Handle Missing Values & Encoding

In [17]:
print("Handling missing values and encoding...")

# Imputation statistics are taken from the training data only and computed
# once, then applied to both splits so the test set is filled with the same
# values as the training set.
rating_median = train_df['previous_year_rating'].median()
education_mode = train_df['education'].mode()[0]

for frame in (train_df, test_df):
    frame['previous_year_rating'] = frame['previous_year_rating'].fillna(rating_median)
    frame['education'] = frame['education'].fillna(education_mode)

cat_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']

for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # LabelEncoder stores the sorted unique labels in classes_ and encodes
    # each label as its position there, so a plain dict reproduces
    # le.transform() without calling it once per row.
    code_of = {label: code for code, label in enumerate(le.classes_)}

    # Labels never seen in training are absent from the dict -> NaN -> -1.
    test_df[col] = test_df[col].astype(str).map(code_of).fillna(-1).astype('int64')

print("Preprocessing done!")

Handling missing values and encoding...
Preprocessing done!


Prepare Features and Scale

In [18]:
# Columns fed to the model: the raw inputs followed by the engineered ones.
features = [
    'department', 'region', 'education', 'gender', 'recruitment_channel',
    'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
    'KPIs_met >80%', 'awards_won?', 'avg_training_score',
    'kpi_and_award', 'rating_x_kpi', 'score_vs_dept_mean',
    'age_per_service', 'training_effort', 'score_bin'
]

# Take explicit copies: the scaled columns are written back below, and
# assigning into a bare column selection of train_df / test_df is a chained
# assignment (SettingWithCopyWarning, which this notebook silences globally).
X = train_df[features].copy()
y = train_df['is_promoted']
X_test = test_df[features].copy()

# Continuous columns to standardise; the remaining features are left as-is.
num_cols = ['age', 'previous_year_rating', 'length_of_service', 'avg_training_score',
            'score_vs_dept_mean', 'age_per_service', 'training_effort']

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("Features ready! X shape:", X.shape)

Features ready! X shape: (54808, 18)


Train LightGBM Model

In [19]:
print("Training LightGBM model...")

# Gradient-boosted tree classifier trained on the full training set.
lgb_model = lgb.LGBMClassifier(
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=7,
    min_child_samples=40,
    subsample=0.8,
    # LightGBM only performs row subsampling when the bagging frequency is
    # positive; with the default of 0 the subsample=0.8 setting above is
    # silently ignored. Resample on every boosting iteration.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    # Re-weight the classes; the positive class is the minority here.
    is_unbalance=True
)

lgb_model.fit(X, y)
print("Model training completed!")

Training LightGBM model...
[LightGBM] [Info] Number of positive: 4668, number of negative: 50140
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 913
[LightGBM] [Info] Number of data points in the train set: 54808, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085170 -> initscore=-2.374088
[LightGBM] [Info] Start training from score -2.374088
Model training completed!


Find Best Threshold

In [20]:
print("Finding best threshold for F1 score...")

# Out-of-fold probabilities: every training row is scored by a model that
# never saw it, so the threshold is tuned on honest predictions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_proba = np.zeros(len(y))

for train_idx, val_idx in skf.split(X, y):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    # Fit a fresh estimator with the same hyper-parameters for each fold.
    # Calling lgb_model.fit() here would overwrite the model already trained
    # on the full training set, leaving lgb_model fitted on only the last
    # fold's training portion for any later predict call.
    fold_model = lgb.LGBMClassifier(**lgb_model.get_params())
    fold_model.fit(X_tr, y_tr)
    oof_proba[val_idx] = fold_model.predict_proba(X_va)[:, 1]

# Grid-search the decision threshold that maximises F1 on the OOF scores.
# NOTE(review): the grid only covers 0.30-0.59; if the reported optimum sits
# on either edge, widen the range.
best_thresh = 0.5
best_f1 = 0
for thresh in np.arange(0.3, 0.6, 0.01):
    pred = (oof_proba >= thresh).astype(int)
    f1 = f1_score(y, pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print(f"Best threshold: {best_thresh:.3f}")
print(f"Cross-validation F1 score: {best_f1:.4f}")

Finding best threshold for F1 score...
[LightGBM] [Info] Number of positive: 3734, number of negative: 40112
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 43846, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085162 -> initscore=-2.374195
[LightGBM] [Info] Start training from score -2.374195
[LightGBM] [Info] Number of positive: 3734, number of negative: 40112
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004915 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 917
[LightGBM] [Info] Number of data points in the train set: 43846, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg

Make Final Prediction & Save Submission

In [21]:
print("Making final predictions...")

# Score the test set with lgb_model as currently fitted and binarise the
# positive-class probability at the tuned threshold.
test_proba = lgb_model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_thresh).astype(int)

# Predictions are assigned positionally.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# testfile.csv - confirm.
submission_df['is_promoted'] = test_pred
submission_df.to_csv('submission_final.csv', index=False)

print("Submission saved as 'submission_final.csv'")
print("Ready to download and submit!")

# The browser download helper only exists on Google Colab. Elsewhere the CSV
# is already on disk, so skip the download instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('submission_final.csv')

Making final predictions...
Submission saved as 'submission_final.csv'
Ready to download and submit!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>