In [144]:
# %% Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score,average_precision_score, recall_score, f1_score, roc_auc_score, log_loss, confusion_matrix, classification_report,roc_curve, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from scipy.stats import uniform, randint
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import BorderlineSMOTE,ADASYN
import optuna

In [145]:
# %% Load Data
df = pd.read_csv("train.csv", index_col="id")
df_test = pd.read_csv("test.csv", index_col="id")

In [146]:
# %% Separate Features & Target
X = df.drop(columns="rainfall")
y = df["rainfall"]

In [147]:
# %% Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [148]:
# Apply SMOTE oversampling
smote = BorderlineSMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [149]:
# Apply ADASYN oversampling
adasyn = ADASYN(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_resampled, y_train_resampled)

In [150]:
y_train_resampled.value_counts()

rainfall
1    1320
0    1320
Name: count, dtype: int64

In [151]:
# %% Standard Scaling (Only for SVM & Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)
df_test_scaled = scaler.transform(df_test)


In [152]:
# %% Hyperparameter grids
param_grids = {
    "SVM": {
        "C": uniform(0.1, 10),
        "kernel": ["linear", "rbf", "poly"],
        "gamma": ["scale", "auto"],
    },
    "Random Forest": {
        "n_estimators": randint(100, 1000),
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": randint(2, 10),
        "min_samples_leaf": randint(1, 10),
        "class_weight": ["balanced", None],
    },
    "Gradient Boosting": {
        "n_estimators": randint(50, 500),
        "learning_rate": uniform(0.01, 0.2),
        "max_depth": [3, 5, 10, 20],
        "subsample": uniform(0.5, 0.5),
    },
    "Logistic Regression": {
        "C": uniform(0.01, 10),
        "solver": ["lbfgs", "liblinear"],
        "class_weight": ["balanced", None],
    },
    "XGBoost": {
        "n_estimators": randint(50, 500),
        "learning_rate": uniform(0.01, 0.2),
        "max_depth": randint(3, 10),
        "subsample": uniform(0.5, 0.5),
        "colsample_bytree": uniform(0.5, 0.5),
    },
    "LightGBM": {
        "n_estimators": randint(50, 500),
        "learning_rate": uniform(0.01, 0.2),
        "max_depth": [-1, 5, 10, 20],
        "num_leaves": randint(10, 100),
    },
    "CatBoost": {
        "iterations": randint(50, 500),
        "learning_rate": uniform(0.01, 0.2),
        "depth": randint(3, 10),
    },
}

In [153]:
# Initialize models
models = {
    "SVM": SVC(probability=True, random_state=42, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
}



In [154]:
# Perform hyperparameter tuning
best_models = {}
model_scores = []

for model_name, model in models.items():
    print(f"Tuning {model_name}...")

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grids[model_name],
        n_iter=100,  # Increase for better optimization
        scoring='roc_auc_ovo_weighted',
        cv=5,
        n_jobs=-1,
        random_state=42
    )
    
    search.fit(X_train_scaled, y_train_resampled)
    best_model = search.best_estimator_
    best_models[model_name] = best_model

    # Evaluate on validation set
    y_pred = best_model.predict(X_val_scaled)
    y_pred_proba = best_model.predict_proba(X_val_scaled)[:, 1]

    f1 = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_pred_proba)

    model_scores.append({"Model": model_name, "F1 Score": f1, "ROC-AUC": roc_auc})

    print(f"Model: {model_name}")
    print(f"F1 Score: {f1}")
    print(f"ROC-AUC Score: {roc_auc}")
    print("------------------------")
    
    


Tuning SVM...
Model: SVM
F1 Score: 0.8566929133858268
ROC-AUC Score: 0.8251964085297419
------------------------
Tuning Random Forest...
Model: Random Forest
F1 Score: 0.8932926829268293
ROC-AUC Score: 0.8659231200897867
------------------------
Tuning Gradient Boosting...
Model: Gradient Boosting
F1 Score: 0.8962406015037594
ROC-AUC Score: 0.8647586980920314
------------------------
Tuning Logistic Regression...
Model: Logistic Regression
F1 Score: 0.8631239935587761
ROC-AUC Score: 0.8790404040404041
------------------------
Tuning XGBoost...
Model: XGBoost
F1 Score: 0.8892261001517451
ROC-AUC Score: 0.8494107744107744
------------------------
Tuning LightGBM...
[LightGBM] [Info] Number of positive: 1320, number of negative: 1320
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2778
[LightGBM] [Info] Number of data points in the train set: 2640,



Model: LightGBM
F1 Score: 0.8986384266263238
ROC-AUC Score: 0.8654320987654321
------------------------
Tuning CatBoost...
Model: CatBoost
F1 Score: 0.8813559322033898
ROC-AUC Score: 0.862598204264871
------------------------


In [155]:
pd.DataFrame(model_scores).sort_values(by='ROC-AUC', ascending=False)

Unnamed: 0,Model,F1 Score,ROC-AUC
3,Logistic Regression,0.863124,0.87904
1,Random Forest,0.893293,0.865923
5,LightGBM,0.898638,0.865432
2,Gradient Boosting,0.896241,0.864759
6,CatBoost,0.881356,0.862598
4,XGBoost,0.889226,0.849411
0,SVM,0.856693,0.825196


In [156]:
for model_name, model in best_models.items():
    print(f"Parameters for {model_name}:")
    print(model.get_params())
    print("\n")

Parameters for SVM:
{'C': 9.588855372533333, 'break_ties': False, 'cache_size': 200, 'class_weight': 'balanced', 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': -1, 'probability': True, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Parameters for Random Forest:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 223, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


Parameters for Gradient Boosting:
{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.11850804611097986, 'loss': 'log_loss', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decreas

In [157]:
for model_name, model in best_models.items():
    print(f"Optimizing threshold for {model_name}...")
    y_val_pred_proba = model.predict_proba(X_val_scaled)[:,1]
    # Calculate precision and recall
    precision, recall, pr_thresholds = precision_recall_curve(y_val, y_val_pred_proba)
    log_loss_val = log_loss(y_val, y_val_pred_proba)
    auprc = average_precision_score(y_val, y_val_pred_proba)
    
    # Calculate F1 score
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
    
    # Find optimal threshold
    optimal_threshold = pr_thresholds[np.argmax(f1)]
    print(f"Optimal threshold for F1 score: {optimal_threshold}")
    
    # Predict with optimized threshold
    y_pred = (y_val_pred_proba >= optimal_threshold).astype(int)
    
    # Evaluate with optimized threshold
    f1_optimized = f1_score(y_val, y_pred)
    log_loss_optimized = log_loss(y_val, y_pred)
    auprc_optimized = average_precision_score(y_val, y_pred)
    print(f"Optimized F1 Score: {f1_optimized}")
    print(f"Optimized Log Loss: {log_loss_optimized}")
    print(f"Optimized AU PRC: {auprc_optimized}")

    print( confusion_matrix(y_val, y_pred))
    print("------------------------")

    
    # print(f"{model_name}", classification_report(y_val,y_val_pred))
    
    # print(roc_auc_score(y_val,y_val_pred))



Optimizing threshold for SVM...
Optimal threshold for F1 score: 0.13008059378266365
Optimized F1 Score: 0.8879668049792531
Optimized Log Loss: 6.665607133603857
Optimized AU PRC: 0.8150656412498931
[[ 36  72]
 [  9 321]]
------------------------
Optimizing threshold for Random Forest...
Optimal threshold for F1 score: 0.38565022421524664
Optimized F1 Score: 0.9104258443465492
Optimized Log Loss: 5.019778211726361
Optimized AU PRC: 0.8753262634541175
[[ 67  41]
 [ 20 310]]
------------------------
Optimizing threshold for Gradient Boosting...
Optimal threshold for F1 score: 0.07182690313570068
Optimized F1 Score: 0.9120234604105572
Optimized Log Loss: 4.937486765632487
Optimized AU PRC: 0.8760322323483906
[[ 67  41]
 [ 19 311]]
------------------------
Optimizing threshold for Logistic Regression...
Optimal threshold for F1 score: 0.19422228923104318
Optimized F1 Score: 0.9154518950437318
Optimized Log Loss: 4.772903873444737
Optimized AU PRC: 0.8757874263646159
[[ 66  42]
 [ 16 314]]
-



In [None]:
# %% Make predictions on test set with optimized thresholds
test_predictions = {}
for model_name, model in best_models.items():
    y_pred_proba = model.predict(df_test_scaled)[:, 1]
    y_pred = (y_pred_proba >= optimal_threshold).astype(int)  # Use the optimal threshold determined earlier
    test_predictions[model_name] = y_pred



# predictions={}

# for model_name, model in best_models.items():
#     y_test_pred = model.predict(df_test_scaled)
#     df_test[f"Rainfall_{model_name}"] = y_test_pred
#     predictions[model_name] = y_test_pred

# df_test




In [159]:
# %% Save predictions to submission files
for model_name, predictions in test_predictions.items():
    submission_df = pd.DataFrame({"id": df_test.index, "rainfall": predictions})
    submission_df.to_csv(f"submission_{model_name}.csv", index=False)

In [160]:
# df_test[["Rainfall_Logistic Regression"]].to_csv("lr_prediction.csv")
# df_test[["Rainfall_Random Forest"]].to_csv("rf_prediction.csv")
# df_test[["Rainfall_CatBoost"]].to_csv("cat_prediction.csv")
# df_test[["Rainfall_XGBoost"]].to_csv("xgb_prediction.csv")
# df_test[["Rainfall_SVM"]].to_csv("svm_prediction.csv")
# df_test[["Rainfall_LightGBM"]].to_csv("lgbm_prediction.csv")
# df_test[["Rainfall_Gradient Boosting"]].to_csv("gbm_prediction.csv")