In [44]:
pip install optuna



In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import optuna
import xgboost as xgb


In [46]:
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

In [47]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2190 non-null   int64  
 1   day            2190 non-null   int64  
 2   pressure       2190 non-null   float64
 3   maxtemp        2190 non-null   float64
 4   temparature    2190 non-null   float64
 5   mintemp        2190 non-null   float64
 6   dewpoint       2190 non-null   float64
 7   humidity       2190 non-null   float64
 8   cloud          2190 non-null   float64
 9   sunshine       2190 non-null   float64
 10  winddirection  2190 non-null   float64
 11  windspeed      2190 non-null   float64
 12  rainfall       2190 non-null   int64  
dtypes: float64(10), int64(3)
memory usage: 222.6 KB


In [48]:
train = train.drop(columns=['id','day'])

In [49]:
print("\nBasic Statistics:\n", train.describe())


Basic Statistics:
           pressure      maxtemp  temparature      mintemp     dewpoint  \
count  2190.000000  2190.000000  2190.000000  2190.000000  2190.000000   
mean   1013.602146    26.365799    23.953059    22.170091    20.454566   
std       5.655366     5.654330     5.222410     5.059120     5.288406   
min     999.000000    10.400000     7.400000     4.000000    -0.300000   
25%    1008.600000    21.300000    19.300000    17.700000    16.800000   
50%    1013.000000    27.800000    25.500000    23.850000    22.150000   
75%    1017.775000    31.200000    28.400000    26.400000    25.000000   
max    1034.600000    36.000000    31.500000    29.800000    26.700000   

          humidity        cloud     sunshine  winddirection    windspeed  \
count  2190.000000  2190.000000  2190.000000    2190.000000  2190.000000   
mean     82.036530    75.721918     3.744429     104.863151    21.804703   
std       7.800654    18.026498     3.626327      80.002416     9.898659   
min      

In [50]:
duplicate_rows = train.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")



Number of duplicate rows: 0


In [51]:
# def remove_outliers(data, columns, threshold=1.5):
#     Q1 = data[columns].quantile(0.25)
#     Q3 = data[columns].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - threshold * IQR
#     upper_bound = Q3 + threshold * IQR
#     return data[~((data[columns] < lower_bound) | (data[columns] > upper_bound)).any(axis=1)]

# numerical_features = ["pressure", "maxtemp", "temparature", "mintemp", "dewpoint",
#                       "humidity", "cloud", "sunshine", "winddirection", "windspeed"]

# df_cleaned = remove_outliers(train, numerical_features)


In [52]:
train["temp_range"] = train["maxtemp"] - train["mintemp"]
train["humidity_sun_ratio"] = train["humidity"] / (train["sunshine"] + 1)
train["wind_effect"] = train["windspeed"] * np.cos(np.radians(train["winddirection"]))

In [53]:
X = train.drop(columns=["rainfall"])
y = train["rainfall"]

In [54]:
scaler = StandardScaler()
train = scaler.fit_transform(train)


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [56]:
"""Objective function for Optuna to optimize XGBClassifier"""
def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators",100,1000,step=50),
        "learning_rate":trial.suggest_loguniform("learning_rate",0.01,0.3),
        "max_depth": trial.suggest_int("max_depth",3,15),
        "min_child_weight": trial.suggest_int("min_child_weight",1,10),
        "subsample": trial.suggest_float("subsample",0.5,1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "lambda": trial.suggest_float("lambda", 0, 5),
        "alpha": trial.suggest_float("alpha", 0, 5),
        "objective": "binary:logistic",  # Use 'multi:softmax' for multi-class classification
        "eval_metric": "logloss",
        "use_label_encoder": False
    }

    # Train XGBClassifier with suggested parameters
    model = xgb.XGBClassifier(**params, random_state=42)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)

    return 1-accuracy

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

print(f"Best parameters:{study.best_params}")

[I 2025-03-25 21:15:34,930] A new study created in memory with name: no-name-19d2236e-91f0-4856-abc4-560b4a6301d8
  "learning_rate":trial.suggest_loguniform("learning_rate",0.01,0.3),
Parameters: { "use_label_encoder" } are not used.

[I 2025-03-25 21:15:36,078] Trial 0 finished with value: 0.12557077625570778 and parameters: {'n_estimators': 900, 'learning_rate': 0.08262266417815065, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 0.6417345953963853, 'colsample_bytree': 0.6542763836017963, 'gamma': 2.101665564978022, 'lambda': 1.3337306733558396, 'alpha': 3.719635038979342}. Best is trial 0 with value: 0.12557077625570778.
  "learning_rate":trial.suggest_loguniform("learning_rate",0.01,0.3),
Parameters: { "use_label_encoder" } are not used.

[I 2025-03-25 21:15:36,884] Trial 1 finished with value: 0.13013698630136983 and parameters: {'n_estimators': 900, 'learning_rate': 0.03215604655165262, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.9841157738651412, 'colsample_bytree'

Best parameters:{'n_estimators': 600, 'learning_rate': 0.023853922382295195, 'max_depth': 11, 'min_child_weight': 3, 'subsample': 0.6225689297105262, 'colsample_bytree': 0.9802911158664088, 'gamma': 3.3938318237979286, 'lambda': 2.4313788887623007, 'alpha': 4.771193751884831}


In [57]:
best_params = study.best_params

final_model = xgb.XGBClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

# Make predictions
y_pred = final_model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")

Final Model Accuracy: 0.8858


In [58]:
test.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [59]:
#test = remove_outliers(test, numerical_features)

test["temp_range"] = test["maxtemp"] - test["mintemp"]
test["humidity_sun_ratio"] = test["humidity"] / (test["sunshine"] + 1)
test["wind_effect"] =test["windspeed"] * np.cos(np.radians(test["winddirection"]))

In [60]:
test_id = test['id']
test = test.drop(columns=['id','day'])
prediction = final_model.predict(test)

In [61]:
prob_class_1 = final_model.predict_proba(test)[:, 1]

In [62]:
submission = pd.DataFrame({
    "id": test_id,
    "rainfall": prob_class_1
})

# Save submission file
submission.to_csv("submission.csv", index=False)