In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report



In [None]:
# Load the competition training set from the Kaggle input directory.
file_path = "/kaggle/input/playground-series-s5e3/train.csv"
df = pd.read_csv(file_path)

# Preview the first rows only rather than echoing the whole frame as cell output.
df.head()

In [None]:
# 1️⃣ Count the null entries in every column and print the per-column totals
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column)

In [None]:
# 2️⃣ Remove the 'id' and 'day' columns, which this notebook does not use as predictors
columns_to_drop = ["id", "day"]
df = df.drop(columns_to_drop, axis=1)


In [None]:

# 4️⃣ Separate the target column from the predictor columns
y = df["rainfall"]               # target, treated as a binary label by the models below
X = df.drop("rainfall", axis=1)  # every remaining column is a feature



In [None]:
# 5️⃣ Hold out 20% of the rows for testing; stratifying on y keeps the class
#    proportions the same in both parts, and the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts



In [None]:
# 6️⃣ Feature scaling (standardization) applied to every column of X.
#    The scaler learns its statistics from the training split only and is then
#    applied unchanged to both splits, so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

# ✅ Summarize the prepared splits: array shapes and per-class label counts
train_shape, test_shape = X_train.shape, X_test.shape
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()

print("X_train shape:", train_shape)
print("X_test shape:", test_shape)
print("y_train distribution:\n", train_class_counts)
print("y_test distribution:\n", test_class_counts)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Self-contained baseline: reload the data, rebuild the split, and compare three
# classifiers with default-ish settings on the held-out 20%.

# Load dataset
file_path = "/kaggle/input/playground-series-s5e3/train.csv"
df = pd.read_csv(file_path)

# Drop the columns that are not used as predictors
df = df.drop(columns=["id", "day"])

# Define features (X) and target (y)
X = df.drop(columns=["rainfall"])  # Features
y = df["rainfall"]  # Target variable (0 or 1)

# Stratified train-test split with a fixed seed so results are repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling: statistics come from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# ✅ Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# ✅ XGBoost Model
# `use_label_encoder` is intentionally not passed: the argument is deprecated in
# XGBoost and current releases only warn that it is unused.
xgb_model = XGBClassifier(eval_metric="logloss")
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# 🎯 Model Evaluation
# One loop instead of three copy-pasted print groups; the headers (including the
# leading blank line on the second and third) match the original output exactly.
baseline_results = [
    ("📌 Logistic Regression Performance:", y_pred_log),
    ("\n📌 Random Forest Performance:", y_pred_rf),
    ("\n📌 XGBoost Performance:", y_pred_xgb),
]
for header, predictions in baseline_results:
    print(header)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))


In [None]:

def _tune_and_report(estimator, param_grid, title):
    """Grid-search `estimator`, evaluate the winner on the test split, and print a report.

    Reads the notebook-level `X_train`, `y_train`, `X_test` and `y_test`.

    Parameters
    ----------
    estimator : scikit-learn compatible classifier to tune.
    param_grid : dict mapping parameter names to the candidate values to try.
    title : header line printed before the report.

    Returns
    -------
    tuple of (fitted GridSearchCV, best estimator, test-set predictions).
    """
    # 5-fold CV scored on accuracy; n_jobs=-1 uses all available cores.
    search = GridSearchCV(estimator, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2)
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    predictions = best_model.predict(X_test)

    print(title)
    print("Best Parameters:", search.best_params_)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    return search, best_model, predictions


# ✅ Hyperparameter tuning for Random Forest
rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_rf, best_rf, y_pred_rf = _tune_and_report(rf, rf_params, "\n📌 Best Random Forest Model:")


# ✅ Hyperparameter tuning for XGBoost
xgb_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

# `use_label_encoder` is intentionally not passed: the argument is deprecated in
# XGBoost and current releases only warn that it is unused.
xgb = XGBClassifier(eval_metric="logloss")
grid_xgb, best_xgb, y_pred_xgb = _tune_and_report(xgb, xgb_params, "\n📌 Best XGBoost Model:")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each feature name with its importance score from the tuned XGBoost model
# and order the rows from most to least important
feature_names = X.columns
feature_importance = best_xgb.feature_importances_
importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)

# Horizontal bar chart drawn on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=importance_df, palette="viridis", ax=ax)
ax.set_title("Feature Importance (XGBoost)")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
plt.show()


The best model is XGBoost with the following results:

Accuracy: 87.44%
Precision (Class 1): 0.89, Recall (Class 1): 0.95, F1-Score (Class 1): 0.92
Precision (Class 0): 0.80, Recall (Class 0): 0.65, F1-Score (Class 0): 0.72
Weighted Avg. F1-Score: 0.87

In [None]:
from xgboost import XGBClassifier

# Negative-to-positive label ratio in the training split; passed to XGBoost as
# scale_pos_weight to handle the class imbalance
positive_count = sum(y_train)
negative_count = len(y_train) - positive_count

# XGBoost classifier with fixed hyperparameters plus the imbalance weight
best_xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=negative_count / positive_count,
)

# Fit on the scaled training split (the fitted estimator is shown as the cell output)
best_xgb_model.fit(X_train, y_train)


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values explored for each XGBoost hyperparameter
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[6, 7, 8],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive 3-fold search over the grid using a default-configured classifier;
# with no `scoring` argument the estimator's own score method ranks candidates
base_estimator = XGBClassifier()
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning configuration; this rebinds `best_xgb_model`
best_xgb_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


In [None]:
from xgboost import XGBClassifier

# Fit an XGBoost classifier with library-default hyperparameters on the
# training split (X_train / y_train must already exist in the kernel)
model = XGBClassifier().fit(X_train, y_train)
model


In [None]:
# predict_proba returns one column per class; keep the second column, i.e. the
# probability for Class 1 (rain)
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# End-to-end cell: train XGBoost on the training data, report hold-out
# performance, then score the Kaggle test set and write the submission file.

# Load train dataset
train_file_path = "/kaggle/input/playground-series-s5e3/train.csv"
df = pd.read_csv(train_file_path)

# Drop the columns that are not used as predictors
df = df.drop(columns=["id", "day"])

# Define features (X) and target (y)
X = df.drop(columns=["rainfall"])  # Features
y = df["rainfall"]  # Target variable (0 or 1)

# Stratified train-test split with a fixed seed so results are repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling: statistics come from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ✅ Train XGBoost Model
# `use_label_encoder` is intentionally not passed: the argument is deprecated in
# XGBoost and current releases only warn that it is unused.
xgb_model = XGBClassifier(eval_metric="logloss")
xgb_model.fit(X_train, y_train)

# 🎯 Model Evaluation on the held-out split
y_pred_xgb = xgb_model.predict(X_test)
print("\n📌 XGBoost Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

# ✅ Load Kaggle Test Dataset
test_file_path = "/kaggle/input/playground-series-s5e3/test.csv"
test_data = pd.read_csv(test_file_path)

# Keep the ids for the submission, then drop the same columns as for training
test_ids = test_data["id"]
test_data = test_data.drop(columns=["id", "day"])

# Apply the scaler that was fitted on the training split
X_kaggle_test = scaler.transform(test_data)

# Predict probabilities for Kaggle test set
y_prob = xgb_model.predict_proba(X_kaggle_test)[:, 1]  # Probability of rainfall (Class 1)

# ✅ Create Kaggle Submission File
# The prediction column must carry the target's name ("rainfall"), matching the
# competition's `id,rainfall` submission format; any other header is not scored.
submission = pd.DataFrame({
    "id": test_ids,
    "rainfall": y_prob
})

# Save as CSV
submission.to_csv("submission.csv", index=False)
print("\n✅ Submission file saved as 'submission.csv'. Ready for Kaggle upload!")
