In [1]:
import pandas as pd
import numpy as np 

import seaborn as sns 
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
# import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from scipy.stats import *

In [18]:
df_test=pd.read_csv("test.csv")
df_train = pd.read_csv('train.csv')

In [4]:
### Calculating the SUM of all the target features

### Getting all the features - that is except "Flood Probability" - target feature
feature_cols = [x for x in df_train.columns if x != "FloodProbability"]

# Create a new column 'fsum' which is the sum of all feature columns for each row
df_train['fsum'] = df_train[feature_cols].sum(axis=1)

In [5]:
df_train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability,fsum
0,0,5,8,5,8,6,4,4,3,3,...,3,3,5,4,7,5,7,3,0.445,94
1,1,6,7,4,4,8,8,3,5,4,...,2,0,3,5,3,3,4,3,0.45,95
2,2,6,5,6,7,3,7,1,5,4,...,3,7,5,6,8,2,3,3,0.53,101
3,3,3,4,6,5,4,8,4,7,6,...,4,7,4,4,6,5,7,5,0.535,107
4,4,5,3,2,6,4,4,3,3,3,...,2,6,6,4,1,2,3,5,0.415,76


In [6]:
X = df_train.loc[:, df_train.columns != "FloodProbability"]
y = df_train['FloodProbability']

# Features Scaling using the Standard Scaler
# scaler = QuantileTransformer()
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [8]:
### Using Liner Regression model


lr = LinearRegression()

### fitting the model
lr.fit(X_train, y_train)

# making the predections
lr_pred = lr.predict(X_test)

### Evaluate the model
r2_score(lr_pred,y_test)

0.8162489547658828

In [9]:
polynomial = Pipeline([
    ('polynomial', PolynomialFeatures(include_bias=False, degree=2)),
    ('model', LinearRegression())
])

### Fit the model
polynomial.fit(X_train, y_train)
poly2_pred = polynomial.predict(X_test)

### Calculating R^2 score
r2_poly2 = r2_score(y_test, poly2_pred)
print(r2_poly2)

0.8452791832341942


In [10]:
sgd_regressor = SGDRegressor(random_state=42)


### Define hyperparameters for fiting the model
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.0001, 0.00005],
    'learning_rate': ['constant', 'optimal', 'adaptive'],
    'max_iter': [1000, 2000],
    'eta0': [1, 10, 100]
}
    
### Grid Search Cross Validation- find the best hyperparameters    
grid = GridSearchCV(estimator=sgd_regressor, param_grid=param_grid, n_jobs=-1, cv=3)
grid.fit(X_train, y_train)

### Get the best model from Grid Search
sgd_model = grid.best_estimator_

### Fit the model
sgd_model.fit(X_train, y_train)

#### Make predictions
sgd_pred = sgd_model.predict(X_test)

### Evaluate the model
r2_sgd = r2_score(y_test, sgd_pred)
print("Best parameters found:", grid.best_params_)
print("Best R^2 score:", r2_sgd)

Best parameters found: {'alpha': 0.0001, 'eta0': 100, 'learning_rate': 'adaptive', 'max_iter': 1000, 'penalty': 'l2'}
Best R^2 score: 0.8447353322542847


In [12]:
import xgboost as xgb

xgb_model_new = xgb.XGBRegressor(alpha=0.01, eta=0.1, reg_lambda=1, max_depth=7, min_child_weight=7)

param_grid_XGBn = {
    'n_estimators': [50, 100, 150, 500]
}

xgb_search_new = GridSearchCV(estimator=xgb_model_new, param_grid=param_grid_XGBn, cv=3, verbose=2)
xgb_search_new.fit(X_train, y_train)


xgb_regressor = xgb_search_new.best_estimator_


xgb_regressor.fit(X_train, y_train)


xgb_pred = xgb_regressor.predict(X_test)


r2_xgb = r2_score(y_test, xgb_pred)
print("Best parameters found:", xgb_search_new.best_params_)
print("Best R^2 score:", r2_xgb)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ....................................n_estimators=50; total time=   2.9s
[CV] END ....................................n_estimators=50; total time=   2.8s
[CV] END ....................................n_estimators=50; total time=   2.7s
[CV] END ...................................n_estimators=100; total time=   4.0s
[CV] END ...................................n_estimators=100; total time=   4.2s
[CV] END ...................................n_estimators=100; total time=   4.1s
[CV] END ...................................n_estimators=150; total time=   5.8s
[CV] END ...................................n_estimators=150; total time=   5.8s
[CV] END ...................................n_estimators=150; total time=   6.1s
[CV] END ...................................n_estimators=500; total time=  18.1s
[CV] END ...................................n_estimators=500; total time=  18.8s
[CV] END ...................................n_est

In [13]:
estimators = [('Polynomial Regression', polynomial), 
                  ('SGDRegressor', sgd_model), ('XGBoostRegressor', xgb_regressor)]

In [14]:
from sklearn.ensemble import StackingRegressor


SC = StackingRegressor(estimators=estimators, final_estimator= LinearRegression())

SC = SC.fit(X_train, y_train)


# Evaluate
ensemble_preds = SC.predict(X_test)  

In [15]:
r2_ensemble = r2_score(y_test, ensemble_preds)
print(r2_ensemble)

0.8453854724742271


In [16]:
# fiting the best performace model in overall dataset
SC = SC.fit(X, y)

In [19]:
test_ids = df_test['id']

# removing the ids for the dataset
# df_test.drop(columns=['id'], inplace=True)

# Create a new column 'fsum' which is the sum of all feature columns for each row
df_test['fsum'] = df_test.sum(axis=1) 

# Scale data
df_test = scaler.transform(df_test)

In [20]:
predictions_test = SC.predict(df_test[0].reshape(1,-1))
if predictions_test > 0.55:
    print("Flood Risk Zone - Send alert to people who are in risk zone")
else:
    print("Do nothing")

Flood Risk Zone - Send alert to people who are in risk zone


In [21]:
import pickle

with open("sc_model_new.pkl", "wb") as file:
    pickle.dump(SC, file)

print("Model saved as 'sc_model_new.pkl'")

Model saved as 'sc_model_new.pkl'


In [22]:
with open("sc_model_new.pkl", "rb") as file:
    loaded_model = pickle.load(file)

# Use the loaded model for prediction
prediction = loaded_model.predict(df_test)
print("Prediction from loaded model:", prediction)

Prediction from loaded model: [0.57562389 0.45568986 0.45522268 ... 0.62728772 0.551006   0.5097893 ]
