In [20]:
%cd /teamspace/studios/this_studio/2024/07/flood_prediction_notebook/flood_prediciton_code

/teamspace/studios/this_studio/2024/07/flood_prediction_notebook/flood_prediciton_code


In [21]:
# Basic Libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import mlflow
import gc
from warnings import filterwarnings
filterwarnings('ignore')

# Models Libs
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn import svm as svm
import lightgbm as lgbm
from lightgbm import LGBMRegressor

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

# Metric
from sklearn.metrics import r2_score

# Mlflow
import mlflow

In [22]:
TEST_SIZE = 0.2
DROP_COLUMNS = ['id']
CV = 3
RANDOM_STATE = 42
ORIGINAL_COLS = ['id','MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability']

# GridSearchCV Params
PARAMS_LINEAR_REGRESSION = {}

PARAMS_XGB = {}

PARAMS_RANDOM_FOREST_REGRESSION = {
    'n_estimators':[100,200], 
    # 'criterion':['squared_error','absolute_error','friedman_mse','poisson']
}

PARAMS_LASSO = {
    'alpha':[0.1,0.01], 
    'max_iter':[100,500]
}

PARAMS_RIDGE = {
    'alpha':[0.1,0.01], 
    'max_iter':[100,500]
}

PARAMS_SVM = {
    'C': [1, 10], 
    'kernel': ['linear', 'rbf']
}

PARAMS_DEFAULT = {}

In [23]:
raw_train = pd.read_csv('../data/train.csv')
raw_valid= pd.read_csv('../data/test.csv')

In [24]:
def process_data(train_data,func_num):
    fun_dict = {
        1: create_features_1,
        2: create_features_2,
        3: create_features_3,
    }
    feature_func = fun_dict.get(func_num)
    df = train_data.copy()
    df = feature_func(df)
    # Drop Columns
    df.drop(columns=DROP_COLUMNS,axis=1,inplace=True)
    return df

def evaluate_model(model_name):
    # Linear Regression Model
    if model_name=='linear_regression':
        model =  GridSearchCV(
            estimator=LinearRegression(),
            param_grid=PARAMS_LINEAR_REGRESSION,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
    
    # LASSO
    if model_name=='lasso':
        model =  GridSearchCV(
            estimator=Lasso(),
            param_grid=PARAMS_LASSO,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)

    # SVM Model
    if model_name=='svm':
        model =  GridSearchCV(
            estimator=svm.SVR(),
            param_grid=PARAMS_SVM,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')

    # XGBoost
    if model_name=='xgb':
        model =  GridSearchCV(
            estimator=xgb.XGBRegressor(),
            param_grid=PARAMS_XGB,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')

    # LightGBM
    if model_name=='lgbm':
        model =  GridSearchCV(
            estimator=LGBMRegressor(),
            param_grid=PARAMS_DEFAULT,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')

    # CatBoost
    if model_name=='catboost':
        model =  GridSearchCV(
            estimator=CatBoostRegressor(),
            param_grid=PARAMS_DEFAULT,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')
    
    # Ensemble
    if model_name=='ensemble':
        model_cat =  GridSearchCV(
            estimator=CatBoostRegressor(),
            param_grid=PARAMS_DEFAULT,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model_cat.fit(X_train,y_train)
        # Predict on Test
        y_pred_cat = model_cat.predict(X_test)

        model_xgb =  GridSearchCV(
            estimator=xgb.XGBRegressor(),
            param_grid=PARAMS_XGB,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model_xgb.fit(X_train,y_train)
        # Predict on Test
        y_pred_xgb = model_xgb.predict(X_test)

        y_pred_final = (y_pred_xgb + y_pred_cat) / 2

        model = model_xgb
        # Score 
        score = r2_score(y_test, y_pred_final)
        print(f'Finished training model {model_name}')
    
    # 
    # Return
    return model_name,score,model.best_estimator_,model.best_params_

def run_experiment(model_name,exp_desc):
    mlflow.set_tracking_uri('http://127.0.0.1:5000')
    mlflow.set_experiment(model_name)
    with mlflow.start_run():
        print('Model Name : ',model_name)
        model_name,score,best_model,best_param = evaluate_model(model_name)
        mlflow.log_param('drop_columns', DROP_COLUMNS)
        mlflow.log_param('model_name',model_name)
        mlflow.log_param('desc',exp_desc)
        mlflow.log_params(best_param)
        mlflow.log_param('cv',CV)
        mlflow.log_param('random_state',RANDOM_STATE)
        mlflow.log_param('features', str(list(valid_X_scaled.columns)))
        mlflow.log_metric('r2',score)
        mlflow.sklearn.log_model(best_model,model_name)
        # mlflow.log_artifact('transformed_data.csv')
    return best_model

def get_submission_csv(model):
    predictions = model.predict(valid_X_scaled)
    sub_df = raw_valid[['id']]
    sub_df['FloodProbability'] = predictions
    sub_df.to_csv('../data/submission.csv',index=False)


In [25]:
def create_features_1(data):
    df = data.copy()
    cols = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']
    df['ClimateImpact'] = df.MonsoonIntensity+df.ClimateChange
    df['AnthropogenicPressure'] = df.Deforestation+df.Urbanization+df.AgriculturalPractices+df.Encroachments
    df['InfrastructureQuality'] = df.DamsQuality+df.DrainageSystems+df.DeterioratingInfrastructure
    df['CoastalVulnerabilityTotal'] = df.CoastalVulnerability+df.Landslides
    df['PreventiveMeasuresEfficiency'] = df.RiverManagement+df.IneffectiveDisasterPreparedness+df.InadequatePlanning
    df['EcosystemImpact'] = df.WetlandLoss+df.Watersheds
    df['SocioPoliticalContext'] = df.PopulationScore+df.PoliticalFactors
    df['Land_Use_Pressure'] = df.Urbanization+df.Deforestation+df.AgriculturalPractices
    df['Environmental_Degradation']=df.Deforestation+df.Siltation+df.WetlandLoss+df.Landslides
    df['Infrastructure_Vulnerability'] = df.DeterioratingInfrastructure+df.InadequatePlanning
    df['Community_Preparednessg']= df.IneffectiveDisasterPreparedness+df.PoliticalFactors
    df['Population_Density_Vulnerable_Areas']= df.PopulationScore+df.CoastalVulnerability
    df['Climate_Change_Impact']= df.ClimateChange+df.MonsoonIntensity
    df['River_Health']= df.RiverManagement+df.DamsQuality
    df['ClimateImpact'] = df.MonsoonIntensity+df.ClimateChange
    df['AnthropogenicPressure'] = df.Deforestation+df.Urbanization+df.AgriculturalPractices+df.Encroachments
    df['InfrastructureQuality'] = df.DamsQuality+df.DrainageSystems+df.DeterioratingInfrastructure
    df['CoastalVulnerabilityTotal'] = df.CoastalVulnerability+df.Landslides
    df['PreventiveMeasuresEfficiency'] = df.RiverManagement+df.IneffectiveDisasterPreparedness+df.InadequatePlanning
    df['EcosystemImpact'] = df.WetlandLoss+df.Watersheds
    df['SocioPoliticalContext'] = df.PopulationScore+df.PoliticalFactors
    df['Land_Use_Pressure']=df.Urbanization+df.Deforestation+df.AgriculturalPractices
    df['Environmental_Degradation']=df.Deforestation+df.Siltation+df.WetlandLoss+df.Landslides
    df['Infrastructure_Vulnerability'] = df.DeterioratingInfrastructure+df.InadequatePlanning
    df['Community_Preparednessg']= df.IneffectiveDisasterPreparedness+df.PoliticalFactors
    df['Population_Density_Vulnerable_Areas']= df.PopulationScore+df.CoastalVulnerability
    df['Climate_Change_Impact']= df.ClimateChange+df.MonsoonIntensity
    df['River_Health']= df.RiverManagement+df.DamsQuality
    df['sum']= df.sum(axis=1)
    df['mean']= df.mean(axis=1)
    df['std'] = df.std(axis=1)
    df['max'] = df.max(axis=1)
    df['min'] = df.min(axis=1)
    df['var'] = df.var(axis=1)
    df['skew'] = df.skew(axis=1)
    df['kurt'] = df.kurt(axis=1)
    df['meadian'] = df.median(axis=1)
    df['quant_25'] = df.quantile(0.25,axis=1)
    df['quant_75'] = df.quantile(0.75,axis=1)
    df['sum>72'] = np.where(df['sum']>72,1,0)
    df['sum>100'] = np.where(df['sum']>100,1,0)
    df['sum>50'] = np.where(df['sum']>50,1,0)
    df['range']= df['max']-df['min']
    for col in cols:
        df[f"{col}_2"]= df[col]**2
        df[f"{col}_3"]= df[col]**3
        df[f"{col}_3"]= df[col]**4
    for col in cols:
        if col not in ['id','FloodProbability']:
            df[f"mad_{col}"] = df[col] - df[col].median()
            df[f"mean_{col}"] = df[col] - df[col].mean()
            df[f"std_{col}"] = df[col] - df[col].std()
    return df

def create_features_2(data):
    df = data.copy()
    cols = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']
    df['ClimateImpact'] = df.MonsoonIntensity+df.ClimateChange
    df['AnthropogenicPressure'] = df.Deforestation+df.Urbanization+df.AgriculturalPractices+df.Encroachments
    df['InfrastructureQuality'] = df.DamsQuality+df.DrainageSystems+df.DeterioratingInfrastructure
    df['CoastalVulnerabilityTotal'] = df.CoastalVulnerability+df.Landslides
    df['PreventiveMeasuresEfficiency'] = df.RiverManagement+df.IneffectiveDisasterPreparedness+df.InadequatePlanning
    df['EcosystemImpact'] = df.WetlandLoss+df.Watersheds
    df['SocioPoliticalContext'] = df.PopulationScore+df.PoliticalFactors
    df['Land_Use_Pressure'] = df.Urbanization+df.Deforestation+df.AgriculturalPractices
    df['Environmental_Degradation']=df.Deforestation+df.Siltation+df.WetlandLoss+df.Landslides
    df['Infrastructure_Vulnerability'] = df.DeterioratingInfrastructure+df.InadequatePlanning
    df['Community_Preparednessg']= df.IneffectiveDisasterPreparedness+df.PoliticalFactors
    df['Population_Density_Vulnerable_Areas']= df.PopulationScore+df.CoastalVulnerability
    df['Climate_Change_Impact']= df.ClimateChange+df.MonsoonIntensity
    df['River_Health']= df.RiverManagement+df.DamsQuality
    df['ClimateImpact'] = df.MonsoonIntensity+df.ClimateChange
    df['AnthropogenicPressure'] = df.Deforestation+df.Urbanization+df.AgriculturalPractices+df.Encroachments
    df['InfrastructureQuality'] = df.DamsQuality+df.DrainageSystems+df.DeterioratingInfrastructure
    df['CoastalVulnerabilityTotal'] = df.CoastalVulnerability+df.Landslides
    df['PreventiveMeasuresEfficiency'] = df.RiverManagement+df.IneffectiveDisasterPreparedness+df.InadequatePlanning
    df['EcosystemImpact'] = df.WetlandLoss+df.Watersheds
    df['SocioPoliticalContext'] = df.PopulationScore+df.PoliticalFactors
    df['Land_Use_Pressure']=df.Urbanization+df.Deforestation+df.AgriculturalPractices
    df['Environmental_Degradation']=df.Deforestation+df.Siltation+df.WetlandLoss+df.Landslides
    df['Infrastructure_Vulnerability'] = df.DeterioratingInfrastructure+df.InadequatePlanning
    df['Community_Preparednessg']= df.IneffectiveDisasterPreparedness+df.PoliticalFactors
    df['Population_Density_Vulnerable_Areas']= df.PopulationScore+df.CoastalVulnerability
    df['Climate_Change_Impact']= df.ClimateChange+df.MonsoonIntensity
    df['River_Health']= df.RiverManagement+df.DamsQuality
    df['sum']= df.sum(axis=1)
    df['mean']= df.mean(axis=1)
    df['std'] = df.std(axis=1)
    df['max'] = df.max(axis=1)
    df['min'] = df.min(axis=1)
    df['var'] = df.var(axis=1)
    df['skew'] = df.skew(axis=1)
    df['kurt'] = df.kurt(axis=1)
    df['meadian'] = df.median(axis=1)
    df['quant_25'] = df.quantile(0.25,axis=1)
    df['quant_75'] = df.quantile(0.75,axis=1)
    df['sum>72'] = np.where(df['sum']>72,1,0)
    df['sum>100'] = np.where(df['sum']>100,1,0)
    df['sum>50'] = np.where(df['sum']>50,1,0)
    df['range']= df['max']-df['min']
    for col in cols:
        df[f"{col}_2"]= df[col]**2
        df[f"{col}_3"]= df[col]**3
        df[f"{col}_3"]= df[col]**4
        # Log Features
        df[f"log_{col}"] = np.log1p(df[col]+1e-4)  
    for col in cols:
        if col not in ['id','FloodProbability']:
            df[f"mad_{col}"] = df[col] - df[col].median()
            df[f"mean_{col}"] = df[col] - df[col].mean()
            df[f"std_{col}"] = df[col] - df[col].std()
    return df

def create_features_3(data):
    pass

In [26]:
# Train Test Split and Scaling
raw_train_X = raw_train.drop('FloodProbability',axis=1)
raw_train_y = raw_train['FloodProbability']

train_X = process_data(raw_train_X,1)
train_y = raw_train_y

sc = MinMaxScaler()
sc.fit(train_X)
train_X_scaled = sc.transform(train_X)
train_X_scaled = pd.DataFrame(train_X,columns=train_X.columns)

X_train, X_test, y_train, y_test = train_test_split(train_X_scaled, train_y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

valid_X = process_data(raw_valid,1)
valid_X_scaled = sc.transform(valid_X)
valid_X_scaled = pd.DataFrame(valid_X_scaled,columns=valid_X.columns)

In [None]:
train_X_scaled.shape, valid_X_scaled.shape

((1117957, 149), (745305, 149))

In [None]:
del raw_train_X,raw_train_y,train_X,train_y,train_X_scaled
gc.collect()

0

In [None]:
Experiment_Desc = 'All Features + Log Features'
md_lr = run_experiment('linear_regression',Experiment_Desc)
md_lgbm =run_experiment('lgbm',Experiment_Desc)
md_xgb = run_experiment('xgb',Experiment_Desc)
md_catboost =run_experiment('catboost',Experiment_Desc)

2024/07/12 17:28:15 INFO mlflow.tracking.fluent: Experiment with name 'linear_regression' does not exist. Creating a new experiment.


Model Name :  linear_regression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END .................................................... total time=  36.9s
[CV] END .................................................... total time=  38.2s
[CV] END .................................................... total time=  38.8s


2024/07/12 17:29:40 INFO mlflow.tracking.fluent: Experiment with name 'lgbm' does not exist. Creating a new experiment.


Model Name :  lgbm
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.545334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5097
[LightGBM] [Info] Number of data points in the train set: 596243, number of used features: 146
[LightGBM] [Info] Start training from score 0.504457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.274513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5097
[LightGBM] [Info] Number of data points in the train set: 596244, number of used features: 146
[LightGBM] [Info] Start training from score 0.504488
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.904687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGB

2024/07/12 17:33:01 INFO mlflow.tracking.fluent: Experiment with name 'xgb' does not exist. Creating a new experiment.


Model Name :  xgb
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END .................................................... total time=  46.0s
[CV] END .................................................... total time=  47.5s
[CV] END .................................................... total time=  49.7s
Finished training model xgb


2024/07/12 17:34:32 INFO mlflow.tracking.fluent: Experiment with name 'catboost' does not exist. Creating a new experiment.


Model Name :  catboost
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Learning rate set to 0.112382
Learning rate set to 0.112382
0:	learn: 0.0476190	total: 595ms	remaining: 9m 54s
0:	learn: 0.0476323	total: 712ms	remaining: 11m 51s
1:	learn: 0.0447045	total: 1.18s	remaining: 9m 50s
1:	learn: 0.0447105	total: 1.3s	remaining: 10m 47s
2:	learn: 0.0422128	total: 1.89s	remaining: 10m 28s
2:	learn: 0.0422148	total: 1.94s	remaining: 10m 44s
3:	learn: 0.0400816	total: 2.56s	remaining: 10m 36s
3:	learn: 0.0401146	total: 2.66s	remaining: 11m 2s
4:	learn: 0.0382983	total: 3.19s	remaining: 10m 35s
4:	learn: 0.0383497	total: 3.33s	remaining: 11m 2s
5:	learn: 0.0367705	total: 3.74s	remaining: 10m 19s
5:	learn: 0.0367230	total: 4.19s	remaining: 11m 33s
Learning rate set to 0.112382
6:	learn: 0.0353820	total: 4.47s	remaining: 10m 33s
6:	learn: 0.0353392	total: 4.77s	remaining: 11m 17s
7:	learn: 0.0342422	total: 5.13s	remaining: 10m 36s
7:	learn: 0.0341700	total: 5.24s	remaining: 10m 50s
0