In [1]:
%cd /teamspace/studios/this_studio/2024/07/flood_prediction_notebook/flood_prediciton_code

/teamspace/studios/this_studio/2024/07/flood_prediction_notebook/flood_prediciton_code


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# Basic Libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import mlflow
import gc
from warnings import filterwarnings
filterwarnings('ignore')

# Models Libs
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn import svm as svm
import lightgbm as lgbm
from lightgbm import LGBMRegressor

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

# Metric
from sklearn.metrics import r2_score

# Mlflow
import mlflow

In [3]:
TEST_SIZE = 0.2
DROP_COLUMNS = ['id']
CV = 3
RANDOM_STATE = 42
ORIGINAL_COLS = ['id','MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability']

# FILTER_COLS = ['id','MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
#        'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality']

# EXT_FEATURES = ['City_Features','TopographyDrainage_meanshift','Climate_Features','Climate_Features_mul','Climate_Features_meanshift','MonsoonIntensity_meanshift','TopographyDrainage_meanshift',
# 'InadequatePlanning_1']
# FEATURES = FILTER_COLS + EXT_FEATURES

# GridSearchCV Params
PARAMS_LINEAR_REGRESSION = {}

PARAMS_XGB = {}

PARAMS_RANDOM_FOREST_REGRESSION = {
    'n_estimators':[100,200], 
    # 'criterion':['squared_error','absolute_error','friedman_mse','poisson']
}

PARAMS_LASSO = {
    'alpha':[0.1,0.01], 
    'max_iter':[100,500]
}

PARAMS_RIDGE = {
    'alpha':[0.1,0.01], 
    'max_iter':[100,500]
}

PARAMS_SVM = {
    'C': [1, 10], 
    'kernel': ['linear', 'rbf']
}

PARAMS_DEFAULT = {}

In [4]:
raw_train = pd.read_csv('../data/train.csv')
raw_valid= pd.read_csv('../data/test.csv')

In [5]:
def create_features(data):
    df = data.copy()
    cols = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']
    df['ClimateImpact'] = df.MonsoonIntensity+df.ClimateChange
    df['AnthropogenicPressure'] = df.Deforestation+df.Urbanization+df.AgriculturalPractices+df.Encroachments
    df['InfrastructureQuality'] = df.DamsQuality+df.DrainageSystems+df.DeterioratingInfrastructure
    df['CoastalVulnerabilityTotal'] = df.CoastalVulnerability+df.Landslides
    df['PreventiveMeasuresEfficiency'] = df.RiverManagement+df.IneffectiveDisasterPreparedness+df.InadequatePlanning
    df['EcosystemImpact'] = df.WetlandLoss+df.Watersheds
    df['SocioPoliticalContext'] = df.PopulationScore+df.PoliticalFactors
    df['Land_Use_Pressure'] = df.Urbanization+df.Deforestation+df.AgriculturalPractices
    df['Environmental_Degradation']=df.Deforestation+df.Siltation+df.WetlandLoss+df.Landslides
    df['Infrastructure_Vulnerability'] = df.DeterioratingInfrastructure+df.InadequatePlanning
    df['Community_Preparednessg']= df.IneffectiveDisasterPreparedness+df.PoliticalFactors
    df['Population_Density_Vulnerable_Areas']= df.PopulationScore+df.CoastalVulnerability
    df['Climate_Change_Impact']= df.ClimateChange+df.MonsoonIntensity
    df['River_Health']= df.RiverManagement+df.DamsQuality
    df['ClimateImpact'] = df.MonsoonIntensity+df.ClimateChange
    df['AnthropogenicPressure'] = df.Deforestation+df.Urbanization+df.AgriculturalPractices+df.Encroachments
    df['InfrastructureQuality'] = df.DamsQuality+df.DrainageSystems+df.DeterioratingInfrastructure
    df['CoastalVulnerabilityTotal'] = df.CoastalVulnerability+df.Landslides
    df['PreventiveMeasuresEfficiency'] = df.RiverManagement+df.IneffectiveDisasterPreparedness+df.InadequatePlanning
    df['EcosystemImpact'] = df.WetlandLoss+df.Watersheds
    df['SocioPoliticalContext'] = df.PopulationScore+df.PoliticalFactors
    df['Land_Use_Pressure']=df.Urbanization+df.Deforestation+df.AgriculturalPractices
    df['Environmental_Degradation']=df.Deforestation+df.Siltation+df.WetlandLoss+df.Landslides
    df['Infrastructure_Vulnerability'] = df.DeterioratingInfrastructure+df.InadequatePlanning
    df['Community_Preparednessg']= df.IneffectiveDisasterPreparedness+df.PoliticalFactors
    df['Population_Density_Vulnerable_Areas']= df.PopulationScore+df.CoastalVulnerability
    df['Climate_Change_Impact']= df.ClimateChange+df.MonsoonIntensity
    df['River_Health']= df.RiverManagement+df.DamsQuality
    df['sum']= df.sum(axis=1)
    df['mean']= df.mean(axis=1)
    df['std'] = df.std(axis=1)
    df['max'] = df.max(axis=1)
    df['min'] = df.min(axis=1)
    df['var'] = df.var(axis=1)
    df['skew'] = df.skew(axis=1)
    df['kurt'] = df.kurt(axis=1)
    df['meadian'] = df.median(axis=1)
    df['quant_25'] = df.quantile(0.25,axis=1)
    df['quant_75'] = df.quantile(0.75,axis=1)
    df['sum>72'] = np.where(df['sum']>72,1,0)
    df['sum>100'] = np.where(df['sum']>100,1,0)
    df['sum>50'] = np.where(df['sum']>50,1,0)
    df['range']= df['max']-df['min']
    for col in cols:
        df[f"{col}_2"]= df[col]**2
        df[f"{col}_3"]= df[col]**3
        df[f"{col}_3"]= df[col]**4
    for col in cols:
        if col not in ['id','FloodProbability']:
            df[f"mad_{col}"] = df[col] - df[col].median()
            df[f"mean_{col}"] = df[col] - df[col].mean()
            df[f"std_{col}"] = df[col] - df[col].std()
    return df

def process_data(train_data):
    df = train_data.copy()
    df = create_features(df)
    # Drop Columns
    df.drop(columns=DROP_COLUMNS,axis=1,inplace=True)
    return df

def evaluate_model(model_name):
    # Linear Regression Model
    if model_name=='linear_regression':
        model =  GridSearchCV(
            estimator=LinearRegression(),
            param_grid=PARAMS_LINEAR_REGRESSION,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
    
    # LASSO
    if model_name=='lasso':
        model =  GridSearchCV(
            estimator=Lasso(),
            param_grid=PARAMS_LASSO,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)

    # SVM Model
    if model_name=='svm':
        model =  GridSearchCV(
            estimator=svm.SVR(),
            param_grid=PARAMS_SVM,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')

    # XGBoost
    if model_name=='xgb':
        model =  GridSearchCV(
            estimator=xgb.XGBRegressor(),
            param_grid=PARAMS_XGB,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')

    # LightGBM
    if model_name=='lgbm':
        model =  GridSearchCV(
            estimator=LGBMRegressor(),
            param_grid=PARAMS_DEFAULT,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')

    # CatBoost
    if model_name=='catboost':
        model =  GridSearchCV(
            estimator=CatBoostRegressor(),
            param_grid=PARAMS_DEFAULT,
            cv=CV,
            n_jobs=-1,
            verbose=2,
            scoring='neg_mean_squared_error')
        # Train Model
        model.fit(X_train,y_train)
        # Predict on Test
        y_pred = model.predict(X_test)
        # Score 
        score = r2_score(y_test, y_pred)
        print(f'Finished training model {model_name}')
    
    # 
    # Return
    return model_name,score,model.best_estimator_,model.best_params_

def run_experiment(model_name,exp_name):
    mlflow.set_tracking_uri('http://127.0.0.1:5000')
    mlflow.set_experiment(exp_name)
    with mlflow.start_run():
        print('Model Name : ',model_name)
        model_name,score,best_model,best_param = evaluate_model(model_name)
        mlflow.log_param('Drop Columns', DROP_COLUMNS)
        mlflow.log_param('Model Name',model_name)
        mlflow.log_params(best_param)
        mlflow.log_param('CV',CV)
        mlflow.log_param('Random State',RANDOM_STATE)
        mlflow.log_param('Features', str(list(valid_X_scaled.columns)))
        mlflow.log_metric('R2',score)
        mlflow.sklearn.log_model(best_model,model_name)
        # mlflow.log_artifact('transformed_data.csv')
    return best_model

def get_submission_csv(model):
    predictions = model.predict(valid_X_scaled)
    sub_df = raw_valid[['id']]
    sub_df['FloodProbability'] = predictions
    sub_df.to_csv('../data/submission.csv',index=False)


In [6]:
# Train Test Split and Scaling
raw_train_X = raw_train.drop('FloodProbability',axis=1)
raw_train_y = raw_train['FloodProbability']

train_X = process_data(raw_train_X)
train_y = raw_train_y

sc = MinMaxScaler()
sc.fit(train_X)
train_X_scaled = sc.transform(train_X)
train_X_scaled = pd.DataFrame(train_X,columns=train_X.columns)

X_train, X_test, y_train, y_test = train_test_split(train_X_scaled, train_y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

valid_X = process_data(raw_valid)
valid_X_scaled = sc.transform(valid_X)
valid_X_scaled = pd.DataFrame(valid_X_scaled,columns=valid_X.columns)

In [7]:
train_X_scaled.shape, valid_X_scaled.shape

((1117957, 149), (745305, 149))

In [8]:
del raw_train_X,raw_train_y,train_X,train_y,train_X_scaled
gc.collect()

0

In [11]:
# md_lr = run_experiment('linear_regression','Without HP Tuning')
# md_svm = run_experiment('svm','Without HP Tuning')
# md_lgbm =run_experiment('lgbm','Without HP Tuning')
md_xgb = run_experiment('xgb','Without HP Tuning')
# md_catboost =run_experiment('catboost','Without HP Tuning')

Model Name :  xgb
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[CV] END .................................................... total time=  47.8s
[CV] END .................................................... total time=  49.5s
[CV] END .................................................... total time=  52.5s
Finished training model xgb
