In [1]:
# Work from the project's code directory so the relative '../data/...' paths used by later cells resolve.
%cd /teamspace/studios/this_studio/2024/07/flood_prediction_notebook/flood_prediciton_code

/teamspace/studios/this_studio/2024/07/flood_prediction_notebook/flood_prediciton_code


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# Basic Libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import mlflow
import gc
from warnings import filterwarnings
filterwarnings('ignore')

# Models Libs
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn import svm as svm
import lightgbm as lgbm
from lightgbm import LGBMRegressor

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

# Metric
from sklearn.metrics import r2_score

# Mlflow
import mlflow

In [3]:
# --- Experiment configuration -------------------------------------------------
TEST_SIZE = 0.2        # fraction of rows held out by train_test_split
DROP_COLUMNS = ['id']  # identifier columns; logged with every MLflow run as 'drop_columns'
CV = 5                 # number of cross-validation folds used by GridSearchCV
RANDOM_STATE = 42      # seed passed to train_test_split

# Every column of the training CSV: identifier, the 20 predictors, then the target.
ORIGINAL_COLS = ['id','MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability']

# Predictor columns only. Derived from ORIGINAL_COLS so the two lists cannot drift
# apart (a module-level `global` statement has no effect, so none is used here).
INITIAL_FEATURES = [col for col in ORIGINAL_COLS if col not in ('id', 'FloodProbability')]

# --- GridSearchCV parameter grids ---------------------------------------------
# An empty grid makes GridSearchCV cross-validate the estimator's defaults only.
PARAMS_LINEAR_REGRESSION = {}

PARAMS_XGB = {}

PARAMS_RANDOM_FOREST_REGRESSION = {
    'n_estimators':[100,200],
    # 'criterion':['squared_error','absolute_error','friedman_mse','poisson']
}

PARAMS_LASSO = {
    'alpha':[0.1,0.01],
    'max_iter':[100,500]
}

PARAMS_RIDGE = {
    'alpha':[0.1,0.01],
    'max_iter':[100,500]
}

PARAMS_SVM = {
    'C': [1, 10],
    'kernel': ['linear', 'rbf']
}

PARAMS_DEFAULT = {}

In [4]:
# Read both CSVs from the sibling data directory (paths are relative to the current working directory).
# raw_train carries the 'FloodProbability' target; raw_valid is the file predicted on for the submission.
raw_train = pd.read_csv('../data/train.csv')
raw_valid= pd.read_csv('../data/test.csv')

In [5]:
def process_data(train_data,func_num):
    """Apply one of the numbered feature-engineering recipes to a raw frame.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Raw frame (training or submission data); it is not modified.
    func_num : int
        Recipe number, 1-7, selecting ``create_features_<func_num>``.

    Returns
    -------
    pandas.DataFrame
        The engineered frame with every column listed in ``DROP_COLUMNS``
        removed when present.

    Raises
    ------
    ValueError
        If ``func_num`` does not name a known recipe (previously this surfaced
        as ``TypeError: 'NoneType' object is not callable``).
    """
    fun_dict = {
        1: create_features_1,
        2: create_features_2,
        3: create_features_3,
        4: create_features_4,
        5: create_features_5,
        6: create_features_6,
        7: create_features_7
    }
    feature_func = fun_dict.get(func_num)
    if feature_func is None:
        raise ValueError(
            f'Unknown feature function number {func_num!r}; expected one of {sorted(fun_dict)}')

    df = feature_func(train_data.copy())

    # DROP_COLUMNS is logged with every MLflow run, so actually honour it here.
    # errors='ignore' because some recipes already return a frame without these columns.
    return df.drop(columns=DROP_COLUMNS, errors='ignore')

def _fit_grid_search(estimator, param_grid):
    """Cross-validate ``estimator`` over ``param_grid`` on the notebook-level X_train / y_train."""
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=CV,
        n_jobs=-1,
        verbose=2,
        scoring='neg_mean_squared_error')
    search.fit(X_train, y_train)
    return search


def evaluate_model(model_name):
    """Grid-search the requested model and score it on the held-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    model_name : str
        One of 'linear_regression', 'lasso', 'svm', 'xgb', 'lgbm', 'catboost'
        or 'ensemble' (mean of the CatBoost and XGBoost predictions).

    Returns
    -------
    tuple
        ``(model_name, r2_on_X_test, best_estimator, best_params)``.
        For 'ensemble' the score is that of the averaged predictions, but the
        returned estimator and params are those of the XGBoost search only.

    Raises
    ------
    ValueError
        If ``model_name`` is not recognised (previously an UnboundLocalError
        at the return statement).
    """
    # Factories are lambdas so an estimator is only constructed for the model asked for.
    single_models = {
        'linear_regression': lambda: (LinearRegression(), PARAMS_LINEAR_REGRESSION),
        'lasso': lambda: (Lasso(), PARAMS_LASSO),
        'svm': lambda: (svm.SVR(), PARAMS_SVM),
        'xgb': lambda: (xgb.XGBRegressor(), PARAMS_XGB),
        'lgbm': lambda: (LGBMRegressor(), PARAMS_DEFAULT),
        'catboost': lambda: (CatBoostRegressor(), PARAMS_DEFAULT),
    }

    if model_name == 'ensemble':
        model_cat = _fit_grid_search(CatBoostRegressor(), PARAMS_DEFAULT)
        model_xgb = _fit_grid_search(xgb.XGBRegressor(), PARAMS_XGB)
        # Simple unweighted average of the two models' predictions.
        y_pred = (model_xgb.predict(X_test) + model_cat.predict(X_test)) / 2
        # NOTE: only the XGBoost half is returned to the caller (unchanged behaviour).
        model = model_xgb
    elif model_name in single_models:
        estimator, param_grid = single_models[model_name]()
        model = _fit_grid_search(estimator, param_grid)
        y_pred = model.predict(X_test)
    else:
        raise ValueError(
            f'Unknown model name {model_name!r}; expected one of '
            f"{sorted(single_models) + ['ensemble']}")

    score = r2_score(y_test, y_pred)
    # Printed for every model now; it used to be skipped for linear_regression and lasso.
    print(f'Finished training model {model_name}')

    return model_name,score,model.best_estimator_,model.best_params_

def run_experiment(model_name,exp_desc):
    """Train ``model_name`` via ``evaluate_model`` and record the run in MLflow.

    The run is logged to the tracking server at http://127.0.0.1:5000 under an
    experiment named after the model. Logged items: the run settings
    (drop columns, model name, description, CV folds, random state, the column
    names of ``valid_X_scaled``), the best grid-search parameters, the r2
    metric and the fitted estimator.

    Parameters
    ----------
    model_name : str
        Model key understood by ``evaluate_model``.
    exp_desc : str
        Free-text description stored as the ``desc`` parameter.

    Returns
    -------
    The best estimator returned by ``evaluate_model``.
    """
    mlflow.set_tracking_uri('http://127.0.0.1:5000')
    mlflow.set_experiment(model_name)

    with mlflow.start_run():
        print('Model Name : ',model_name)
        fitted_name, r2, best_model, best_param = evaluate_model(model_name)

        mlflow.log_params({
            'drop_columns': DROP_COLUMNS,
            'model_name': fitted_name,
            'desc': exp_desc,
        })
        mlflow.log_params(best_param)
        mlflow.log_params({
            'cv': CV,
            'random_state': RANDOM_STATE,
            'features': str(list(valid_X_scaled.columns)),
        })

        mlflow.log_metric('r2', r2)
        mlflow.sklearn.log_model(best_model, fitted_name)

    return best_model

def get_submission_csv(model):
    """Predict on the notebook-level ``valid_X_scaled`` and write the submission file.

    Writes ``../data/submission.csv`` with two columns: ``id`` (taken from
    ``raw_valid``) and ``FloodProbability`` (the model's predictions).

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    """
    predictions = model.predict(valid_X_scaled)
    # .assign builds a new frame instead of writing a column into a slice of
    # raw_valid, which is what triggered pandas' chained-assignment warning.
    sub_df = raw_valid[['id']].assign(FloodProbability=predictions)
    sub_df.to_csv('../data/submission.csv',index=False)


In [6]:
# Raw predictor columns that recipes 1-3 derive their features from.
_BASE_FEATURE_COLS = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']

# Engineered aggregate -> raw columns summed to produce it.
# Column names (including their spelling) are kept exactly as before.
_DOMAIN_AGGREGATES = {
    'ClimateImpact': ['MonsoonIntensity', 'ClimateChange'],
    'AnthropogenicPressure': ['Deforestation', 'Urbanization', 'AgriculturalPractices', 'Encroachments'],
    'InfrastructureQuality': ['DamsQuality', 'DrainageSystems', 'DeterioratingInfrastructure'],
    'CoastalVulnerabilityTotal': ['CoastalVulnerability', 'Landslides'],
    'PreventiveMeasuresEfficiency': ['RiverManagement', 'IneffectiveDisasterPreparedness', 'InadequatePlanning'],
    'EcosystemImpact': ['WetlandLoss', 'Watersheds'],
    'SocioPoliticalContext': ['PopulationScore', 'PoliticalFactors'],
    'Land_Use_Pressure': ['Urbanization', 'Deforestation', 'AgriculturalPractices'],
    'Environmental_Degradation': ['Deforestation', 'Siltation', 'WetlandLoss', 'Landslides'],
    'Infrastructure_Vulnerability': ['DeterioratingInfrastructure', 'InadequatePlanning'],
    'Community_Preparednessg': ['IneffectiveDisasterPreparedness', 'PoliticalFactors'],
    'Population_Density_Vulnerable_Areas': ['PopulationScore', 'CoastalVulnerability'],
    'Climate_Change_Impact': ['ClimateChange', 'MonsoonIntensity'],
    'River_Health': ['RiverManagement', 'DamsQuality'],
}


def _build_features_1_to_3(data, with_log1p=False, with_log2=False, with_product=False):
    """Shared implementation of create_features_1/2/3; the flags switch on the extras of recipes 2 and 3."""
    base = data[_BASE_FEATURE_COLS]
    new_cols = {}

    # Domain aggregates: plain sums of related raw columns
    # (skipna=False keeps the NaN propagation of the `a + b` form).
    for name, parts in _DOMAIN_AGGREGATES.items():
        new_cols[name] = base[parts].sum(axis=1, skipna=False)

    # Row-wise statistics over the raw predictors only. Computing them over the
    # whole frame would fold 'id', the aggregates and the earlier statistics
    # into each later statistic.
    total = base.sum(axis=1)
    new_cols['sum'] = total
    if with_product:
        # float64 avoids silent int64 wrap-around when multiplying 20 values.
        product = base.astype('float64').product(axis=1)
        special_totals = np.arange(72, 76)
        new_cols['product'] = product
        new_cols['special1'] = total.isin(special_totals)
        new_cols['special2'] = product.isin(special_totals)
    new_cols['mean'] = base.mean(axis=1)
    new_cols['std'] = base.std(axis=1)
    new_cols['max'] = base.max(axis=1)
    new_cols['min'] = base.min(axis=1)
    new_cols['var'] = base.var(axis=1)
    new_cols['skew'] = base.skew(axis=1)
    new_cols['kurt'] = base.kurt(axis=1)
    new_cols['meadian'] = base.median(axis=1)
    new_cols['quant_25'] = base.quantile(0.25, axis=1)
    new_cols['quant_75'] = base.quantile(0.75, axis=1)
    for threshold in (72, 100, 50):
        new_cols[f'sum>{threshold}'] = (total > threshold).astype(int)
    new_cols['range'] = new_cols['max'] - new_cols['min']

    # Per-column powers and (optionally) logs.
    for col in _BASE_FEATURE_COLS:
        values = base[col]
        new_cols[f"{col}_2"] = values ** 2
        new_cols[f"{col}_3"] = values ** 3
        new_cols[f"{col}_4"] = values ** 4   # used to overwrite the "_3" column
        if with_log1p:
            new_cols[f"log_{col}"] = np.log1p(values + 1e-4)
        if with_log2:
            new_cols[f"log2_{col}"] = np.log2(values + 1e-4)  # used to overwrite "log_"

    # Per-column offsets from the column's median, mean and standard deviation.
    for col in _BASE_FEATURE_COLS:
        values = base[col]
        new_cols[f"mad_{col}"] = values - values.median()
        new_cols[f"mean_{col}"] = values - values.mean()
        new_cols[f"std_{col}"] = values - values.std()

    # Single concat instead of ~150 column insertions into a growing frame.
    return pd.concat([data, pd.DataFrame(new_cols)], axis=1)


def create_features_1(data):
    """Recipe 1: domain aggregates, row statistics, powers 2-4 and centred columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the 20 raw predictor columns; it is not modified and
        all of its columns are carried through to the result.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the engineered columns. Row statistics ('sum', 'mean',
        ..., 'range') are computed over the 20 raw predictors only.
    """
    return _build_features_1_to_3(data)


def create_features_2(data):
    """Recipe 2: recipe 1 plus ``log_<col>`` = log1p(col + 1e-4) for every raw predictor.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the 20 raw predictor columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the engineered columns.
    """
    return _build_features_1_to_3(data, with_log1p=True)


def create_features_3(data):
    """Recipe 3: recipe 2 plus the row product, the special1/special2 flags and ``log2_<col>``.

    ``special1`` / ``special2`` are booleans marking rows whose sum / product
    is one of 72, 73, 74, 75.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the 20 raw predictor columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the engineered columns.
    """
    return _build_features_1_to_3(data, with_log1p=True, with_log2=True, with_product=True)

def _candidate_features_4_to_7(data):
    """Compute every column that recipes 4-7 may select, from ``data[INITIAL_FEATURES]``.

    Returns a new frame holding the raw predictors, the aggregate columns
    ('fsum', 'one_count', 'fsum2', 'exp_sum', 'log_sum', 'log2_sum',
    'exp2_sum', 'exp3_sum'), the 0/1 flags 'special1' / 'special2' and the
    per-column ``log_<col>`` features. ``data`` is not modified.
    """
    base = data[INITIAL_FEATURES]

    fsum = base.sum(axis=1)
    # float64 avoids silent int64 wrap-around when multiplying 20 values.
    fsum2 = base.astype('float64').product(axis=1)
    special_totals = np.arange(72, 76)

    log1p_cols = np.log1p(base + 1e-4).add_prefix('log_')

    aggregates = pd.DataFrame({
        'fsum': fsum,
        'one_count': (base > 10).sum(axis=1),
        'fsum2': fsum2,
        'exp_sum': (1.2 ** base).sum(axis=1),
        'log_sum': log1p_cols.sum(axis=1),
        # Sum of the log2 features. The old code summed the "log_" columns a
        # second time here, making log2_sum a duplicate of log_sum.
        'log2_sum': np.log2(base + 1e-4).sum(axis=1),
        'exp2_sum': np.exp(base).sum(axis=1),
        'exp3_sum': (4 ** base).sum(axis=1),
        'special1': fsum.isin(special_totals).astype(int),
        'special2': fsum2.isin(special_totals).astype(int),
    })
    return pd.concat([base, aggregates, log1p_cols], axis=1)


def create_features_4(data):
    """Recipe 4: raw predictors plus the eight aggregate columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``INITIAL_FEATURES``; not modified.

    Returns
    -------
    pandas.DataFrame
        Columns: INITIAL_FEATURES, then 'fsum', 'one_count', 'fsum2',
        'exp_sum', 'log_sum', 'log2_sum', 'exp2_sum', 'exp3_sum'.
    """
    feats = list(INITIAL_FEATURES)+['fsum','one_count','fsum2','exp_sum','log_sum','log2_sum','exp2_sum','exp3_sum']
    return _candidate_features_4_to_7(data)[feats]


def create_features_5(data):
    """Recipe 5: recipe 4 plus the 0/1 flags 'special1' and 'special2'.

    The flags mark rows whose feature sum / product is one of 72, 73, 74, 75.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``INITIAL_FEATURES``; not modified.

    Returns
    -------
    pandas.DataFrame
        The recipe-4 columns followed by 'special1', 'special2'.
    """
    feats = list(INITIAL_FEATURES)+['fsum','one_count','fsum2','exp_sum','log_sum','log2_sum','exp2_sum','exp3_sum']+['special1','special2']
    return _candidate_features_4_to_7(data)[feats]


def create_features_6(data):
    """Recipe 6: recipe 5 with the per-column ``log_<col>`` features inserted before the flags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``INITIAL_FEATURES``; not modified.

    Returns
    -------
    pandas.DataFrame
        The recipe-4 columns, then ``log_<col>`` for every predictor, then
        'special1', 'special2'.
    """
    log_features = [f"log_{col}" for col in INITIAL_FEATURES]
    feats = list(INITIAL_FEATURES)+['fsum','one_count','fsum2','exp_sum','log_sum','log2_sum','exp2_sum','exp3_sum']+log_features+['special1','special2']
    return _candidate_features_4_to_7(data)[feats]


def create_features_7(data):
    """Recipe 7: raw predictors plus six aggregates (no 'one_count', no 'log2_sum').

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``INITIAL_FEATURES``; not modified.

    Returns
    -------
    pandas.DataFrame
        Columns: INITIAL_FEATURES, then 'fsum', 'fsum2', 'exp_sum', 'log_sum',
        'exp2_sum', 'exp3_sum'.
    """
    feats = list(INITIAL_FEATURES)+['fsum','fsum2','exp_sum','log_sum','exp2_sum','exp3_sum']
    return _candidate_features_4_to_7(data)[feats]


In [57]:
# Feature engineering, scaling and hold-out split
feature_func = 7  # recipe number handed to process_data

# Separate the target from the predictors.
raw_train_y = raw_train['FloodProbability']
raw_train_X = raw_train.drop(columns='FloodProbability')

train_y = raw_train_y
train_X = process_data(raw_train_X, feature_func)

# NOTE(review): the scaler is fitted on all training rows before the split, so the
# min/max of the hold-out rows influence the scaling of X_train.
sc = MinMaxScaler()
train_X_scaled = pd.DataFrame(sc.fit_transform(train_X), columns=train_X.columns)

X_train, X_test, y_train, y_test = train_test_split(
    train_X_scaled,
    train_y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Apply the same recipe and the already-fitted scaler to the submission data.
valid_X = process_data(raw_valid, feature_func)
valid_X_scaled = pd.DataFrame(sc.transform(valid_X), columns=valid_X.columns)

In [8]:
# Unbind the intermediate frames built while preparing the split, then ask the
# garbage collector to run so their memory can be reclaimed.
del raw_train_X,raw_train_y,train_X,train_y,train_X_scaled
gc.collect()

0

In [9]:
# Description stored with the MLflow run. It is built from the live settings so it
# cannot drift from what was executed (the hard-coded text said "CV = 6" while CV is 5).
Experiment_Desc = f'Feature Func : {feature_func} , CV = {CV}'
# md_lr = run_experiment('linear_regression',Experiment_Desc)
# md_lgbm =run_experiment('lgbm',Experiment_Desc)
md_xgb = run_experiment('xgb',Experiment_Desc)
# md_catboost =run_experiment('catboost',Experiment_Desc)

Model Name :  xgb
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .................................................... total time=  15.3s
[CV] END .................................................... total time=  15.3s
[CV] END .................................................... total time=  15.4s
[CV] END .................................................... total time=  15.4s
[CV] END .................................................... total time=   8.4s
Finished training model xgb


In [11]:
# Predict on the scaled submission features with the XGBoost model and write ../data/submission.csv.
get_submission_csv(md_xgb)