In [1]:
from scripts.fe import FeatureEngineering
from scripts.fs import FeatureSelection
from scripts.fs import train_test_split_function
from scripts.fs import check_numerical_columns
from scripts.cross_val import plot_results
from scripts.cross_val import find_best_score
from scripts.cross_val import cross_validate_parameters
from sklearn.ensemble import RandomForestClassifier
from scripts.modelling import prediction_to_csv
from sklearn.metrics import mean_absolute_error
from scripts.lazy_model import lazy_model
from scripts.final_predicter import run_and_save
import pandas as pd
from datetime import datetime
from scripts.fe import mean_encode
from scripts.modelling import tune_model

# Feature Engineering

In [2]:
df,test_df = FeatureEngineering(drop_non_numerical=False, drop_empty_rows=True)
df.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [3]:
print(df.shape, test_df.shape)
df.damage_grade.value_counts(dropna=False)

(260601, 39) (86868, 39)


damage_grade
2    148259
3     87218
1     25124
Name: count, dtype: int64

In [55]:
def target_encoding(dataframe, columns_to_encode, target_variable):
    encoded_df = dataframe.copy()
    for i, column in enumerate(columns_to_encode):
        if i == 0:
            mean = dataframe.groupby(column)[target_variable].mean()
            encoded_df[column] = encoded_df[column].map(mean)

        else:
            mean_two = dataframe.groupby([columns_to_encode[i-1], column])[target_variable].mean()
            print(mean_two.head())
            print(f'grouping by {[columns_to_encode[i-1], column]}')

            # Reset index to turn the hierarchy into a DataFrame
            mean_reset = mean_two.reset_index()
            print(type(mean))

            # Create a unique key in both hierarchy_reset and the original df
            encoded_df['group_key'] = encoded_df[columns_to_encode[i-1]].astype(str) + "_" + encoded_df[column].astype(str)
            mean_reset['group_key'] = mean_reset[columns_to_encode[i-1]].astype(str) + "_" + mean_reset[column].astype(str)

            encoded_feature_name = f'mean_{column}'
            mean_reset[encoded_feature_name] = mean_two.values  # Assuming mean_two is a Series with the mean values
            #print(mean_two.values[0])
            #pd.merge(new_frame, mean_reset.encoded_feature_name, how='right')


            # Check for group keys
            print(f'group keys: {encoded_df.group_key.unique()}')
            print(f'group keys: {mean_reset.group_key.unique()}')


            # Merge the mean damage_grade back to the original df based on the unique key
            print(mean_reset.head(1))
            #encoded_df = encoded_df.merge(mean_reset[['group_key'], column], on='group_key', how='left')
            
            #pd.merge(encoded_df, mean_reset[['group_key', encoded_feature_name]], on='group_key', how='left')
            
            encoded_df = pd.merge(encoded_df, mean_reset[encoded_feature_name], on=[columns_to_encode[i-1], column], how='left')

            print(encoded_df.head())
            # Clean up by removing the temporary 'group_key'
            #encoded_df.drop('group_key', inplace=True, axis=1)

    
    
    return encoded_df


In [56]:
first = target_encoding(df, ['geo_level_1_id'], 'damage_grade')
first.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,2.161724,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,2.485273,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,2.563369,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,2.00096,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,2.337713,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [57]:
second = target_encoding(df, ['geo_level_2_id'], 'damage_grade')
second.head()


Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,2.740741,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,2.487437,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,2.51875,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,2.107317,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,2.348748,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [58]:
dichotomy = target_encoding(df, ['geo_level_1_id', 'geo_level_2_id'], 'damage_grade')
#hierarchy = target_encoding(df, ['geo_level_2_id', 'geo_level_3_id'], 'damage_grade')
#mayhem = target_encoding(df, ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], 'damage_grade')

geo_level_1_id  geo_level_2_id
0               62                2.186047
                69                1.922078
                81                2.050847
                114               2.074074
                146               2.238095
Name: damage_grade, dtype: float64
grouping by ['geo_level_1_id', 'geo_level_2_id']
<class 'pandas.core.series.Series'>
group keys: ['2.1617242935072394_487' '2.485272536687631_900' '2.5633689300826115_363'
 ... '2.0618279569892475_77' '2.0618279569892475_115'
 '2.0618279569892475_627']
group keys: ['0_62' '0_69' '0_81' ... '30_1304' '30_1370' '30_1391']
   geo_level_1_id  geo_level_2_id  damage_grade group_key  mean_geo_level_2_id
0               0              62      2.186047      0_62             2.186047


KeyError: 'geo_level_1_id'

In [54]:
dichotomy.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade_x,group_key_x,damage_grade_y,group_key_y,mean_geo_level_2_id
0,2.161724,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,3,2.1617242935072394_487,,,
1,2.485273,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,2,2.485272536687631_900,,,
2,2.563369,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,3,2.5633689300826115_363,,,
3,2.00096,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,2,2.0009596928982725_418,,,
4,2.337713,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,3,2.337712895377129_131,,,


In [17]:
hierarchy.geo_level_2_id.value_counts(dropna=False)

geo_level_2_id
NaN    260601
Name: count, dtype: int64

In [None]:
hierarchy['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'].value_counts(dropna=False)

In [None]:
def simple_target_encoding(dataframe, columns_to_encode, target_variable):
    encoded_df = dataframe.copy()
    for i, column in enumerate(columns_to_encode):
        if i == 0:
            mean = dataframe.groupby(column)[target_variable].mean()
            encoded_df[column] = encoded_df[column].map(mean)

    return encoded_df