# Load Functions

In [5]:
from src import (FeatureEngineering, FeatureSelection, train_test_split_function, 
                           check_numerical_columns, find_best_score, run_and_save, 
                           tune_model, compute_mean_encodings, apply_mean_encodings)

# Load Data & Feature Engineering

In [6]:
# Build the labelled frame and the test frame with the project's
# FeatureEngineering helper; both keyword flags are switched on.
df, test_df = FeatureEngineering(
    drop_empty_rows=True,
    drop_non_numerical=True,
)

In [7]:
# Keep the test-set identifiers aside; they are handed to run_and_save at the
# end of the notebook alongside the fitted model.
building_id = test_df["building_id"]

In [29]:
# Keep every column label except the "building_id" identifier.
not_id_column = df.columns != "building_id"
selected_features = df.columns[not_id_column]

In [32]:
# Feature list for the test frame: same labels, with 'damage_grade' filtered out.
selected_features_test = [name for name in selected_features if name != 'damage_grade']

# Run the project's FeatureSelection helper on each frame with its own label list.
selected_df = FeatureSelection(df, selected_features)
selected_test_df = FeatureSelection(test_df, selected_features_test)

# Encoding

In [35]:
# Build the encoding lookup for the three geo-level id columns against the
# 'damage_grade' target.
# NOTE(review): the lookup is computed from the full `selected_df`, which is only
# split into train/validation further down, so validation rows contribute to
# their own encodings — likely target leakage; consider fitting on the
# training split only.
mean_encodings = compute_mean_encodings(
    dataframe=selected_df,
    target_variable='damage_grade',
    columns_to_encode=['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'],
)

In [36]:
# Apply the lookup to the labelled frame.
# NOTE(review): `selected_df` is rebound to its own transformed version, so this
# cell is not idempotent — re-running it passes the already-transformed frame
# back into the helper.
selected_df = apply_mean_encodings(
    mean_encodings=mean_encodings,
    dataframe=selected_df,
)

In [37]:
# Apply the same lookup (computed from the labelled frame) to the test frame.
selected_test_df = apply_mean_encodings(
    mean_encodings=mean_encodings,
    dataframe=selected_test_df,
)

In [38]:
# Sanity check on the encoded labelled frame via the project helper; the recorded
# run printed "Yes". NOTE(review): presumably this confirms all columns are
# numeric — confirm against the helper's implementation.
check_numerical_columns(selected_df)

Yes


# Train/Validation Split

In [39]:
# Name of the label column used for the split below.
# NOTE(review): the same 'damage_grade' literal is hard-coded in the earlier
# feature-selection and encoding cells; defining this constant once near the
# top would keep them in sync.
target_column_name = 'damage_grade'

In [40]:
# Split the encoded labelled frame into training and validation parts.
# NOTE(review): no seed is passed here — confirm the helper fixes its own
# random state, otherwise the split is not reproducible.
X_train, X_val, y_train, y_val = train_test_split_function(selected_df, target_column_name)

# Finding best parameters

In [16]:
# Hyper-parameter search on the training split for the 'XGB' model.
# NOTE(review): the positional values 2, 10, 2 and the list [0.1, 0.01] are
# undocumented here — presumably a depth range/step and candidate learning
# rates; confirm against find_best_score and consider named constants.
results = find_best_score(X_train, y_train, 2, 10, 2, [0.1, 0.01], model='XGB')

In [17]:
# Scan the search results for the highest score. Each value in `results` is a
# (params, score) pair; the strict comparison keeps the earliest entry on ties.
best_params, best_score = None, float('-inf')

for candidate_params, candidate_score in results.values():
    if candidate_score > best_score:
        best_params, best_score = candidate_params, candidate_score

print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: {'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}
Best Score: 0.7660145415496353


In [41]:
# NOTE(review): dead assignment — `infolist` is rebound to an empty list before
# it is passed to tune_model, so this value is never used. Looks like leftover
# scratch; consider deleting the cell.
infolist = [1,2]

# Cross-Validation and Model-Fit

In [42]:
# Display the hyper-parameters chosen by the search before fitting with them.
best_params

{'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}

In [43]:
# Empty list handed to tune_model below. NOTE(review): how tune_model uses it
# is not visible in this notebook — confirm against the helper.
infolist = []

In [44]:
# Inspect the training features; the geo_level_* columns hold the float
# encodings rather than raw ids. NOTE(review): `X_train.head()` would keep the
# output short.
X_train

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
110375,2.161724,1.836268,1.863636,3,85,7,7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172850,2.297726,2.074919,2.071713,1,0,14,3,0,0,0,...,0,1,0,0,0,0,0,0,0,0
63463,2.485273,2.440141,2.250000,2,70,11,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
128620,2.054054,1.948413,2.000000,2,25,5,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
247615,2.341954,2.130790,2.142857,2,25,5,4,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235946,2.331565,2.481379,2.609756,2,15,6,5,1,0,0,...,0,0,0,0,0,0,0,0,0,0
217227,1.730887,1.969626,1.933333,3,15,18,7,1,0,0,...,0,0,0,0,0,0,0,0,0,0
86000,1.926464,1.433054,1.160000,2,25,3,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
218690,2.054054,2.024793,2.000000,2,35,8,7,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Fit the 'XGB' model with the selected hyper-parameters and score it on the
# validation split.
# NOTE(review): the recorded output warns that "num_boost_round" is not used as
# a parameter. `best_params` carries that key, so confirm tune_model forwards it
# as the number of boosting rounds instead of leaving it in the params dict.
fitted_model, accuracy = tune_model(X_train, y_train, X_val, y_val, best_params, infolist, model='XGB')

[0]	Test-mlogloss:1.09243
[1]	Test-mlogloss:1.08634
[2]	Test-mlogloss:1.08035
[3]	Test-mlogloss:1.07446
[4]	Test-mlogloss:1.06866
[5]	Test-mlogloss:1.06297
[6]	Test-mlogloss:1.05735
[7]	Test-mlogloss:1.05182
[8]	Test-mlogloss:1.04637
[9]	Test-mlogloss:1.04101


Parameters: { "num_boost_round" } are not used.



[10]	Test-mlogloss:1.03573
[11]	Test-mlogloss:1.03057
[12]	Test-mlogloss:1.02544
[13]	Test-mlogloss:1.02043
[14]	Test-mlogloss:1.01542
[15]	Test-mlogloss:1.01055
[16]	Test-mlogloss:1.00568
[17]	Test-mlogloss:1.00096
[18]	Test-mlogloss:0.99623
[19]	Test-mlogloss:0.99163
[20]	Test-mlogloss:0.98704
[21]	Test-mlogloss:0.98258
[22]	Test-mlogloss:0.97812
[23]	Test-mlogloss:0.97375
[24]	Test-mlogloss:0.96948
[25]	Test-mlogloss:0.96521
[26]	Test-mlogloss:0.96106
[27]	Test-mlogloss:0.95690
[28]	Test-mlogloss:0.95286
[29]	Test-mlogloss:0.94881
[30]	Test-mlogloss:0.94482
[31]	Test-mlogloss:0.94094
[32]	Test-mlogloss:0.93706
[33]	Test-mlogloss:0.93329
[34]	Test-mlogloss:0.92951
[35]	Test-mlogloss:0.92578
[36]	Test-mlogloss:0.92216
[37]	Test-mlogloss:0.91853
[38]	Test-mlogloss:0.91499
[39]	Test-mlogloss:0.91146
[40]	Test-mlogloss:0.90803
[41]	Test-mlogloss:0.90460
[42]	Test-mlogloss:0.90121
[43]	Test-mlogloss:0.89791
[44]	Test-mlogloss:0.89466
[45]	Test-mlogloss:0.89140
[46]	Test-mlogloss:0.88823
[

In [46]:
# Show the second value returned by tune_model for the validation split.
accuracy

0.7289384317261757

# Predict on the test set and save the CSV

In [47]:
# Hand the fitted model, the encoded test frame and the saved building ids to
# the project helper; the recorded run reports a timestamped CSV under data/output/.
run_and_save(fitted_model, selected_test_df, building_id)

File saved as data/output/predictions_2024-02-03_12-05-49.csv
