# Load Functions

In [2]:
from scripts import (FeatureEngineering, FeatureSelection, train_test_split_function, 
                           check_numerical_columns, find_best_score, run_and_save, 
                           tune_model, compute_mean_encodings, apply_mean_encodings)

# Load Data & Feature Engineering

In [52]:
# Build the engineered training and test frames; both flags are forwarded
# unchanged to FeatureEngineering (imported from `scripts`).
df, test_df = FeatureEngineering(
    drop_non_numerical=True,
    drop_empty_rows=True,
)

In [53]:
# Keep the test-set building ids aside; they are passed to run_and_save in
# the final cell together with the fitted model and the test features.
building_id = test_df.building_id

In [54]:
# Start from every column of the engineered training frame. The next cell
# strips 'damage_grade' from this list to build the test-frame feature list.
selected_features = df.columns

In [55]:
# Feature list for the test frame: the training columns minus the
# 'damage_grade' target, kept in their original order.
selected_features_test = list(
    filter(lambda column: column != 'damage_grade', selected_features)
)

# Restrict each frame to its own feature list via FeatureSelection
# (imported from `scripts`).
selected_df = FeatureSelection(df, selected_features)
selected_test_df = FeatureSelection(test_df, selected_features_test)

# Encoding

In [56]:
# Derive the encodings for the three geo-level id columns from the labelled
# frame, with 'damage_grade' as the target.
# NOTE(review): this runs on the full frame *before* the train/validation
# split further down, so validation rows contribute to their own encodings.
# That looks like target leakage into the validation score — confirm, and
# consider computing the encodings from the training part only.
mean_encodings = compute_mean_encodings(
    dataframe=selected_df,
    target_variable='damage_grade',
    columns_to_encode=['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'],
)

In [57]:
# Apply the encodings to the training frame.
# NOTE(review): selected_df is reassigned from itself, so re-running only
# this cell feeds an already-processed frame back in — confirm that
# apply_mean_encodings tolerates that, or re-run from the selection cell.
selected_df = apply_mean_encodings(dataframe=selected_df, mean_encodings=mean_encodings)

In [58]:
# Apply the same encodings (computed from the training frame above) to the
# test frame, so both frames share one mapping.
selected_test_df = apply_mean_encodings(dataframe=selected_test_df, mean_encodings=mean_encodings)

In [59]:
# Sanity check on the encoded training frame. Presumably verifies that every
# column is numerical (the recorded run printed "Yes") — confirm in `scripts`.
check_numerical_columns(selected_df)

Yes


# Train/Validation Split

In [60]:
# Name of the label column handed to the split helper in the next cell.
# NOTE(review): earlier cells spell the same 'damage_grade' literal inline.
target_column_name = 'damage_grade'

In [61]:
# Split the encoded frame into training and validation features / labels.
# NOTE(review): no seed is passed here — confirm the helper fixes one,
# otherwise the split (and every score below) changes between runs.
X_train, X_val, y_train, y_val = train_test_split_function(
    selected_df,
    target_column_name,
)

# Finding best parameters

In [24]:
# Hyper-parameter search for the XGB model on the training split.
# NOTE(review): the positional 2, 10, 2 and the [0.1, 0.01] list are
# presumably a max_depth range (start, stop, step) and the candidate
# learning rates — TODO confirm against find_best_score in `scripts`.
# NOTE(review): this cell's execution count (24) is lower than that of the
# data cells above it (52-61), so `results` may predate the current
# X_train / y_train — re-run the notebook top to bottom before trusting it.
results = find_best_score(
    X_train,
    y_train,
    2,
    10,
    2,
    [0.1, 0.01],
    model='XGB',
)

In [62]:
# Pick the winning configuration from the search results. Each value in
# `results` is a (params, score) pair; the highest score wins and, on a tie,
# the earliest entry is kept because the comparison is strict.
best_params, best_score = None, float('-inf')  # -inf: any real score replaces it

for candidate_params, candidate_score in results.values():
    if candidate_score > best_score:
        best_params, best_score = candidate_params, candidate_score

print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: {'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}
Best Score: 0.7672138164460327


In [63]:
# NOTE(review): this value is never read — infolist is rebound to [] two
# cells further down, before tune_model receives it. Likely a leftover
# experiment cell that can be deleted.
infolist = [1,2]

# Cross-Validation and Model-Fit

In [64]:
# Display the parameters that are about to be passed to tune_model.
best_params

{'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}

In [65]:
# Fresh, empty list passed to tune_model in the next cell; what tune_model
# does with it is not visible in this notebook.
infolist = []

In [66]:
# Fit the XGB model with the selected parameters; tune_model returns the
# fitted model together with an accuracy value.
# NOTE(review): the recorded run warns `Parameters: { "num_boost_round" }
# are not used.` — the key sits inside best_params, but XGBoost takes the
# number of boosting rounds as an argument of the training call, not as a
# booster parameter. The tuned round count is therefore ignored; check how
# tune_model forwards best_params.
fitted_model, accuracy = tune_model(
    X_train,
    y_train,
    X_val,
    y_val,
    best_params,
    infolist,
    model='XGB',
)

[0]	Test-mlogloss:1.09243
[1]	Test-mlogloss:1.08634
[2]	Test-mlogloss:1.08035
[3]	Test-mlogloss:1.07446
[4]	Test-mlogloss:1.06866
[5]	Test-mlogloss:1.06297
[6]	Test-mlogloss:1.05735
[7]	Test-mlogloss:1.05182
[8]	Test-mlogloss:1.04637
[9]	Test-mlogloss:1.04101


Parameters: { "num_boost_round" } are not used.



[10]	Test-mlogloss:1.03573
[11]	Test-mlogloss:1.03057
[12]	Test-mlogloss:1.02544
[13]	Test-mlogloss:1.02043
[14]	Test-mlogloss:1.01542
[15]	Test-mlogloss:1.01055
[16]	Test-mlogloss:1.00568
[17]	Test-mlogloss:1.00096
[18]	Test-mlogloss:0.99623
[19]	Test-mlogloss:0.99163
[20]	Test-mlogloss:0.98704
[21]	Test-mlogloss:0.98258
[22]	Test-mlogloss:0.97812
[23]	Test-mlogloss:0.97375
[24]	Test-mlogloss:0.96948
[25]	Test-mlogloss:0.96521
[26]	Test-mlogloss:0.96106
[27]	Test-mlogloss:0.95690
[28]	Test-mlogloss:0.95286
[29]	Test-mlogloss:0.94881
[30]	Test-mlogloss:0.94482
[31]	Test-mlogloss:0.94094
[32]	Test-mlogloss:0.93706
[33]	Test-mlogloss:0.93329
[34]	Test-mlogloss:0.92951
[35]	Test-mlogloss:0.92578
[36]	Test-mlogloss:0.92216
[37]	Test-mlogloss:0.91853
[38]	Test-mlogloss:0.91499
[39]	Test-mlogloss:0.91146
[40]	Test-mlogloss:0.90803
[41]	Test-mlogloss:0.90460
[42]	Test-mlogloss:0.90121
[43]	Test-mlogloss:0.89791
[44]	Test-mlogloss:0.89466
[45]	Test-mlogloss:0.89140
[46]	Test-mlogloss:0.88823
[

In [67]:
# Show the accuracy value returned by tune_model above.
accuracy

0.7289384317261757

# Predict on the test set and save the CSV

In [68]:
# Hand the fitted model, the encoded test frame and the saved building ids
# to run_and_save; the recorded run wrote a timestamped predictions CSV
# under data/output/.
run_and_save(
    fitted_model,
    selected_test_df,
    building_id,
)

File saved as data/output/predictions_2024-02-03_11-07-23.csv
