In [34]:
from scripts.fe import FeatureEngineering
from scripts.fs import FeatureSelection
from scripts.fs import train_test_split_function
from scripts.fs import check_numerical_columns
from scripts.cross_val import find_best_score

from sklearn.ensemble import RandomForestClassifier
from scripts.modelling import prediction_to_csv
from sklearn.metrics import mean_absolute_error
from scripts.lazy_model import lazy_model
from scripts.final_predicter import run_and_save
import pandas as pd
from datetime import datetime
from scripts.fe import mean_encode
from scripts.modelling import tune_model
from scripts.fe import compute_mean_encodings
from scripts.fe import apply_mean_encodings

# Feature Engineering

In [35]:
# Build the training and test frames with the project's feature-engineering step.
# NOTE(review): the two flags presumably drop non-numeric columns and rows with
# missing values — confirm against scripts/fe.py.
df, test_df = FeatureEngineering(
    drop_non_numerical=True,
    drop_empty_rows=True,
)

In [36]:
# Keep the test-set building IDs aside; they are handed to run_and_save at the end.
building_id = test_df['building_id']

In [37]:
# Start from every column of df; 'damage_grade' is filtered out of the
# test-side list in the following cell.
selected_features = df.columns

In [49]:
# Training frame: keep the columns listed in selected_features.
selected_df = FeatureSelection(df, selected_features)

# Test frame: same column list with 'damage_grade' left out.
selected_features_test = [name for name in selected_features if name != 'damage_grade']
selected_test_df = FeatureSelection(test_df, selected_features_test)

# Encoding

In [39]:
# Compute mean encodings of 'damage_grade' for the three geo-level ID columns.
# NOTE(review): these are computed on the full selected_df, before the
# train/validation split further down, so validation rows contribute to the
# encodings they are later scored with (target leakage) — consider computing
# them from the training split only.
mean_encodings = compute_mean_encodings(
    dataframe=selected_df,
    target_variable='damage_grade',
    columns_to_encode=['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'],
)

In [40]:
# Apply the mean encodings to the training frame.
# NOTE(review): selected_df is reassigned from itself, so re-running this cell
# feeds already-encoded data back in — presumably it should run once per session.
selected_df = apply_mean_encodings(dataframe=selected_df, mean_encodings=mean_encodings)

In [41]:
# Apply the same encodings (computed from the training frame) to the test frame.
selected_test_df = apply_mean_encodings(dataframe=selected_test_df, mean_encodings=mean_encodings)

In [19]:
# Sanity check on the encoded training frame; the recorded run printed "Yes".
# NOTE(review): presumably confirms that every column is numeric — confirm in scripts/fs.py.
check_numerical_columns(selected_df)

Yes


# Train/Validation Split

In [20]:
# Name of the target column, passed to the split below. The same literal is
# also written out directly in the feature-selection and encoding cells.
target_column_name = 'damage_grade'

In [21]:
# Split the encoded training frame into training and validation parts.
# NOTE(review): no seed or split ratio is passed here — confirm that
# train_test_split_function fixes them internally so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split_function(selected_df,target_column_name)

# Finding best parameters

In [24]:
# Hyperparameter search for the XGB model on the training split.
# NOTE(review): the positional arguments are not named here — presumably
# 2, 10, 2 describe a max_depth range and [0.1, 0.01] the learning rates to
# try; confirm against scripts/cross_val.py.
results = find_best_score(
    X_train,
    y_train,
    2,
    10,
    2,
    [0.1, 0.01],
    model='XGB',
)

In [42]:
# Scan the search results for the highest-scoring parameter set.
# Each value in `results` is a (params, score) pair; the keys are not needed.
best_params, best_score = None, float('-inf')  # -inf so the first real score wins

for candidate_params, candidate_score in results.values():
    # Strict '>' keeps the earliest entry when scores tie.
    if candidate_score > best_score:
        best_params, best_score = candidate_params, candidate_score

print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: {'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}
Best Score: 0.7672138164460327


In [169]:
# NOTE(review): stale cell — its execution count (169) is out of sequence, and
# `infolist` is reassigned to [] before tune_model receives it, so this value
# is never read in the cells shown. Candidate for removal.
infolist = [1,2]

# Cross-Validation and Model-Fit

In [43]:
# Display the chosen hyperparameters before they are passed to tune_model.
best_params

{'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}

In [44]:
# Empty list handed to tune_model in the next cell.
# NOTE(review): presumably tune_model appends run information to it — confirm in scripts/modelling.py.
infolist = []

In [45]:
# Fit the XGB model with the selected hyperparameters, using the validation
# split for evaluation.
# NOTE(review): the recorded run logs 'Parameters: { "num_boost_round" } are not used.'
# Presumably best_params reaches the XGBoost params dict unchanged, where
# num_boost_round is not a recognised key — check how tune_model forwards it
# in scripts/modelling.py.
fitted_model, accuracy = tune_model(
    X_train,
    y_train,
    X_val,
    y_val,
    best_params,
    infolist,
    model='XGB',
)

[0]	Test-mlogloss:1.09240
[1]	Test-mlogloss:1.08629
[2]	Test-mlogloss:1.08027
[3]	Test-mlogloss:1.07436
[4]	Test-mlogloss:1.06853
[5]	Test-mlogloss:1.06281


Parameters: { "num_boost_round" } are not used.



[6]	Test-mlogloss:1.05716
[7]	Test-mlogloss:1.05161
[8]	Test-mlogloss:1.04616
[9]	Test-mlogloss:1.04076
[10]	Test-mlogloss:1.03548
[11]	Test-mlogloss:1.03021
[12]	Test-mlogloss:1.02508
[13]	Test-mlogloss:1.01994
[14]	Test-mlogloss:1.01497
[15]	Test-mlogloss:1.01006
[16]	Test-mlogloss:1.00514
[17]	Test-mlogloss:1.00038
[18]	Test-mlogloss:0.99560
[19]	Test-mlogloss:0.99098
[20]	Test-mlogloss:0.98634
[21]	Test-mlogloss:0.98185
[22]	Test-mlogloss:0.97734
[23]	Test-mlogloss:0.97294
[24]	Test-mlogloss:0.96863
[25]	Test-mlogloss:0.96431
[26]	Test-mlogloss:0.96005
[27]	Test-mlogloss:0.95592
[28]	Test-mlogloss:0.95178
[29]	Test-mlogloss:0.94776
[30]	Test-mlogloss:0.94374
[31]	Test-mlogloss:0.93983
[32]	Test-mlogloss:0.93593
[33]	Test-mlogloss:0.93207
[34]	Test-mlogloss:0.92833
[35]	Test-mlogloss:0.92458
[36]	Test-mlogloss:0.92094
[37]	Test-mlogloss:0.91729
[38]	Test-mlogloss:0.91369
[39]	Test-mlogloss:0.91019
[40]	Test-mlogloss:0.90669
[41]	Test-mlogloss:0.90327
[42]	Test-mlogloss:0.89986
[43]	

In [47]:
# Second value returned by tune_model (0.7316 in the recorded run).
# NOTE(review): presumably accuracy on (X_val, y_val); it is lower than the
# search's best score of 0.7672 — confirm both numbers use the same metric.
accuracy

0.7315669307956486

# Predict on the test set and save the CSV

In [48]:
# Pass the fitted model, the encoded test frame and the building IDs to
# run_and_save; the recorded run reports a CSV written under data/output/.
# NOTE(review): presumably this predicts on selected_test_df and pairs each
# prediction with its building_id — confirm in scripts/final_predicter.py.
run_and_save(fitted_model,selected_test_df,building_id)

File saved as data/output/predictions_2024-02-03_10-48-32.csv
