# Load Functions

In [18]:
from src import (FeatureEngineering, FeatureSelection, train_test_split_function, 
                           check_numerical_columns, find_best_score, run_and_save, 
                           tune_model, compute_mean_encodings, apply_mean_encodings,perform_parameter_tuning)

## Parameter Tuning (Optional)

You can choose to skip the parameter tuning section by setting `skip_parameter_tuning` to `True`. If skipped, the notebook will use default hyperparameters.

To run parameter tuning, set `skip_parameter_tuning` to `False`.


In [19]:
# Toggle for the optional tuning stage: True skips it, False runs it.
# Defined unconditionally because the find_best_score cell further down reads
# this flag as well; a `__main__` guard serves no purpose in a notebook cell
# and would only make the flag conditionally defined.
skip_parameter_tuning = True
parameter_tuning_result = perform_parameter_tuning(skip_parameter_tuning=skip_parameter_tuning)

Parameter tuning skipped. Using default hyperparameters.


# Load Data & Feature Engineering

In [20]:
# Build the training and test frames in one call. Per the keyword names,
# non-numerical columns and empty rows are dropped (confirm in src).
df, test_df = FeatureEngineering(
    drop_non_numerical=True,
    drop_empty_rows=True,
)

In [21]:
# Keep the test-set building ids aside: the column is left out of the feature
# lists built below, and run_and_save takes the ids as a separate argument.
building_id = test_df["building_id"]

In [22]:
# Every column of the training frame except the identifier is kept
# (the target 'damage_grade' is still part of this list at this point).
is_feature_column = df.columns != "building_id"
selected_features = df.columns[is_feature_column]

In [23]:
# Feature list for the test frame: the training list minus the target column.
selected_features_test = [col for col in selected_features if col != 'damage_grade']

# Restrict each frame to its own feature list. FeatureSelection comes from src;
# presumably a plain column subset -- confirm.
selected_df = FeatureSelection(df, selected_features)
selected_test_df = FeatureSelection(test_df, selected_features_test)

# Encoding

In [24]:
# Mean-encoding tables for the three geo-level id columns, keyed on the target.
# NOTE(review): these are computed from the whole of selected_df, i.e. before
# the train/validation split further down, so validation rows contribute to
# their own encodings and the reported accuracy may be optimistic.
mean_encodings = compute_mean_encodings(
    dataframe=selected_df,
    target_variable='damage_grade',
    columns_to_encode=['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'],
)

In [25]:
# Apply the encodings to the training frame.
# NOTE(review): selected_df is rebound to the encoded result, so re-running
# only this cell feeds already-encoded data back in -- confirm that is harmless.
selected_df = apply_mean_encodings(
    dataframe=selected_df,
    mean_encodings=mean_encodings,
)

In [26]:
# Reuse the encodings computed from the training frame on the test frame.
selected_test_df = apply_mean_encodings(
    dataframe=selected_test_df,
    mean_encodings=mean_encodings,
)

In [27]:
# Sanity check on the encoded training frame before modelling; the recorded run
# printed "Yes". Presumably verifies that every column is numeric -- confirm in src.
check_numerical_columns(selected_df)

Yes


# Train/Validation Split

In [28]:
# Name of the label column handed to the split helper. Note that the earlier
# cells spell 'damage_grade' out literally rather than using this constant.
target_column_name = 'damage_grade'

In [29]:
# Split the encoded training frame into train and validation parts.
# NOTE(review): no seed is passed here, so reproducibility depends on what
# train_test_split_function does internally -- confirm.
X_train, X_val, y_train, y_val = train_test_split_function(
    selected_df,
    target_column_name,
)

# Finding best parameters

In [30]:
# Hyperparameter search on the training split; all arguments are passed through
# unchanged. NOTE(review): 2, 10, 2 look like a start/stop/step range and
# [0.1, 0.01] like candidate learning rates -- confirm against the signature of
# find_best_score. The skip flag set in the tuning cell is forwarded as well.
best_params = find_best_score(
    X_train,
    y_train,
    2,
    10,
    2,
    [0.1, 0.01],
    model='XGB',
    skip_parameter_tuning=skip_parameter_tuning,
)

In [31]:
# Display the hyperparameters returned by find_best_score.
best_params

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

# Cross-Validation and Model-Fit

In [32]:
# Fit with the chosen hyperparameters; returns the fitted model and an accuracy figure.
# NOTE(review): the recorded output warns 'Parameters: { "n_estimators" } are not
# used.', so the n_estimators entry of best_params appears to be ignored by the
# underlying XGBoost call -- check tune_model in src.
fitted_model, accuracy = tune_model(
    X_train,
    y_train,
    X_val,
    y_val,
    best_params,
    model='XGB',
)

[0]	Test-mlogloss:1.03592
[1]	Test-mlogloss:0.98283


Parameters: { "n_estimators" } are not used.



[2]	Test-mlogloss:0.93765
[3]	Test-mlogloss:0.89851
[4]	Test-mlogloss:0.86480
[5]	Test-mlogloss:0.83521
[6]	Test-mlogloss:0.80947
[7]	Test-mlogloss:0.78674
[8]	Test-mlogloss:0.76671
[9]	Test-mlogloss:0.74919
[10]	Test-mlogloss:0.73357
[11]	Test-mlogloss:0.71963
[12]	Test-mlogloss:0.70719
[13]	Test-mlogloss:0.69618
[14]	Test-mlogloss:0.68649
[15]	Test-mlogloss:0.67782
[16]	Test-mlogloss:0.66990
[17]	Test-mlogloss:0.66271
[18]	Test-mlogloss:0.65629
[19]	Test-mlogloss:0.65041
[20]	Test-mlogloss:0.64514
[21]	Test-mlogloss:0.64019
[22]	Test-mlogloss:0.63575
[23]	Test-mlogloss:0.63170
[24]	Test-mlogloss:0.62802
[25]	Test-mlogloss:0.62460
[26]	Test-mlogloss:0.62154
[27]	Test-mlogloss:0.61866
[28]	Test-mlogloss:0.61596
[29]	Test-mlogloss:0.61354
[30]	Test-mlogloss:0.61118
[31]	Test-mlogloss:0.60908
[32]	Test-mlogloss:0.60701
[33]	Test-mlogloss:0.60510
[34]	Test-mlogloss:0.60340
[35]	Test-mlogloss:0.60169
[36]	Test-mlogloss:0.60017
[37]	Test-mlogloss:0.59867
[38]	Test-mlogloss:0.59735
[39]	Test

In [33]:
# Display the accuracy returned by tune_model.
accuracy

0.7440379117821991

# Predict on the test set and save CSV

In [34]:
# Hand the fitted model, the encoded test frame and the saved building ids to
# run_and_save; the recorded run wrote a timestamped CSV under data/output/.
run_and_save(fitted_model, selected_test_df, building_id)

File saved as data/output/predictions_2024-02-03_12-47-16.csv
