## Parameter Tuning (Optional)

You can choose to skip the parameter tuning section by setting `skip_parameter_tuning` to `True`. If skipped, the notebook will use default hyperparameters.

To run parameter tuning, set `skip_parameter_tuning` to `False`.


In [1]:
# Notebook-wide switch for the optional hyperparameter search; it is passed to find_best_score further down.
skip_parameter_tuning = True  # True skips tuning (default hyperparameters are used); False runs it

# Load Functions

In [2]:
from src import (FeatureEngineering, FeatureSelection, train_test_split_function, 
                           check_numerical_columns, find_best_score, run_and_save, 
                           tune_model, compute_mean_encodings, apply_mean_encodings)

# Load Data & Feature Engineering

In [3]:
# Build the labelled frame (df) and the test frame (test_df) in a single call.
df, test_df = FeatureEngineering(
    drop_non_numerical=True,
    drop_empty_rows=True,
)

In [4]:
# Set the test-frame building ids aside; they are passed to run_and_save in the last cell.
building_id = test_df["building_id"]

In [5]:
# Keep every column except building_id. The result still contains the damage_grade
# target column, which is filtered out separately for the test frame.
not_building_id = df.columns != "building_id"
selected_features = df.columns[not_building_id]

In [6]:
# The test frame is selected without the 'damage_grade' target column.
selected_features_test = [
    column for column in selected_features if column != 'damage_grade'
]

# FeatureSelection presumably returns the frame restricted to the given columns — confirm in src.
selected_df = FeatureSelection(df, selected_features)
selected_test_df = FeatureSelection(test_df, selected_features_test)

# Encoding

In [7]:
# Compute encodings of the three geo-level id columns against the damage_grade target.
# NOTE(review): the encodings are computed from all of selected_df, and the train/validation
# split only happens in a later cell. Validation rows therefore contribute to their own
# encoded features (possible target leakage) — consider fitting on the training split only.
mean_encodings = compute_mean_encodings(
    dataframe=selected_df,
    target_variable='damage_grade',
    columns_to_encode=['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'],
)

In [8]:
# Apply the encodings to the labelled frame.
# NOTE(review): selected_df is overwritten with its own encoded version, so re-running this
# cell on its own would presumably encode already-encoded values — re-run from the
# feature-selection cell instead.
selected_df = apply_mean_encodings(
    dataframe=selected_df,
    mean_encodings=mean_encodings,
)

In [9]:
# Encode the test frame with the encodings computed from selected_df (nothing is refitted on test data).
selected_test_df = apply_mean_encodings(
    dataframe=selected_test_df,
    mean_encodings=mean_encodings,
)

In [10]:
# Sanity check on the encoded frame; presumably verifies that every column is numeric — confirm in src.
check_numerical_columns(selected_df)

Yes


# Train/Validation Split

In [11]:
# Name of the label column handed to train_test_split_function in the next cell.
# The same literal is also hard-coded in the feature-selection and encoding cells.
target_column_name = 'damage_grade'

In [12]:
# Split selected_df into training and validation features/labels.
# NOTE(review): split ratio and random seed are not visible here — confirm they are fixed in src.
X_train, X_val, y_train, y_val = train_test_split_function(selected_df,target_column_name)

# Finding best parameters

In [13]:
# Obtain the XGB hyperparameters; the notebook-level skip_parameter_tuning flag is forwarded as-is.
# NOTE(review): the positional 2, 10, 2 and [0.1, 0.01] are unlabeled here — presumably a
# search range/step and candidate learning rates; confirm in src and prefer keyword arguments.
best_params = find_best_score(
    X_train,
    y_train,
    2,
    10,
    2,
    [0.1, 0.01],
    model='XGB',
    skip_parameter_tuning=skip_parameter_tuning,
)

In [15]:
# Show the hyperparameters returned by find_best_score.
best_params

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [16]:
# NOTE(review): this value is never used — infolist is reassigned to an empty list in a
# later cell before it is passed to tune_model. Candidate for removal.
infolist = [1,2]

# Cross-Validation and Model-Fit

In [17]:
# Same dictionary as displayed after the tuning cell; shown again right before model fitting.
best_params

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [18]:
# Fresh, empty list handed to tune_model below (presumably filled with run info — confirm in src).
infolist = []

In [19]:
# Display the training features; the rendering is truncated to the first/last rows and columns.
X_train

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
231255,2.563369,2.831224,2.634146,3,5,7,9,0,1,0,...,0,0,0,0,0,0,0,0,0,0
14355,1.926464,2.131646,2.153846,3,0,9,7,0,1,0,...,0,0,0,0,0,0,0,0,0,0
159804,2.446457,2.402082,2.526316,2,15,17,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
205693,2.297726,2.218579,2.100000,2,10,7,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
154754,1.730887,1.649106,1.892086,1,0,8,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11944,1.960755,1.934426,1.500000,2,50,5,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
98904,2.485273,2.518519,2.184211,1,5,3,3,0,1,0,...,1,0,0,0,0,0,0,0,0,0
127191,2.485273,2.699571,2.590909,2,25,5,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
165274,1.937656,1.948819,2.029412,2,30,5,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Fit the XGB model with the selected hyperparameters, passing the validation split alongside.
# NOTE(review): the run logs 'Parameters: { "n_estimators" } are not used.', so the
# n_estimators entry of best_params is ignored by the booster; the number of boosting
# rounds is presumably fixed inside tune_model — confirm in src.
fitted_model, accuracy = tune_model(
    X_train,
    y_train,
    X_val,
    y_val,
    best_params,
    infolist,
    model='XGB',
)

[0]	Test-mlogloss:1.03535
[1]	Test-mlogloss:0.98199
[2]	Test-mlogloss:0.93611
[3]	Test-mlogloss:0.89653
[4]	Test-mlogloss:0.86249
[5]	Test-mlogloss:0.83258
[6]	Test-mlogloss:0.80637
[7]	Test-mlogloss:0.78327
[8]	Test-mlogloss:0.76308
[9]	Test-mlogloss:0.74503


Parameters: { "n_estimators" } are not used.



[10]	Test-mlogloss:0.72928
[11]	Test-mlogloss:0.71525
[12]	Test-mlogloss:0.70275
[13]	Test-mlogloss:0.69167
[14]	Test-mlogloss:0.68165
[15]	Test-mlogloss:0.67271
[16]	Test-mlogloss:0.66455
[17]	Test-mlogloss:0.65719
[18]	Test-mlogloss:0.65071
[19]	Test-mlogloss:0.64465
[20]	Test-mlogloss:0.63927
[21]	Test-mlogloss:0.63426
[22]	Test-mlogloss:0.62959
[23]	Test-mlogloss:0.62539
[24]	Test-mlogloss:0.62155
[25]	Test-mlogloss:0.61808
[26]	Test-mlogloss:0.61483
[27]	Test-mlogloss:0.61185
[28]	Test-mlogloss:0.60911
[29]	Test-mlogloss:0.60655
[30]	Test-mlogloss:0.60416
[31]	Test-mlogloss:0.60201
[32]	Test-mlogloss:0.59992
[33]	Test-mlogloss:0.59802
[34]	Test-mlogloss:0.59615
[35]	Test-mlogloss:0.59443
[36]	Test-mlogloss:0.59281
[37]	Test-mlogloss:0.59128
[38]	Test-mlogloss:0.58984
[39]	Test-mlogloss:0.58850
[40]	Test-mlogloss:0.58718
[41]	Test-mlogloss:0.58584
[42]	Test-mlogloss:0.58469
[43]	Test-mlogloss:0.58358
[44]	Test-mlogloss:0.58259
[45]	Test-mlogloss:0.58164
[46]	Test-mlogloss:0.58071
[

In [21]:
# Accuracy value returned by tune_model (presumably measured on X_val / y_val — confirm in src).
accuracy

0.749045490301414

# Predict on the test set and save the predictions as CSV

In [22]:
# Run the fitted model on the encoded test frame and write the result to a timestamped CSV
# (the saved path is printed); building_id is the id column set aside from test_df earlier.
run_and_save(fitted_model,selected_test_df,building_id)

File saved as data/output/predictions_2024-02-03_12-24-11.csv
