# Load Functions

In [1]:
from src import (FeatureEngineering, FeatureSelection, train_test_split_function, 
                           check_numerical_columns, find_best_score, run_and_save, 
                           tune_model, compute_mean_encodings, apply_mean_encodings,perform_parameter_tuning)

## Parameter Tuning (Optional)

You can choose to skip the parameter tuning section by setting `skip_parameter_tuning` to `True`. If skipped, the notebook will use default hyperparameters.

To run parameter tuning, set `skip_parameter_tuning` to `False`.


In [2]:
# Switch for the optional tuning step: True skips the search and the notebook
# uses default hyperparameters, False runs the tuning.
# Bound at the top level of the cell (no `if __name__ == "__main__":` guard)
# because the "Finding best parameters" cell reads `skip_parameter_tuning`
# unconditionally; a guard would leave it undefined whenever the guard is false.
skip_parameter_tuning = True
parameter_tuning_result = perform_parameter_tuning(skip_parameter_tuning=skip_parameter_tuning)

Parameter tuning skipped. Using default hyperparameters.


# Load Data & Feature Engineering

In [3]:
# Build the two frames used by the rest of the notebook: `df` (training) and
# `test_df` (test). Both flags are passed as True — presumably dropping
# non-numerical columns and empty rows; confirm in src.FeatureEngineering.
df,test_df = FeatureEngineering(drop_non_numerical=True, drop_empty_rows=True)

In [4]:
# Hold on to the test-set building ids: the column is left out of the feature
# list built next, and the ids are handed to run_and_save at the end.
building_id = test_df["building_id"]

In [5]:
# Keep every column of the training frame except the row identifier.
# 'damage_grade' is still part of this list; it is only removed for the test frame.
is_feature_column = df.columns != "building_id"
selected_features = df.columns[is_feature_column]

In [6]:
# Training frame: restricted to the full feature list.
selected_df = FeatureSelection(df, selected_features)

# Test frame: restricted to the same list minus 'damage_grade'.
selected_features_test = [
    name for name in selected_features
    if name != 'damage_grade'
]
selected_test_df = FeatureSelection(test_df, selected_features_test)

# Encoding

In [7]:
# Compute mean encodings for the three geo_level id columns, with
# 'damage_grade' as the target variable.
# NOTE(review): the encodings are computed from all of selected_df, before the
# train/validation split further down, so rows that later land in the
# validation set contribute to their own encoded values (target leakage).
# Consider computing the encodings from the training split only.
mean_encodings = compute_mean_encodings(dataframe=selected_df,target_variable='damage_grade', columns_to_encode=['geo_level_1_id','geo_level_2_id','geo_level_3_id'])

In [8]:
# Apply the encodings to the training frame.
# NOTE(review): this rebinds selected_df to its own transformed version, so
# re-running the cell feeds already-encoded data back in. Restart the kernel
# before re-running, or bind the result to a new name.
selected_df = apply_mean_encodings(dataframe=selected_df, mean_encodings=mean_encodings)

In [9]:
# Apply the same encodings (computed from the training frame) to the test
# frame, so both frames share one mapping. Like the previous cell, this
# rebinds its own input and is not safe to re-run in place.
selected_test_df = apply_mean_encodings(dataframe=selected_test_df, mean_encodings=mean_encodings)

In [10]:
# Sanity check on the encoded training frame. The recorded run printed "Yes" —
# presumably meaning every column is numerical; confirm in src.
check_numerical_columns(selected_df)

Yes


# Train/Validation Split

In [11]:
# Name of the label column handed to the split below.
# NOTE(review): earlier cells hard-code the literal 'damage_grade' instead of
# using this name; defining it once near the top would keep them in sync.
target_column_name = 'damage_grade'

In [12]:
# Split the encoded training frame into training and validation features/labels.
# NOTE(review): no seed or split ratio is passed here, so reproducibility of the
# split depends on train_test_split_function's internals — TODO confirm.
X_train, X_val, y_train, y_val = train_test_split_function(selected_df,target_column_name)

# Finding best parameters

In [13]:
# Obtain hyperparameters for the XGB model from the training split.
# skip_parameter_tuning is forwarded from the tuning switch set near the top.
# NOTE(review): the positional arguments 2, 10, 2 and [0.1,0.01] are not
# self-describing — presumably a search range/step and candidate learning
# rates. Confirm against find_best_score and consider passing them by keyword.
best_params = find_best_score(X_train, y_train, 2, 10, 2,[0.1,0.01], model='XGB',skip_parameter_tuning=skip_parameter_tuning)

In [14]:
# Display the hyperparameters returned by find_best_score.
best_params

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

# Cross-Validation and Model-Fit

In [15]:
# Fit the XGB model with best_params; the training and validation splits are
# both passed in, and the call returns the fitted model plus an accuracy value.
# NOTE(review): the recorded output contains
#   Parameters: { "n_estimators" } are not used.
# so the 'n_estimators' entry of best_params appears to be ignored by the
# underlying training call. Verify how tune_model sets the number of boosting rounds.
fitted_model,accuracy = tune_model(X_train, y_train, X_val, y_val, best_params, model = 'XGB')

[0]	Test-mlogloss:1.03577
[1]	Test-mlogloss:0.98253
[2]	Test-mlogloss:0.93727
[3]	Test-mlogloss:0.89811
[4]	Test-mlogloss:0.86422
[5]	Test-mlogloss:0.83455
[6]	Test-mlogloss:0.80870
[7]	Test-mlogloss:0.78596
[8]	Test-mlogloss:0.76602
[9]	Test-mlogloss:0.74838


Parameters: { "n_estimators" } are not used.



[10]	Test-mlogloss:0.73283
[11]	Test-mlogloss:0.71892
[12]	Test-mlogloss:0.70645
[13]	Test-mlogloss:0.69544
[14]	Test-mlogloss:0.68567
[15]	Test-mlogloss:0.67677
[16]	Test-mlogloss:0.66887
[17]	Test-mlogloss:0.66181
[18]	Test-mlogloss:0.65524
[19]	Test-mlogloss:0.64918
[20]	Test-mlogloss:0.64386
[21]	Test-mlogloss:0.63900
[22]	Test-mlogloss:0.63458
[23]	Test-mlogloss:0.63045
[24]	Test-mlogloss:0.62671
[25]	Test-mlogloss:0.62318
[26]	Test-mlogloss:0.61994
[27]	Test-mlogloss:0.61695
[28]	Test-mlogloss:0.61423
[29]	Test-mlogloss:0.61171
[30]	Test-mlogloss:0.60933
[31]	Test-mlogloss:0.60719
[32]	Test-mlogloss:0.60504
[33]	Test-mlogloss:0.60316
[34]	Test-mlogloss:0.60132
[35]	Test-mlogloss:0.59953
[36]	Test-mlogloss:0.59800
[37]	Test-mlogloss:0.59647
[38]	Test-mlogloss:0.59508
[39]	Test-mlogloss:0.59383
[40]	Test-mlogloss:0.59250
[41]	Test-mlogloss:0.59133
[42]	Test-mlogloss:0.59015
[43]	Test-mlogloss:0.58910
[44]	Test-mlogloss:0.58810
[45]	Test-mlogloss:0.58713
[46]	Test-mlogloss:0.58628
[

In [16]:
# Display the accuracy returned by tune_model (presumably measured on
# X_val/y_val — confirm in src).
accuracy

0.7455152433759905

# Predict on the test set and save the CSV

In [17]:
# Run the fitted model on the encoded test features and save the result.
# building_id is passed alongside, presumably to label the output rows —
# confirm in src. The recorded run wrote a timestamped CSV under data/output/.
run_and_save(fitted_model,selected_test_df,building_id)

File saved as data/output/predictions_2024-02-03_12-45-14.csv
