In [28]:
import sys
import os
#import random forrest from sklearn 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
#import xgboost as xgb

sys.path.append(os.path.abspath('../../src'))

# add functions explicitly to the path
from utils import *
from data_cleaning import *
from pipeline_multiple_encoding import MLPipeline 
from pipeline import *
from feature import FeatureEngineering

from sklearn.metrics import f1_score

In [29]:
# load the data
df_X, df_y = load_data_train()

In [30]:
# convert the data to objects
df_X = convert_to_object(df_X)
df_X['geo_level_1_id'] = df_X['geo_level_1_id'].astype('object')
df_X['geo_level_2_id'] = df_X['geo_level_2_id'].astype('object')
df_X['geo_level_3_id'] = df_X['geo_level_3_id'].astype('object')


In [31]:
# drop duplicates
df_X, df_y = drop_duplicates(df_X,df_y)

In [32]:
# remove outliers
outliers_ids = get_outliers_ids(df_X)
df_X, df_y = drop_row(outliers_ids.tolist(), df_X, df_y)
print(df_X.shape)

tot number of outliers: 10526
- count_floors_pre_eq - number of outliers: 2439
- age - number of outliers: 1259
- area_percentage - number of outliers: 3811
- height_percentage - number of outliers: 2407
- count_families - number of outliers: 2325
(233884, 39)


In [34]:
# feat engineering
feature_engineering = FeatureEngineering()

# Scenario 3: All Features + New Features - has_flags
df_X, num_features3, cat_features3 = feature_engineering.transform(df_X, scenario=3)


In [36]:
# specify types of training (CV or normal split)
full_train = True

if full_train:
    # use all data
    X_train, y_train = df_X.drop(['building_id'], axis=1), df_y.drop(['building_id'], axis=1)
    numerical_feature = X_train.select_dtypes(include=['number']).columns.tolist()
    categorical_feature = X_train.select_dtypes(include=['object']).columns.tolist()

else:
    # split data
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42, stratify=df_y['damage_grade'])

    # resampling (resampling)
    #X_train, y_train =  resample_data(X_train, y_train, 'upsample')

    y_train, y_test = y_train.drop(['building_id'], axis=1), y_test.drop(['building_id'], axis=1)
    X_train, X_test = X_train.drop(['building_id'], axis=1), X_test.drop(['building_id'], axis=1)
    numerical_feature = X_train.select_dtypes(include=['number']).columns.tolist()
    categorical_feature = X_train.select_dtypes(include=['object']).columns.tolist()


# Define multiple scalers for numerical features
scalers = {
    **{col: 'standard+robust+log' for col in num_features3},  # Apply both Standard and Robust Scaler
    **{col: 'minmax' for col in []},  # If minmax is used separately
}

# Define multiple encodings for categorical features
encoders = {
    **{col: 'basen+target' for col in ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']},  # Multiple encodings for geo features
    **{col: 'onehot' for col in ['plan_configuration', 'foundation_type', 'ground_floor_type',
                                 "other_floor_type", 'building_material', 'type_of_building',
                                 'position', 'legal_ownership_status', 'roof_type', 'land_surface_condition']},
    **{col: 'binary' for col in ['is_concrete', 'sticking_material']}
}


In [37]:
# from sklearn.base import BaseEstimator, ClassifierMixin

# class SklearnXGBClassifier(BaseEstimator, ClassifierMixin):
#     def __init__(self, **kwargs):
#         self.model = xgb.XGBClassifier(**kwargs)

#     def fit(self, X, y, **kwargs):
#         self.model.fit(X, y, **kwargs)
#         return self

#     def predict(self, X):
#         return self.model.predict(X)

#     def predict_proba(self, X):
#         return self.model.predict_proba(X)

#     def get_params(self, deep=True):
#         return self.model.get_params(deep)

#     def set_params(self, **params):
#         self.model.set_params(**params)
#         return self

In [38]:
# # random forest
# y_train = y_train.squeeze()
# cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# model_RF = RandomForestClassifier(n_estimators=100, random_state=0)

# #pre_proccessor = pipeline_preprocessor(standard_scaler_cols=standard_scaler_cols,baseN_enc_cols=baseN_enc_cols, target_enc_cols=target_enc_cols, binary_enc_cols=binary_enc_cols, one_hot_cols=one_hot_cols, robust_scaler_cols=robust_scaler_cols)
# pipeline_RF = MLPipeline(
#     scalers=scalers, 
#     encoders=encoders,
#     model= RandomForestClassifier(n_estimators=100, random_state=0)
#     )

# #pipeline_RF = classifier_pipeline(pre_processor, model_RF)

# param_grid_RF = {
#     'classifier__n_estimators': [100],  # Number of boosting rounds (trees)
#     #'classifier__learning_rate': [0.1],  # Step size shrinking
#     'classifier__max_depth': [6]  # Maximum depth of the tree
# }

# grid_search_RF = GridSearchCV(pipeline_RF, param_grid_RF, cv=10, scoring='f1_micro', n_jobs=-1)
#grid_search_RF.fit(X_train, y_train)

# print("Best F1 Score for RF:")
# print(grid_search_RF.best_score_)

# without additional features: 0.7274334110715601
# with additional: 0.7239058893894587 

In [39]:
# # XGBoost
# y_train = y_train.squeeze() 
# y_train_adjusted = y_train - 1 
# cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# model_XG = SklearnXGBClassifier(objective='multi:softmax', num_class=3, n_estimators=100, random_state=0)

# pre_proccessor = pipeline_preprocessor(standard_scaler_cols=standard_scaler_cols,baseN_enc_cols=baseN_enc_cols)
# pipeline_XG = classifier_pipeline(pre_proccessor, model_XG)

# param_grid_XG = {
#     'classifier__n_estimators': [150, 200],  # Number of boosting rounds (trees)
#     'classifier__learning_rate': [0.1, .2],  # Step size shrinking
#     'classifier__max_depth': [3,6]  # Maximum depth of the tree
# }

# grid_search_XG = GridSearchCV(pipeline_XG, param_grid=param_grid_XG, cv=cv, scoring='f1_micro', n_jobs=-1)

# Fit the model
#grid_search_XG.fit(X_train, y_train_adjusted)

# print("Best F1 Score for XGBoost:")
# print(grid_search_XG.best_score_)

In [46]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier

# Stacking
y_train = y_train.squeeze() 
#pre_processor = pipeline_preprocessor(standard_scaler_cols=standard_scaler_cols,baseN_enc_cols=baseN_enc_cols)

preprocessing_pipeline = MLPipeline(
    scalers=scalers,
    encoders=encoders,
    model=None  # No model yet, will be added in stacking
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

base_learners = [
    ('nb', Pipeline([('preprocessing', preprocessing_pipeline), ('model', GaussianNB())])),
    ('rf', Pipeline([('preprocessing', preprocessing_pipeline), ('model', RandomForestClassifier(n_estimators=100, random_state=42))])),
    ('dt', Pipeline([('preprocessing', preprocessing_pipeline), ('model', DecisionTreeClassifier(random_state=42))])),
    ('ab', Pipeline([('preprocessing', preprocessing_pipeline), ('model', AdaBoostClassifier(n_estimators=50, random_state=42))])),
    ('lda', Pipeline([('preprocessing', preprocessing_pipeline), ('model', LinearDiscriminantAnalysis())])),
    ('gb', Pipeline([('preprocessing', preprocessing_pipeline), ('model', GradientBoostingClassifier(n_estimators=100, random_state=42))])),
    ('lr', Pipeline([('preprocessing', preprocessing_pipeline), ('model', LogisticRegression(solver='lbfgs', max_iter=100, random_state=42, multi_class='ovr'))])),
    ('knn', Pipeline([('preprocessing', preprocessing_pipeline), ('model', KNeighborsClassifier())])),
    ('lgbm', Pipeline([('preprocessing', preprocessing_pipeline), ('model', LGBMClassifier(random_state=42))])),
    ('et', Pipeline([('preprocessing', preprocessing_pipeline), ('model', ExtraTreesClassifier(n_estimators=40, random_state=42))])),
    ('svc', Pipeline([('preprocessing', preprocessing_pipeline), ('model', SGDClassifier(loss='hinge', alpha=1e-4, random_state=42, max_iter=10, tol=1e-3))])),
    ('catboost', Pipeline([('preprocessing', preprocessing_pipeline), ('model', CatBoostClassifier(iterations=40, learning_rate=0.1, random_state=42, verbose=0))])),
    # ('bagging', Pipeline([('preprocessing', preprocessing_pipeline), ('model', BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=50, random_state=42))])),
    ('hgb', Pipeline([('preprocessing', preprocessing_pipeline), ('model', HistGradientBoostingClassifier(random_state=42))])),
    ('knn_2', Pipeline([('preprocessing', preprocessing_pipeline), ('model', KNeighborsClassifier(n_neighbors=15))])),
    ('ridge', Pipeline([('preprocessing', preprocessing_pipeline), ('model', RidgeClassifier(random_state=42))])),
    ('qda', Pipeline([('preprocessing', preprocessing_pipeline), ('model', QuadraticDiscriminantAnalysis())])),
]


# Meta-model for classification
meta_model = LogisticRegression(multi_class='ovr', solver='lbfgs', random_state=42)

# Stacking Classifier with the base learners and meta-model
stacking_model = StackingClassifier(
    estimators=base_learners, 
    final_estimator=meta_model
)

# Cross-validation using Stratified K-Folds
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Evaluate the model using cross-validation and F1 score
#f1_scores = cross_val_score(stacking_model, X_train, y_train, cv=cv, scoring='f1_micro', n_jobs=-1, error_score='raise')
#print(f1_scores)

# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# grid_search_stacking = GridSearchCV(pipeline_stacking, param_grid=param_grid_stacking, cv=cv, scoring='f1_weighted', n_jobs=-1)
# grid_search_stacking.fit(X_train, y_train_adjusted)

# print("Best F1 Score for XGBoost:")
# print(grid_search_stacking.best_score_)

In [47]:
stacking_model.fit(X_train, y_train)
# 30 min

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 926
[LightGBM] [Info] Number of data points in the train set: 233884, number of used features: 98
[LightGBM] [Info] Start training from score -2.372273
[LightGBM] [Info] Start training from score -0.557762
[LightGBM] [Info] Start training from score -1.095888


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 928
[LightGBM] [Info] Number of data points in the train set: 187107, number of used features: 96
[LightGBM] [Info] Start training from score -2.372284
[LightGBM] [Info] Start training from score -0.557759
[LightGBM] [Info] Start training from score -1.095890




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 187107, number of used features: 96
[LightGBM] [Info] Start training from score -2.372284
[LightGBM] [Info] Start training from score -0.557759
[LightGBM] [Info] Start training from score -1.095890




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 930
[LightGBM] [Info] Number of data points in the train set: 187107, number of used features: 97
[LightGBM] [Info] Start training from score -2.372284
[LightGBM] [Info] Start training from score -0.557759
[LightGBM] [Info] Start training from score -1.095890




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 187107, number of used features: 96
[LightGBM] [Info] Start training from score -2.372284
[LightGBM] [Info] Start training from score -0.557759
[LightGBM] [Info] Start training from score -1.095890




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032922 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 187108, number of used features: 96
[LightGBM] [Info] Start training from score -2.372232
[LightGBM] [Info] Start training from score -0.557774
[LightGBM] [Info] Start training from score -1.095880


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
# test the model
df_test = load_data_test()
df_test = convert_to_object(df_test)
df_test_id = df_test.pop('building_id')

In [49]:
df_test_eng, num_features3, cat_features3 = feature_engineering.transform(df_test, scenario=3)

In [50]:
test_predictions = stacking_model.predict(df_test_eng)
df_submission = pd.DataFrame({'building_id': df_test_id, 'damage_grade': test_predictions})



In [51]:
df_submission

Unnamed: 0,building_id,damage_grade
0,300051,3
1,99355,2
2,890251,3
3,745817,1
4,421793,3
...,...,...
86863,310028,2
86864,663567,2
86865,1049160,2
86866,442785,2


In [52]:
df_submission.to_csv('../../results/20250131_14_38_submission.csv', index=False)

In [66]:
# assess the results
best_model = grid_search_XG.best_estimator_

# Train the model on the full dataset
best_model.fit(X_train, y_train_adjusted)


In [111]:
# without cross val
model = RandomForestClassifier(n_estimators=100, random_state=0)
model_XG = SklearnXGBClassifier(objective='multi:softmax', num_class=3, n_estimators=100, random_state=0)
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1

pre_proccessor = pipeline_preprocessor(standard_scaler_cols=standard_scaler_cols,baseN_enc_cols=baseN_enc_cols)

#model_fit = model_training(X_train,y_train,pipeline)
#pipeline = classifier_pipeline(pre_proccessor, model)

pipeline_XG = classifier_pipeline(pre_proccessor, model_XG)
model_fit_XG = model_training(X_train,y_train_adjusted,pipeline_XG)

y_pred = model_fit_XG.predict(X_test)
score = f1_score(y_test_adjusted, y_pred, average='micro')
print(score)

0.723411078093935




In [None]:


f1_scores = cross_val_score(pipeline_stacking, X_train, y_train_adjusted, cv=cv, scoring='f1_weighted', n_jobs=-1)

In [11]:
y_pred = model_fit.predict(X_test)
score = f1_score(y_test, y_pred, average='micro')
print(score)

0.7214442995489236


In [169]:
# no upsample: 0.7275584154605896  # upsample: 0.7281997562904846