In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
import os
import random
import pandas as pd
import shutil
from scipy.stats import zscore
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [2]:
# Load the raw training features and their labels from the raw-data folder.
data_path = "../../data/raw/"
# Directory and file name are passed as separate components so that
# os.path.join inserts the separator itself; the previous single-argument
# call only worked because data_path happens to end with a slash.
train = pd.read_csv(os.path.join(data_path, "train_values.csv"))
labels = pd.read_csv(os.path.join(data_path, "train_labels.csv"))

In [None]:
# inspect hist of the data

def get_hist(df, hist_x_row=4, figsize=(20, 40)):
    """Diagnostic tool: draw one histogram per column of ``df`` on a single figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column.
    hist_x_row : int, default 4
        Number of histograms placed on each row of the subplot grid.
    figsize : tuple, default (20, 40)
        Figure size forwarded to ``plt.figure``. The default reproduces the
        size that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    None
        The figure is created through the pyplot state machine.
    """
    # Number of grid rows needed to fit every column at hist_x_row per row.
    n_rows = int(np.ceil(len(df.columns) / hist_x_row))
    plt.figure(figsize=figsize)
    for i, col in enumerate(df.columns):
        plt.subplot(n_rows, hist_x_row, i + 1)
        df[col].hist()
        # f-string instead of `+` so non-string column labels do not raise.
        plt.title(f'{col} - {df[col].dtype}')

    plt.subplots_adjust(hspace=0.5, wspace=0.3)


get_hist(train)

In [None]:
# Re-type every two-valued column as object so that later
# select_dtypes(include='number') calls no longer pick it up.

for col in train.columns:
    distinct = train[col].unique()
    if len(distinct) != 2:
        continue
    # Log the column and its two values before changing the dtype.
    print(col, distinct)
    train[col] = train[col].astype('object')

In [None]:
# check outliers

def remove_outliers(df, z_level=3.):
    """Remove rows that hold a z-score outlier in any numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only columns selected by ``select_dtypes(include='number')``
        are screened; the other columns are carried through unchanged.
    z_level : float, default 3.
        A value is an outlier when the absolute z-score of its column
        exceeds this threshold.

    Returns
    -------
    tuple
        ``(cleaned, row_to_remove)``: ``cleaned`` is ``df`` without the
        flagged rows, ``row_to_remove`` is a NumPy array with the index
        *labels* of the flagged rows, suitable for ``other.drop(row_to_remove)``
        on a frame sharing the same index.

    Notes
    -----
    Prints the total number of flagged rows and the per-column outlier counts.
    """
    num_cols = df.select_dtypes(include='number')
    outliers = num_cols.apply(zscore).abs() > z_level
    # Plain boolean array: applied positionally, so it is immune to
    # non-default or duplicated index labels.
    flagged = outliers.any(axis=1).to_numpy()
    # Report labels rather than positions. Positions from np.where only
    # coincide with labels on a default RangeIndex, and drop() is label-based.
    row_to_remove = df.index[flagged].to_numpy()
    print(f'tot number of outliers: {len(row_to_remove)}')
    for col, count in outliers.sum().items():
        print(f'- {col} - number of outliers: {count}')

    return df.loc[~flagged], row_to_remove

# Filter the training frame and keep the removed row ids for the labels.
train, row_to_remove = remove_outliers(df=train)

# Histograms of the numeric columns that survive the filter.
# NOTE(review): this call needs get_hist to accept a `figsize` keyword.
get_hist(
    train.select_dtypes(include='number'),
    figsize=(20, 20),
)
# get_hist(train_raw.select_dtypes(include='number'), figsize=(20,15))

In [None]:
# Column groups: each list names the columns handed to one preprocessing step.

# -- scalers
standard_scaler_cols = [
    'age',
    'area_percentage',
]
robust_scaler_cols = []

# -- encoders
ordinal_enc_cols = [
    'ground_floor_type',
    'position',
]
baseN_enc_cols = []
one_hot_cols = []


In [None]:
# Strip the id column, drop the labels of the removed outlier rows, then split.
# Re-running this cell raises: 'building_id' is already gone after the first run.

labels = labels.drop(columns=['building_id']).drop(index=row_to_remove)
train = train.drop(columns=['building_id'])
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# building pipeline

# Each transformer is applied to its own column group; columns that appear in
# no group are dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('standard_scaler', StandardScaler(), standard_scaler_cols),
        ('robust_scaler', RobustScaler(), robust_scaler_cols),
        ('baseN_encoder', ce.BaseNEncoder(cols=baseN_enc_cols), baseN_enc_cols),
        # ('ordinal_encoder', OrdinalEncoder(categories=[...], handle_unknown='use_encoded_value', unknown_value=-1), ordinal_enc_cols),
        # handle_unknown='ignore': a category seen only in the held-out split
        # is encoded as all zeros instead of raising at predict time.
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
    ])

# Preprocessing and classifier are chained so both are fitted on the training
# split only and re-applied unchanged at predict time.
pipeline_lr = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression())
])

# pipeline_rf = Pipeline(steps=[
#     ('preprocessing', preprocessor),
#     ('classifier', RandomForestClassifier())
# ])         


In [None]:
# y_train is a one-column DataFrame; the classifier expects a 1-D target, so
# flatten it to avoid scikit-learn's column-vector DataConversionWarning.
pipeline_lr.fit(X_train, np.ravel(y_train))

In [None]:
# Score every fitted pipeline on the held-out split (micro-averaged F1).
models = [pipeline_lr]  # , pipeline_rf, pipeline_xg
models_name = ['pipeline_lr', 'pipeline_rf', 'pipeline_xg']

# zip stops at the shorter list, so only the pipelines actually present in
# `models` are evaluated; the surplus names are ignored.
for mod_name, mod in zip(models_name, models):
    y_pred = mod.predict(X_test)
    score = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
    print(f'{mod_name:1}: {score}')