In [None]:
import sys, os, subprocess
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Read the competition's test and train CSVs, then preview the first 20 training rows.
ttest = pd.read_csv("../input/costa-rican-household-poverty-prediction/test.csv")
rawtrainingdata = pd.read_csv("../input/costa-rican-household-poverty-prediction/train.csv")
rawtrainingdata.head(n=20)

In [None]:
# Preview the first ten rows of the test frame.
ttest.head(10)

In [None]:
# Overview of the training frame: class counts of Target, summary statistics
# of the numeric columns, and a bar chart of the class counts.
# The class counts are printed explicitly; as a bare expression in the middle of
# a cell the value_counts() result was computed and then thrown away.
print(rawtrainingdata['Target'].value_counts())
print(rawtrainingdata.describe())
print("Distribution of the targets")
# Name the column through the `x` keyword: recent seaborn releases no longer
# accept it as a positional argument.
sns.countplot(x='Target', data=rawtrainingdata)

In [None]:
# Print the test frame's column dtypes and non-null counts.
ttest.info()

In [None]:
# Stack the training features (label removed) on top of the test rows so that
# every cleaning step below is applied to both at once.
trainingds = rawtrainingdata.drop('Target', axis=1)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# sort=True orders the columns alphabetically; the row index is NOT reset, so
# the combined frame carries each source frame's own row labels.
alldatads = pd.concat([trainingds, ttest], sort=True)
alldatads['dependency'].value_counts()

In [None]:
# 'dependency', 'edjefa' and 'edjefe' mix the strings "yes"/"no" with other
# values; turn "yes" into 1 and "no" into 0, then cast the whole column to float64.
mapping = {"yes": 1, "no": 0}
for yes_no_column in ('dependency', 'edjefa', 'edjefe'):
    recoded = alldatads[yes_no_column].replace(mapping)
    alldatads[yes_no_column] = recoded.astype(np.float64)

alldatads[['dependency', 'edjefa', 'edjefe']].describe()

In [None]:
# Overwrite every rez_esc value equal to 99.0 with 5.
# NOTE(review): presumably 99 is an outlier/sentinel and 5 is the largest
# legitimate value — confirm against the data dictionary.
alldatads.loc[alldatads['rez_esc'] == 99.0 , 'rez_esc'] = 5

In [None]:
# Per-column null count for the combined frame ...
null_counts = alldatads.isnull().sum()
missing = null_counts.to_frame('total')

# ... and the same count as a fraction of all rows (0-1, despite the column name).
row_count = len(alldatads)
missing['percent'] = missing['total'] / row_count

# Show the ten columns with the largest share of missing values.
missing.sort_values('percent', ascending = False).head(10)

In [None]:
# Impute missing values, then collapse each family of one-hot indicator columns
# into a single integer-coded column and drop the indicators.

def _collapse_onehot(frame, columns):
    """Return, for every row, the position within `columns` of the largest value.

    Ties resolve to the first position (np.argmax behaviour), so a row in which
    every listed column is 0 is coded 0.
    """
    return np.argmax(np.array(frame[columns]), axis = 1)


# Tablets per household: missing -> 0.
alldatads['v18q1'] = alldatads['v18q1'].fillna(0)

# Monthly rent payment: forced to 0 wherever tipovivi1 or tipovivi2 is set;
# anything still missing afterwards takes the column median.
# NOTE(review): the original comment only mentions households that fully own
# the house — confirm that zeroing the tipovivi2 rows as well is intended.
alldatads.loc[(alldatads['tipovivi1'] == 1) | (alldatads['tipovivi2'] == 1), 'v2a1'] = 0
alldatads['v2a1'] = alldatads['v2a1'].fillna(alldatads['v2a1'].median())

# Years behind in school: missing -> 0 for ages above 19 or below 7. The
# remaining gaps are flagged in 'rez_esc-missing' and then filled with the median.
outside_age_range = (alldatads['age'] > 19) | (alldatads['age'] < 7)
alldatads.loc[outside_age_range & alldatads['rez_esc'].isnull(), 'rez_esc'] = 0
alldatads['rez_esc-missing'] = alldatads['rez_esc'].isnull() #kiv
alldatads['rez_esc'] = alldatads['rez_esc'].fillna(alldatads['rez_esc'].median())

# Electricity source as one categorical column: 0 = noelec, 1 = coopele,
# 2 = public, 3 = planpri, NaN when none of the four flags is set.
# NOTE: the loop variable `i` is read again by the cell that builds the voting
# ensemble further down, so this deliberately stays a plain for-loop.
elec = []
for i, row in alldatads.iterrows():
    if row['noelec'] == 1:
        elec.append(0)
    elif row['coopele'] == 1:
        elec.append(1)
    elif row['public'] == 1:
        elec.append(2)
    elif row['planpri'] == 1:
        elec.append(3)
    else:
        elec.append(np.nan)
alldatads['elec'] = elec
alldatads['elec-missing'] = alldatads['elec'].isnull() #kiv
alldatads['elec'] = alldatads['elec'].fillna(alldatads['elec'].mode().tolist()[0])
alldatads = alldatads.drop(columns = ['noelec', 'coopele', 'public', 'planpri'])

# Wall, roof and floor indicators -> one column each.
wall_columns = ['epared1', 'epared2', 'epared3']
alldatads['walls'] = _collapse_onehot(alldatads, wall_columns)
alldatads = alldatads.drop(columns = wall_columns)

roof_columns = ['etecho1', 'etecho2', 'etecho3']
alldatads['roof'] = _collapse_onehot(alldatads, roof_columns)
alldatads = alldatads.drop(columns = roof_columns)

floor_columns = ['eviv1', 'eviv2', 'eviv3']
alldatads['floor'] = _collapse_onehot(alldatads, floor_columns)
alldatads = alldatads.drop(columns = floor_columns)

# Toilet type. The list order fixes the integer codes, so keep it as is.
flush_columns = ["sanitario1",'sanitario5', 'sanitario2', 'sanitario3',"sanitario6"]
alldatads['flush'] = _collapse_onehot(alldatads, flush_columns)
alldatads = alldatads.drop(columns = flush_columns)

# Drop the squared features (every 'SQB*' column plus 'agesq').
squared_columns = [c for c in alldatads if c.startswith('SQB')] + ['agesq']
alldatads = alldatads.drop(columns = squared_columns)

# Water provision.
water_columns = ['abastaguano', 'abastaguafuera', 'abastaguadentro']
alldatads['waterprovision'] = _collapse_onehot(alldatads, water_columns)
alldatads = alldatads.drop(columns = water_columns)

# Education level: every 'instlevel*' indicator, in frame column order.
inst_columns = [c for c in alldatads if c.startswith('instlevel')]
alldatads['inst'] = _collapse_onehot(alldatads, inst_columns)
alldatads = alldatads.drop(columns = inst_columns)

# Cooking energy source. This gets its own 'cooking' column: it used to be
# written to 'waterprovision', which silently replaced the water-provision
# feature computed just above.
cooking_columns = ['energcocinar1','energcocinar4', 'energcocinar3', 'energcocinar2']
alldatads['cooking'] = _collapse_onehot(alldatads, cooking_columns)
alldatads = alldatads.drop(columns = cooking_columns)

# Where meaneduc is missing, fall back to the row's own years of schooling.
# .values assigns by position, so the duplicated row labels of the combined
# frame cannot interfere with label alignment.
meaneduc_missing = pd.isnull(alldatads['meaneduc'])
alldatads.loc[meaneduc_missing, 'meaneduc'] = alldatads.loc[meaneduc_missing, 'escolari'].values

In [None]:
# Re-run the missing-value report after imputation: null count per column and
# that count as a fraction of all rows (stored under 'percent').
missing = pd.DataFrame({'total': alldatads.isnull().sum()})
missing['percent'] = missing['total'].div(len(alldatads))

# Ten columns with the largest remaining share of missing values.
missing.sort_values(by='percent', ascending = False).head(10)

In [None]:
# Print the processed combined frame as plain text.
print(alldatads)

In [None]:
# Split the combined frame back into its training and test parts. The training
# rows were stacked first, so the boundary is the length of the raw training
# frame (derived here rather than hard-coded as a row count).
n_training_rows = len(rawtrainingdata)
trainingset = alldatads.iloc[:n_training_rows, :]
testset = alldatads.iloc[n_training_rows:, :]
trainingset = trainingset.drop(['Id','idhogar'],axis=1)
trainingtargets = rawtrainingdata['Target']

# Keep an unshuffled copy of the full training set and its labels; the final
# model is fitted on these.
fulltrainingset = trainingset.copy()
fulltrainingsettargets = trainingtargets.copy()

# Shuffle features and labels together (fixed seed), then hold out the first
# 500 shuffled rows for validation and keep the rest for training.
trainingset, trainingtargets = shuffle(trainingset, trainingtargets, random_state = 8)
validationdata, validationtargets = trainingset[:500], trainingtargets[:500]
trainingdata, trainingdatatargets = trainingset[500:], trainingtargets[500:]

# Two views of the test rows: `newtestset` keeps 'Id' for the submission file,
# `testset` holds features only. drop() returns new frames, which avoids
# mutating a slice of `alldatads` in place (SettingWithCopyWarning).
newtestset = testset.drop(['idhogar'],axis=1)
testset = testset.drop(['Id', 'idhogar'],axis=1)


In [None]:
# Print the training partition (the shuffled training rows after the first 500).
print(trainingdata)

In [None]:
import lightgbm as lgb
import xgboost as xgb
import sklearn.model_selection as model_selection
from sklearn.metrics import f1_score, make_scorer, classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import svm
from time import time
from numpy import mean
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import class_weight
from sklearn.model_selection import learning_curve


In [None]:
# Registry for the best mean F1 found per model family, plus the shared
# 5-fold splitter (shuffled, fixed seed) handed to the model-selection calls.
models_best_f1 = dict()
cv = KFold(5, shuffle=True, random_state=1)

In [None]:
def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """
    Draw a three-panel learning-curve diagnostic for ``estimator``.

    Left to right the panels show: training and cross-validation score against
    training-set size; fit time against training-set size; cross-validation
    score against fit time. Every curve is the mean over the CV folds, with a
    shaded band of one standard deviation either side.

    Parameters
    ----------
    estimator : estimator instance
        Object with ``fit`` and ``predict``; it is passed to
        ``learning_curve``, which clones it for each validation.

    title : str
        Title of the first (score) panel.

    X : array-like of shape (n_samples, n_features)
        Training vectors.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Targets relative to ``X``; None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to draw on. When None, a new 1x3 figure of size 20x5 is created.

    ylim : tuple of shape (2,), default=None
        ``(ymin, ymax)`` applied to the score panel only.

    cv : int, cross-validation generator or an iterable, default=None
        Forwarded unchanged to ``learning_curve``. None selects that
        function's default 5-fold strategy, an int sets the number of folds,
        and a CV splitter or an iterable of (train, test) index arrays is used
        as given. For int/None inputs, :class:`StratifiedKFold` is used when
        ``y`` is binary or multiclass and the estimator is a classifier,
        otherwise :class:`KFold`.

    n_jobs : int or None, default=None
        Number of parallel jobs, forwarded to ``learning_curve``. ``None``
        means 1 unless in a :obj:`joblib.parallel_backend` context; ``-1``
        means all processors.

    train_sizes : array-like of shape (n_ticks,)
        Training-set sizes to evaluate. Floats are fractions, within (0, 1],
        of the largest training set the CV scheme allows; other dtypes are
        absolute sample counts. For classification each size usually has to be
        large enough to contain at least one sample of every class.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``).
    """
    if axes is None:
        axes = plt.subplots(1, 3, figsize=(20, 5))[1]

    # Static decoration of the score panel is done before any fitting starts.
    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )

    def _fold_stats(per_fold):
        # Rows are training sizes, columns are folds: reduce over the folds.
        return np.mean(per_fold, axis=1), np.std(per_fold, axis=1)

    def _shade(axis, xs, centre, spread, **style):
        # Translucent band of +/- one standard deviation around `centre`.
        axis.fill_between(xs, centre - spread, centre + spread, alpha=0.1, **style)

    train_mean, train_std = _fold_stats(train_scores)
    test_mean, test_std = _fold_stats(test_scores)
    time_mean, time_std = _fold_stats(fit_times)

    # Panel 1: score against number of training examples.
    score_ax.grid()
    _shade(score_ax, sizes, train_mean, train_std, color="r")
    _shade(score_ax, sizes, test_mean, test_std, color="g")
    score_ax.plot(sizes, train_mean, "o-", color="r", label="Training score")
    score_ax.plot(sizes, test_mean, "o-", color="g", label="Cross-validation score")
    score_ax.legend(loc="best")

    # Panel 2: fit time against number of training examples.
    scaling_ax = axes[1]
    scaling_ax.grid()
    scaling_ax.plot(sizes, time_mean, "o-")
    _shade(scaling_ax, sizes, time_mean, time_std)
    scaling_ax.set_xlabel("Training examples")
    scaling_ax.set_ylabel("fit_times")
    scaling_ax.set_title("Scalability of the model")

    # Panel 3: cross-validation score against fit time.
    tradeoff_ax = axes[2]
    tradeoff_ax.grid()
    tradeoff_ax.plot(time_mean, test_mean, "o-")
    _shade(tradeoff_ax, time_mean, test_mean, test_std)
    tradeoff_ax.set_xlabel("fit_times")
    tradeoff_ax.set_ylabel("Score")
    tradeoff_ax.set_title("Performance of the model")

    return plt

# LGBM

Hyperparameter tuning for LGBM

In [None]:
# Search space for the LGBM grid search; the (commented-out) search loop walks
# these lists in key order and scores every combination by cross-validated F1.
# NOTE(review): the original note ended with "gbdt 0.5 120, 49" — presumably
# an earlier best setting (boosting type, learning rate, estimators, leaves);
# confirm before relying on it.
parameter_grid = dict([
    ('boosttype', ['dart']),            # dart wins hands down here singularly
    ('learningrate', [0.1, 0.5, 1.1]),
    ('num_of_estimators', [100, 130, 150, 170, 200]),
    ('num_of_leaves', [30, 49, 70]),
    ('L1 reg', [0.01]),
    ('L2 reg', [0.01]),
])


Note: the cross-validation search below is commented out because it takes a long time to run. It is not required to produce the submission.

In [None]:
# best_f1_score = 0
# best_parameters = dict()
# best_lgbm_model = None
# start = time()
# for atype in parameter_grid['boosttype']:
#   for lr in parameter_grid['learningrate']:
#     for est in parameter_grid['num_of_estimators']:
#       for leaves in parameter_grid['num_of_leaves']:
#         for L1 in parameter_grid['L1 reg']:
#           for L2 in parameter_grid['L2 reg']:
#             print("CV with boosttype={}, lr={}, est={}, leaves={}, l1={}, l2={}".format(atype,
#                                                                                        lr,
#                                                                                        est,
#                                                                                        leaves,
#                                                                                        L1,
#                                                                                        L2))
#             lgmodel = lgb.LGBMClassifier(metric = "", num_class = 4)
#             hyperparameters = {'boosting_type': atype,
#                   'colsample_bytree': 0.9843467236959204,
#                   'learning_rate': lr,
#                   'min_child_samples': 44,
#                   'num_leaves': leaves,
#                   'reg_alpha': L1,      
#                   'reg_lambda': L2,     
#                   'subsample': 0.6299872254632797,
#                   'subsample_for_bin': 40611
#                   }
            
#             lgbm_model = lgb.LGBMClassifier(**hyperparameters, class_weight = 'balanced', max_depth=-1, objective = 'multiclass', n_jobs = -1, n_estimators = est)
#             f1_scores = cross_val_score(lgbm_model, trainingset, trainingtargets, scoring='f1_macro', cv=cv, n_jobs=-1)
#             mean_f1_score = round(mean(f1_scores), 3)
#             print("\tf1 score: {}".format(mean_f1_score))
#             if mean_f1_score > best_f1_score:
#                 best_f1_score = mean_f1_score
#                 best_parameters = lgbm_model.get_params()
#                 best_lgbm_model = lgbm_model
# print("Best f1 score: ", best_f1_score)
# print("Best model: ")
# print(best_parameters)
# end = time()
# print("Time elapsed: ", round(end - start, 3))
# models_best_f1['lgbm'] = best_f1_score
# print("Training accuracy: ", accuracy_score(trainingtargets, training_predictions))
# print("F1 Score: ", f1_score(trainingtargets, training_predictions, average='micro'))
# print("Classification Report: ")
# print(classification_report(trainingtargets, training_predictions))

Creation of the tuned LGBM model

In [None]:
# Build (but do not yet fit) the LGBM classifier with the chosen hyper-parameters.
# `lgbm_hyperparameters` is reused by the bagging-ensemble cell later on.
lgbm_hyperparameters = dict(
    boosting_type='dart',
    n_estimators=200,
    colsample_bytree=0.9,
    learning_rate=0.5,
    min_child_samples=44,
    num_leaves=70,
    reg_alpha=0.01,
    reg_lambda=0.01,
    subsample=0.6299872254632797,
    subsample_for_bin=40611,
)

# Bare classifier; no other live cell in this notebook reads `lgmodel`.
lgmodel = lgb.LGBMClassifier(metric = "", num_class = 4)

lgbm_model = lgb.LGBMClassifier(
    class_weight = 'balanced',
    max_depth = -1,
    objective = 'multiclass',
    n_jobs = -1,
    random_state = 42,
    **lgbm_hyperparameters,
)


To plot the learning curve of LGBM

In [None]:
# plot_learning_curve(lgbm_model, 'LGBM', trainingset, trainingtargets, cv=cv, n_jobs=-1)


# MLP

Hyperparameter tuning of MLP

In [None]:
# Search space for the MLP grid search (consumed by the commented-out loop below).
# Units per hidden layer; every layer in a candidate network has the same width.
hidden_sizes = [128, 256, 512]
# Number of hidden layers.
hidden_layers = [2, 3, 5, 8]
# Values tried for learning_rate_init.
lrs = [0.002, 0.005, 0.007]
# Values tried for MLPClassifier's `alpha`.
a = [0.03,0.04]

In [None]:
# best_f1_score = 0
# best_parameters = dict()
# best_mlp_model = None
# start = time()
# for hs in hidden_sizes:
#     for num_layers in hidden_layers:
#         for lr in lrs:
#             for alpha in a:
#                 print("CV with hs={}, layers={}, lr={}, alpha={}".format(hs,
#                                                                          num_layers,
#                                                                          lr,
#                                                                          alpha))
#                 hidden_layer_sizes = tuple(hs for i in range(num_layers))
#                 mlp_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, max_iter=3000,activation = 'relu',solver='adam',random_state=1,learning_rate_init = lr, alpha = alpha)
#                 f1_scores = cross_val_score(mlp_model, trainingset, trainingtargets, scoring='f1_macro', cv=cv, n_jobs=-1)
#                 mean_f1_score = round(mean(f1_scores), 3)
#                 print("\tf1 score: {}".format(mean_f1_score))
#                 if mean_f1_score > best_f1_score:
#                     best_f1_score = mean_f1_score
#                     best_parameters = mlp_model.get_params()
#                     best_mlp_model = mlp_model
# print("Best f1 score: ", best_f1_score)
# print("Best model: ")
# print(best_parameters)
# end = time()
# print("Time elapsed: ", round(end - start, 3))
# models_best_f1['mlp'] = best_f1_score

Creation of the tuned MLP model


In [None]:
# MLP configuration: five hidden layers of 128 units, adam optimiser,
# relu activations, fixed seed.
alpha = 0.03
lr = 0.005
hidden_layer_sizes = (128,) * 5
mlp_model = MLPClassifier(
    hidden_layer_sizes=hidden_layer_sizes,
    activation = 'relu',
    solver='adam',
    learning_rate_init = lr,
    alpha = alpha,
    max_iter=3000,
    random_state=1,
)

To plot the learning curve of MLP

In [None]:
# plot_learning_curve(mlp_model, 'Multilayer Perceptron', trainingset, trainingtargets, cv=cv, n_jobs=-1)

# XGBOOST

Hyperparameter tuning of xgboost

In [None]:
# Search space for XGBoost.
#   max_depth        - longest allowed root-to-leaf path; deeper trees are more
#                      expressive but overfit more easily.
#   min_child_weight - minimum weight needed to create a new node; smaller
#                      values allow more complex trees and more overfitting.
#   subsample        - ratio of training instances used.
#   eta              - learning rate of the model.
subsample_options = [tenths / 10. for tenths in range(8, 11)]
gridsearch_params = [
    (depth, child_weight, fraction)
    for depth in range(15, 24, 2)
    for child_weight in range(1, 6, 2)
    for fraction in subsample_options
]
n_estimators = 200
# NOTE(review): multi:softprob with num_class=4 expects labels 0..3 — confirm
# how trainingtargets is encoded before training on data_dmatrix.
data_dmatrix = xgb.DMatrix(data=trainingset, label=trainingtargets)
params = {
    "objective": "multi:softprob",
    "num_class": 4,
    'alpha': 10,
    'colsample_bytree': 0.4,
}

In [None]:
# Define initial best params and f1_score
# best_f1_score = 0
# best_xgb_model = None
# best_parameters = None
# start = time()
# for max_depth, min_child_weight, subsample in gridsearch_params:
#     print("CV with max_depth={}, min_child_weight={}, subsample={}".format(max_depth, 
#                                                                            min_child_weight,
#                                                                            subsample))

#     # Run CV
#     xgb_model = xgb.XGBClassifier(objective='multi:softprob',
#                                   num_class=4,
#                                   alpha=10,
#                                   colsample_bytree=0.4,
#                                   max_depth=max_depth,
#                                   min_child_weight=min_child_weight,
#                                   subsample=subsample,
#                                   n_estimators=n_estimators,
#                                   early_stopping_rounds=10)
#     f1_scores = cross_val_score(xgb_model, trainingset, trainingtargets, scoring='f1_macro', cv=cv, n_jobs=-1)
#     mean_f1_score = round(mean(f1_scores), 3)
#     print("\tf1 score: {}".format(mean_f1_score))
#     if mean_f1_score > best_f1_score:
#         best_f1_score = mean_f1_score
#         best_parameters = (max_depth, min_child_weight, subsample)
#         best_xgb_model = xgb_model
# print("Best params: \nmax_depth: {}, min_child_weight: {}, subsample: {}. f1 score: {}".format(best_parameters[0], best_parameters[1], best_parameters[2], best_f1_score))
# end = time()
# print("Time elapsed: ", round(end - start, 3))
# models_best_f1['xgb'] = best_f1_score

Creation of the tuned xgboost model

In [None]:
# Booster-style parameter dict. The literal used to list 'alpha' twice; the
# duplicate key is removed (the value is unchanged).
# NOTE: `params` is not passed to `xgb_model` below, and several of its values
# differ from the classifier's (max_depth 13 vs 21, colsample_bytree 1 vs 0.4,
# subsample 0.8 vs 1.0).
params = {
    "objective": "multi:softprob",
    "num_class": 4,
    'colsample_bytree': 1,
    'alpha': 10,
    'max_depth': 13,
    'min_child_weight': 1,
    'subsample': 0.8,
    'eta': 0.1,
}
# XGBoost classifier used for the learning curve and the ensemble wrappers.
# NOTE(review): early_stopping_rounds normally needs a validation set supplied
# at fit time — confirm this works when fitted through the wrappers below.
xgb_model = xgb.XGBClassifier(objective='multi:softprob',
                              num_class=4,
                              colsample_bytree=0.4,
                              max_depth=21,
                              alpha=10,
                              min_child_weight=1,
                              subsample=1.0,
                              n_estimators=999,
                              early_stopping_rounds=10)


To plot the learning curve of xgboost

In [None]:
# plot_learning_curve(xgb_model, 'XGBoost', trainingset, trainingtargets, cv=cv, n_jobs=-1)

# KNN

Hyperparameter tuning for KNN

In [None]:
# Candidate neighbour counts for the KNN search (iterated by the commented-out loop below).
k_nearest_neighbours = [2, 5, 10, 20, 50, 100, 200]


In [None]:
# best_f1_score = 0
# best_parameters = dict()
# best_knn_model = None
# start = time()
# for num_neighbours in k_nearest_neighbours:
#     print("CV with num_neighbours={}".format(num_neighbours))
#     knn_model = KNeighborsClassifier(n_neighbors=num_neighbours)
#     f1_scores = cross_val_score(knn_model, trainingset, trainingtargets, scoring='f1_macro', cv=cv, n_jobs=-1)
#     mean_f1_score = round(mean(f1_scores), 3)
#     print("\tf1 score: {}".format(mean_f1_score))
#     if mean_f1_score > best_f1_score:
#         best_f1_score = mean_f1_score
#         best_parameters = knn_model.get_params()
#         best_knn_model = knn_model
# print("Best f1 score: ", best_f1_score)
# print("Best model: ")
# print(best_parameters)
# end = time()
# print("Time elapsed: ", round(end - start, 3))
# models_best_f1['knn'] = best_f1_score

Creation of the tuned KNN model

In [None]:
# KNN classifier with n_neighbors=2, the smallest candidate in k_nearest_neighbours.
knn_model = KNeighborsClassifier(n_neighbors=2)

To plot the learning curve of KNN

In [None]:
# plot_learning_curve(knn_model, 'K-Nearest Neighbours', trainingset, trainingtargets, cv=cv, n_jobs=-1)

# Random Forest

Hyperparameter tuning of random forest

In [None]:
# Candidate forest sizes (n_estimators) for the random-forest search.
n_classifiers = [100, 200, 300, 500, 700, 1000]

In [None]:
# best_f1_score = 0
# best_parameters = dict()
# best_rf_model = None
# start = time()
# for num_classifiers in n_classifiers:
#     print("CV with num_classifiers={}".format(num_classifiers))
#     rf_model = RandomForestClassifier(n_jobs=-1, n_estimators=num_classifiers, class_weight="balanced")
#     f1_scores = cross_val_score(rf_model, trainingset, trainingtargets, scoring='f1_macro', cv=cv, n_jobs=-1)
#     mean_f1_score = round(mean(f1_scores), 3)
#     print("\tf1 score: {}".format(mean_f1_score))
#     if mean_f1_score > best_f1_score:
#         best_f1_score = mean_f1_score
#         best_parameters = rf_model.get_params()
#         best_rf_model = rf_model
# print("Best f1 score: ", best_f1_score)
# print("Best model: ")
# print(best_parameters)
# end = time()
# print("Time elapsed: ", round(end - start, 3))
# models_best_f1['rf'] = best_f1_score

Creation of the tuned random forest model

In [None]:
# Random forest with 700 trees and balanced class weights, using all cores.
# NOTE(review): no random_state is set, so repeated fits will differ — confirm
# whether that is acceptable.
rf_model = RandomForestClassifier(n_jobs=-1, n_estimators=700, class_weight="balanced")

To plot the learning curve of random forest

In [None]:
# plot_learning_curve(rf_model, 'Random Forest', trainingset, trainingtargets, cv=cv, n_jobs=-1)

# Multi-class SVM

Hyperparameter tuning of SVM

In [None]:
# Candidate values of the SVC parameter C for the SVM search.
cs = [0.01,0.02,0.03,0.1,0.2,0.3,1,2,3,4,5,6]

In [None]:
# best_f1_score = 0
# best_parameters = dict()
# best_svm_model = None
# start = time()
# # rbf kernel so not tuning degree
# for c in cs:
#     print("CV with c={}".format(c))
#     svm_model = svm.SVC(kernel='rbf', C=c)
#     f1_scores = cross_val_score(svm_model, trainingset, trainingtargets, scoring='f1_macro', cv=cv, n_jobs=-1)
#     mean_f1_score = round(mean(f1_scores), 3)
#     print("\tf1 score: {}".format(mean_f1_score))
#     if mean_f1_score > best_f1_score:
#         best_f1_score = mean_f1_score
#         best_parameters = svm_model.get_params()
#         best_svm_model = svm_model
# print("Best f1 score: ", best_f1_score)
# print("Best model: ")
# print(best_parameters)
# end = time()
# print("Time elapsed: ", round(end - start, 3))
# models_best_f1['svm'] = best_f1_score

Creation of the tuned SVM model

In [None]:
# RBF-kernel SVC with C=3, one of the candidates in cs.
svm_model = svm.SVC(kernel='rbf', C=3)

To plot the learning curve of SVM

In [None]:
# plot_learning_curve(svm_model, 'Support Vector Machine', trainingset, trainingtargets, cv=cv, n_jobs=-1)

# Creating bagging classifiers from models

In [None]:
# Wrap every tuned model in a single-member BaggingClassifier.
# The wrapped model is passed positionally: the keyword for this first
# parameter was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases, and the positional form works with both.
lgbm_bagging_clf = BaggingClassifier(lgbm_model, n_estimators=1)
mlp_bagging_clf = BaggingClassifier(mlp_model, n_estimators=1)
xgb_bagging_clf = BaggingClassifier(xgb_model, n_estimators=1)
rf_bagging_clf = BaggingClassifier(rf_model, n_estimators=1)
knn_bagging_clf = BaggingClassifier(knn_model, n_estimators=1)
svm_bagging_clf = BaggingClassifier(svm_model, n_estimators=1)

# Combining classifiers into a voting classifier

In [None]:
# F1 score recorded for each model; these double as the voting weights.
lgbm_f1, mlp_f1, xgb_f1, rf_f1, knn_f1 = 0.916, 0.772, 0.919, 0.896, 0.501

# Named estimators and their weights, in matching order. The SVM wrapper is
# not part of this list.
models = [
    ('lgbm', lgbm_bagging_clf),
    ('mlp', mlp_bagging_clf),
    ('xgb', xgb_bagging_clf),
    ('rf', rf_bagging_clf),
    ('knn', knn_bagging_clf),
]
weights = [lgbm_f1, mlp_f1, xgb_f1, rf_f1, knn_f1]

# Creating the LGBM bagging ensemble

In [None]:
# Final model: a bag of 15 LGBM classifiers, also wrapped in a voting classifier.
lgbm_model = lgb.LGBMClassifier(**lgbm_hyperparameters, class_weight = 'balanced', max_depth=-1, objective = 'multiclass', n_jobs = -1, random_seed=39)
# Estimator passed positionally so the call works whether scikit-learn names
# this parameter `base_estimator` (older releases) or `estimator` (newer ones).
lgbm_bagging_clf = BaggingClassifier(lgbm_model, n_estimators=15)

# The estimator name used to be built from `i`, a variable left over from an
# unrelated loop earlier in the notebook; use a fixed name instead.
clfs = [('lgbm0', lgbm_bagging_clf)]
# One weight per estimator: passing the five-element `weights` list for a
# single estimator makes VotingClassifier fail when it is fitted.
combined_voting_clf = VotingClassifier(estimators=clfs, weights=[lgbm_f1], voting='hard')

# Fitting train data and submitting predictions

In [None]:
# Fit the bagged LGBM on the full, unshuffled training set and write the
# test-set predictions to submission.csv.
# The sample weights must be derived from the same label vector that is passed
# to fit(): `trainingtargets` was shuffled during the validation split, so
# weights built from it would be attached to the wrong rows of
# `fulltrainingset`.
# NOTE(review): the LGBM base model is also built with class_weight='balanced',
# so classes are re-balanced twice — confirm that is intended.
target_weights = class_weight.compute_sample_weight('balanced', fulltrainingsettargets)
lgbm_bagging_clf.fit(fulltrainingset, fulltrainingsettargets, sample_weight=target_weights)
test_predictions = lgbm_bagging_clf.predict(testset)
# The header is 'Id', the identifier column's name in the input data.
# NOTE(review): it was previously written as 'id'; confirm against the
# competition's sample submission.
submission = pd.DataFrame({'Id': newtestset['Id'], 'Target': test_predictions})
submission.to_csv('submission.csv', index=False)