# Исследование классификаторов на примере решения задачи Kaggle Titanic. Бондаренко В.А. з5130902/00201

## Загрузка, предобработка и подготовка данных

In [None]:
import pandas as pd
import numpy as np
import statistics
import ydf

# Report the installed YDF version and show floats with four decimals in pandas output.
print(f"Found YDF {ydf.__version__}")
pd.options.display.float_format = '{:.4f}'.format

# Load the train and test CSV files from the local ./data directory.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Peek at the first five training rows.
# NOTE(review): as a bare expression this is only rendered by a notebook when
# it is the last statement of its cell — confirm it is still needed here.
train_df.head(5)

# Convert model predictions into the Kaggle submission format.
def prediction_to_kaggle_format(proba_survive, threshold=0.5, passenger_ids=None):
    """Build a Kaggle submission frame from survival probabilities.

    Args:
        proba_survive: Array-like of survival probabilities, one per passenger.
        threshold: Probabilities greater than or equal to this value are
            labelled 1 (survived), all others 0.
        passenger_ids: Optional passenger identifiers aligned position by
            position with ``proba_survive``. When omitted, the ``PassengerId``
            column of the module-level ``preprocessed_val_df`` is used, which
            is what this function always did before.

    Returns:
        A DataFrame with the columns ``PassengerId`` and ``Survived``.
    """
    if passenger_ids is None:
        # Backward-compatible default: the ids of the global validation frame.
        passenger_ids = preprocessed_val_df["PassengerId"]
    # np.asarray lets plain lists through and keeps the pairing positional.
    survived = (np.asarray(proba_survive) >= threshold).astype(int)
    return pd.DataFrame({"PassengerId": passenger_ids, "Survived": survived})

def make_submission(kaggle_predictions, sub_path):
    """Write the submission frame to ``sub_path`` as CSV (without the index) and report the path."""
    kaggle_predictions.to_csv(sub_path, index=False)
    print(f"Submission exported to {sub_path}")

def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    Candidates are tried in the given order, so when one candidate is
    contained in another (``'Mr'`` in ``'Mrs'``) the longer one must come
    first to be found.

    Args:
        big_string: Text to search. Anything that is not a ``str`` (for
            example the float NaN pandas uses for a missing cell) is treated
            as "no match" instead of raising.
        substrings: Iterable of candidate substrings.

    Returns:
        The first matching candidate, or ``np.nan`` when none matches.
    """
    if not isinstance(big_string, str):
        return np.nan
    return next((candidate for candidate in substrings if candidate in big_string), np.nan)


# Preprocessing:
# 1. Tokenise the passenger names.
# 2. Split the ticket into a numeric part and a textual prefix where possible.
def preprocess(df):
    """Return a copy of ``df`` with a cleaned ``Name`` and two ticket features.

    Args:
        df: Frame with string columns ``Name`` and ``Ticket``.

    Returns:
        A copy of ``df`` where ``Name`` has punctuation stripped from every
        space-separated token, ``Ticket_number`` holds the trailing integer of
        the ticket (0 when the last token is not an integer) and
        ``Ticket_item`` holds the remaining tokens joined with ``_``
        (``"NONE"`` for single-token tickets).
    """
    df = df.copy()

    def normalize_name(x):
        # Strip punctuation from both ends of each token, keep the token order.
        return " ".join(v.strip(",()[].\"'") for v in x.split(" "))

    def is_int(s):
        # Optionally signed run of decimal digits. Slicing instead of indexing
        # keeps the empty string (a ticket ending in a space) from raising, and
        # ``isdecimal`` accepts only characters that ``int`` can parse.
        digits = s[1:] if s[:1] in ('-', '+') else s
        return digits.isdecimal()

    def ticket_number(x):
        val = x.split(" ")[-1]
        return int(val) if is_int(val) else 0

    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[:-1])

    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    return df


# Data cleaning, step 1
def phase1clean(df):
    """Impute missing values and derive the ``Title`` and ``Deck`` columns.

    The frame passed in is left untouched; a cleaned copy is returned, in line
    with ``preprocess``.

    Args:
        df: Frame with at least the columns ``Fare``, ``Cabin``, ``Age``,
            ``Embarked`` and ``Name``.

    Returns:
        A copy of ``df`` with the gaps filled and ``Title`` / ``Deck`` added.
        ``Title`` and ``Deck`` are NaN where no known title / deck letter is
        found.
    """
    df = df.copy()

    # A fare of 0 counts as missing; the fare value 512.3292 is overwritten
    # with 7.25.
    # NOTE(review): the rationale for the 7.25 replacement is not recorded
    # here — confirm it is intended.
    df["Fare"] = df["Fare"].mask(df["Fare"] == 0)
    df["Fare"] = df["Fare"].replace({512.3292: 7.25})

    # Missing cabins get an explicit placeholder.
    df["Cabin"] = df["Cabin"].fillna('Unknown')

    # Missing fares and ages are replaced by the column mean (NaN ignored).
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
    df["Age"] = df["Age"].fillna(df["Age"].mean())

    # Missing ports are replaced by the most frequent port. Series.mode skips
    # NaN and returns whole values; the former statistics.mode(...)[0][0]
    # indexed into the value itself and only worked for one-character codes.
    embarked_modes = df["Embarked"].mode()
    if not embarked_modes.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_modes.iloc[0])

    # Derive "Title" from "Name". Order matters: a title that contains another
    # one ('Mrs' / 'Mr', 'Dona' / 'Don') has to be listed first.
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                  'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'the Countess',
                  'Dona', 'Don', 'Jonkheer', 'Lady']

    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Collapse the raw titles into a handful of groups.
    normalized_titles = {
        "Capt":         "Officer",
        "Col":          "Officer",
        "Major":        "Officer",
        "Jonkheer":     "Royal",
        "Don":          "Royal",
        "Sir":          "Royal",
        "Dr":           "Master",
        "Rev":          "Master",
        "the Countess": "Royal",
        "Dona":         "Royal",
        "Mme":          "Mrs",
        "Mlle":         "Miss",
        "Ms":           "Mrs",
        "Mr":           "Mr",
        "Mrs":          "Mrs",
        "Miss":         "Miss",
        "Master":       "Master",
        "Lady":         "Royal",
    }
    df["Title"] = df["Title"].map(normalized_titles)

    # Derive "Deck" from "Cabin": the first listed deck letter (or the
    # 'Unknown' placeholder) that occurs in the cabin string.
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck'] = df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

    return df

# Data cleaning, step 2
def phase2clean(df):
    """Return ``df`` without the raw ``Ticket`` and ``Cabin`` columns.

    Derived features that were tried and are currently disabled: family size
    (``SibSp + Parch``), fare per person (``Fare / (Family_Size + 1)``) and
    ``Age * Pclass``.
    """
    return df.drop(columns=['Ticket', 'Cabin'])

def clean_data(train_data, test_data):
    """Run both cleaning phases on the train and test frames and report on them.

    For each frame the per-column count of missing values and the
    ``DataFrame.info()`` summary are printed after cleaning.

    Args:
        train_data: Preprocessed training frame.
        test_data: Preprocessed test frame.

    Returns:
        A two-element list ``[train_data, test_data]`` with the cleaned frames.
    """
    # Dropping the passengers with a questionable (missing) fare is disabled:
    # PassengerId 259, 680 and 738 are kept.
    cleaned = []
    for label, frame in (("train", train_data), ("test", test_data)):
        frame = phase2clean(phase1clean(frame))

        # The missing-value counts used to be computed and then discarded;
        # print them so the check is actually visible.
        print(f"Missing values per column ({label}):")
        print(frame.isna().sum())
        frame.info()

        cleaned.append(frame)

    return cleaned


# Switch for the additional cleaning pass (clean_data) that follows preprocess.
do_data_cleaning = True

preprocessed_train_df = preprocess(train_df)
preprocessed_val_df = preprocess(test_df)

# Optionally run the additional cleaning on both frames.
if do_data_cleaning:
    preprocessed_train_df, preprocessed_val_df = clean_data(preprocessed_train_df, preprocessed_val_df)

# Show the summary of the training frame and its first 10 rows.
preprocessed_train_df.info()
preprocessed_train_df.head(10)


## Визуализация распределения части исходных данных в виде гистограмм

In [None]:
import matplotlib.pyplot as plt

# Draw one histogram per column of the training frame, leaving out the
# identifier, name, cabin and ticket-derived columns.
for column in preprocessed_train_df.columns:
    if column in ('Name', 'PassengerId', 'Cabin', 'Ticket_item', 'Ticket_number'):
        continue

    plt.hist(preprocessed_train_df[column])
    plt.title(column)
    plt.show()


## Автоматический подбор оптимальных параметров модели при помощи RandomSearchTuner

In [None]:
import os
tmpl = ydf.GradientBoostedTreesLearner.hyperparameter_templates()

# Автоматический подбор оптимальных параметров модели при помощи RandomSearchTuner
tuner = ydf.RandomSearchTuner(num_trials=150)
tuner.choice("shrinkage", [0.325, 0.3, 0.275])
tuner.choice("subsample", [1.0, 0.95, 0.9])
tuner.choice("max_depth", [3, 4, 5, 6, 7])
tuner.choice("num_candidate_attributes_ratio", [0.425, 0.4, 0.375, 0.35])
tuner.choice("num_trees", [10, 50, 100, 300])

model = ydf.GradientBoostedTreesLearner(label="Survived",
                                        # Такие дефолтные параметры подобрались RandomSearchTuner
                                        # (однако генерируемые без параметров модели работают в целом лучше, при валидации на kaggle):
                                        # num_trees=100,
                                        # tuner=tuner,
                                        # max_depth=6,
                                        # shrinkage=0.325,
                                        # subsample=0.9,
                                        # num_candidate_attributes_ratio=0.375,
                                        ).train(preprocessed_train_df)

# Выведем характеристики полученной модели и саму модель в виде дерева
model.describe()
model.plot_tree()
print(model.print_tree())

tuned_self_evaluation = model.evaluate(preprocessed_train_df)
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss:{tuned_self_evaluation.loss}")

sub_path = os.path.join(os.path.abspath(os.getcwd()), "finetune_GBTrees.csv")
make_submission(prediction_to_kaggle_format(model.predict(preprocessed_val_df)), sub_path)
!head $sub_path

## Применение ансамбля классификаторов GradientBoostedTrees

In [None]:
import os

predictions = None
num_predictions = 0
ensemble_size = 150

# Применение ансамбля классификаторов
print(f"Train classifier ensemble of size {ensemble_size}")
for i in range(ensemble_size):
    print(f"Make prediction: {i}")

    # Здесь можно выбрать модели
    # GradientBoostedTreesModel / RandomForestModel / IsolationForestLearner
    # Модель на основе GradientBoostedTrees работает лучше остальных
    model = ydf.GradientBoostedTreesLearner(label="Survived", honest=True, random_seed=i).train(preprocessed_train_df)
    sub_predictions = model.predict(preprocessed_val_df)

    if predictions is None:
        predictions = sub_predictions
    else:
        predictions += sub_predictions
    num_predictions += 1

predictions/=num_predictions

kaggle_predictions = pd.DataFrame({
    "PassengerId": preprocessed_val_df["PassengerId"],
    "Survived": (predictions >= 0.5).astype(int)
})

sub_path = os.path.join(os.path.abspath(os.getcwd()), "ensemble_GBTrees_{}.csv".format(ensemble_size))
make_submission(kaggle_predictions, sub_path)
!head $sub_path

## Тестирование множества классификаторов и сравнение по метрикам Accuracy, R2 и F1

In [None]:

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, r2_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Try classifiers from outside the YDF package as well.

# Classifiers that cannot work with categories need numeric inputs, so every
# object-typed column of the training frame is label-encoded.
s = (preprocessed_train_df.dtypes == 'object')
object_cols = list(s[s].index)

le = LabelEncoder()

encoded_train = preprocessed_train_df.copy()
encoded_val = preprocessed_val_df.copy()

for column in object_cols:
    print(f'Encoding categorical column {column}')
    # Fit on the values of BOTH frames so that a category receives the same
    # code in train and validation. Fitting each frame separately gave the
    # same category unrelated codes in the two frames.
    le.fit(pd.concat([encoded_train[column], encoded_val[column]]).values)
    encoded_train[column] = le.transform(encoded_train[column].values)
    encoded_val[column] = le.transform(encoded_val[column].values)

# NOTE(review): the feature set is every validation column, which still
# includes PassengerId and the encoded Name — presumably not meant as
# predictive features; confirm.
X = encoded_train[preprocessed_val_df.columns]
y = encoded_train['Survived']

# The Kaggle validation frame has no Survived labels, so 15% of the training
# data is held out to measure classifier quality.
raw_X_train, raw_X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# Scale the features to improve convergence (StandardScaler() is an
# alternative). The scaler is fit on the training split only and then reused:
# refitting it on the hold-out split scaled that split differently and also
# changed the scaling later applied to the validation frame.
sc_X = MinMaxScaler()
cols = preprocessed_val_df.columns
normalized_train_arr = sc_X.fit_transform(raw_X_train)
normalized_test_arr = sc_X.transform(raw_X_test)
normalized_val_arr = sc_X.transform(encoded_val[cols])

# 1. A single ydf GradientBoostedTrees model with the tuned hyper-parameters.
#    YDF consumes data frames, so the scaled arrays are wrapped again and the
#    label column is attached.
train_data_for_ydf = pd.DataFrame.from_records(
    normalized_train_arr, columns=preprocessed_val_df.columns
).assign(Survived=y_train.to_numpy())
test_data_for_ydf = pd.DataFrame.from_records(
    normalized_test_arr, columns=preprocessed_val_df.columns
).assign(Survived=y_test.to_numpy())

model = ydf.GradientBoostedTreesLearner(
    label="Survived",
    num_trees=100,
    max_depth=6,
    shrinkage=0.325,
    subsample=0.9,
    num_candidate_attributes_ratio=0.375,
).train(train_data_for_ydf)
ydf_single_predictions = model.predict(test_data_for_ydf)
y_pred_ydf_single = (ydf_single_predictions >= 0.5).astype(int)
ydf_gbt_single_acc, ydf_gbt_single_r2, ydf_gbt_single_f1 = (
    metric(y_test, y_pred_ydf_single) for metric in (accuracy_score, r2_score, f1_score)
)
print('YDF GBT single, fine tuned, classification report: \n{}'.format(classification_report(y_test, y_pred_ydf_single)))

# 2. An ensemble of ydf GradientBoostedTrees models that differ only in their
#    random seed; their probabilities are averaged.
ensemble_size = 100
num_predictions = 0
ydf_ensemble_predictions = None
for i in range(ensemble_size):
    model = ydf.GradientBoostedTreesLearner(label="Survived", honest=True, random_seed=i).train(train_data_for_ydf)
    sub_predictions = model.predict(test_data_for_ydf)

    ydf_ensemble_predictions = (
        sub_predictions if ydf_ensemble_predictions is None
        else ydf_ensemble_predictions + sub_predictions
    )
    num_predictions += 1

ydf_ensemble_predictions = ydf_ensemble_predictions / num_predictions

y_pred_ydf_ensemble = (ydf_ensemble_predictions >= 0.5).astype(int)
ydf_gbt_ensemble_acc, ydf_gbt_ensemble_r2, ydf_gbt_ensemble_f1 = (
    metric(y_test, y_pred_ydf_ensemble) for metric in (accuracy_score, r2_score, f1_score)
)
print('YDF GBT ensemble N={}, classification report: \n{}'.format(ensemble_size, classification_report(y_test, y_pred_ydf_ensemble)))

def _fit_and_score(estimator, report_template):
    """Fit ``estimator`` on the scaled training split and score it on the hold-out split.

    Prints the classification report through ``report_template`` and returns
    the positive-class probabilities, the 0/1 labels at a 0.5 cut-off, and the
    accuracy, R2 and F1 of those labels (in that order).
    """
    estimator.fit(normalized_train_arr, y_train)
    probabilities = estimator.predict_proba(normalized_test_arr)[:, 1]
    labels = (probabilities >= 0.5).astype(int)
    accuracy = accuracy_score(y_test, labels)
    r2 = r2_score(y_test, labels)
    f1 = f1_score(y_test, labels)
    print(report_template.format(classification_report(y_test, labels)))
    return probabilities, labels, accuracy, r2, f1


# 3. Logistic regression; GridSearchCV selects the model parameters.
pipeline = Pipeline([('logisticregression', LogisticRegression(max_iter=100))])
param_grid = {'logisticregression__penalty' : ['l2'],
              'logisticregression__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'logisticregression__solver' : ['liblinear']}
model = GridSearchCV(pipeline, param_grid, cv =None)
(y_pred_logistic_predictions, y_pred_logistic,
 logistic_acc, logistic_r2, logistic_f1) = _fit_and_score(
    model, 'Logistic regression classification report: \n{}')


# 4. RandomForestClassifier from sklearn, also grid-searched.
pipeline = Pipeline([('RandomForest', RandomForestClassifier())])
param_grid = {
  'RandomForest__min_samples_leaf': [1, 2, 4],
  'RandomForest__min_samples_split': [2, 3, 4],
  'RandomForest__n_estimators': [100, 200, 300]}
model = GridSearchCV(pipeline, param_grid, cv =None)
(y_pred_forest_predictions, y_pred_forest,
 forest_acc, forest_r2, forest_f1) = _fit_and_score(
    model, 'RandomForest classification report: \n{}')

# 5. XGBClassifier with default parameters.
model = XGBClassifier()
(y_pred_XGB_predictions, y_pred_XGB,
 XGB_acc, XGB_r2, XGB_f1) = _fit_and_score(
    model, 'XGBClassifier classification report: \n{}')

# 6. k-nearest-neighbours classifier with k = 3.
model = KNeighborsClassifier(n_neighbors=3)
(y_pred_KNN_predictions, y_pred_KNN,
 KNN_acc, KNN_r2, KNN_f1) = _fit_and_score(
    model, 'KNN Classifier classification report: \n{}')

# 7. SVM (SVC) classifier with the default rbf kernel.
model = SVC(probability=True)
(y_pred_SVC_predictions, y_pred_SVC,
 SVC_acc, SVC_r2, SVC_f1) = _fit_and_score(
    model, 'SVC Classifier classification report: \n{}')

# 8. SVM classifier with a linear kernel.
model = SVC(kernel='linear',probability=True)
(y_pred_LSVC_predictions, y_pred_LSVC,
 LinearSVC_acc, LinearSVC_r2, LinearSVC_f1) = _fit_and_score(
    model, 'LinearSVC Classifier classification report: \n{}')

# 9. LightGBM classifier with default parameters.
model = LGBMClassifier()
(y_pred_LGBM_predictions, y_pred_LGBM,
 LGBM_acc, LGBM_r2, LGBM_f1) = _fit_and_score(
    model, 'LGBM Classifier classification report: \n{}')

# 10. Ensemble of all classifiers used above: the plain mean of their nine
#     probability vectors, thresholded at 0.5.
multi_model_ensemble_predictions = (
    ydf_single_predictions + ydf_ensemble_predictions + y_pred_forest_predictions
    + y_pred_logistic_predictions + y_pred_KNN_predictions + y_pred_SVC_predictions
    + y_pred_LSVC_predictions + y_pred_XGB_predictions + y_pred_LGBM_predictions
) / 9

multi_model_ensemble = (multi_model_ensemble_predictions >= 0.5).astype(int)
multi_model_ensemble_acc = accuracy_score(y_test, multi_model_ensemble)
multi_model_ensemble_r2 = r2_score(y_test, multi_model_ensemble)
multi_model_ensemble_f1 = f1_score(y_test, multi_model_ensemble)
print('Multi model ensemble classification report: \n{}'.format(classification_report(y_test, multi_model_ensemble)))


def _ranked_leaderboard(scores, metric_name):
    """Return ``scores`` (name -> value) as a one-column frame, best value first."""
    ordered = dict(sorted(scores.items(), key=lambda entry: entry[1], reverse = True))
    return pd.DataFrame.from_dict(ordered, orient='index', columns=[metric_name])


# One row per classifier: (name, accuracy, F1, R2).
classifier_scores = [
    ("ydf GBT finetune", ydf_gbt_single_acc, ydf_gbt_single_f1, ydf_gbt_single_r2),
    ("ydf GBT ensemble N={}".format(ensemble_size), ydf_gbt_ensemble_acc, ydf_gbt_ensemble_f1, ydf_gbt_ensemble_r2),
    ("Random Forest", forest_acc, forest_f1, forest_r2),
    ("logistic regression", logistic_acc, logistic_f1, logistic_r2),
    ("XGB", XGB_acc, XGB_f1, XGB_r2),
    ("KNeighborsClassifier", KNN_acc, KNN_f1, KNN_r2),
    ("SVC", SVC_acc, SVC_f1, SVC_r2),
    ("Linear SVC", LinearSVC_acc, LinearSVC_f1, LinearSVC_r2),
    ("lightgbm", LGBM_acc, LGBM_f1, LGBM_r2),
    ("multi model ensemble", multi_model_ensemble_acc, multi_model_ensemble_f1, multi_model_ensemble_r2),
]

leaderboard_acc = {name: acc for name, acc, _f1, _r2 in classifier_scores}
leaderboard_F1 = {name: f1 for name, _acc, f1, _r2 in classifier_scores}
leaderboard_r2 = {name: r2 for name, _acc, _f1, r2 in classifier_scores}

# Compare the classifiers by accuracy, R2 and F1, each sorted best first.
leaderboard_acc = _ranked_leaderboard(leaderboard_acc, 'Accuracy')
print('\n----------------------------------------------------------------------------')
print(leaderboard_acc)

leaderboard_r2 = _ranked_leaderboard(leaderboard_r2, 'R2')
print('\n----------------------------------------------------------------------------')
print(leaderboard_r2)

leaderboard_F1 = _ranked_leaderboard(leaderboard_F1, 'F1')
print('\n----------------------------------------------------------------------------')
print(leaderboard_F1)


## Формирование submission на kaggle для всех протестированных классификаторов

In [None]:
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, r2_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Train every classifier tested above on the full training data and write one
# Kaggle submission per classifier.

# Classifiers that cannot work with categories need numeric inputs, so every
# object-typed column of the training frame is label-encoded.
s = (preprocessed_train_df.dtypes == 'object')
object_cols = list(s[s].index)

le = LabelEncoder()
encoded_train = preprocessed_train_df.copy()
encoded_val = preprocessed_val_df.copy()

for column in object_cols:
    print(f'Encoding categorical column {column}')
    # Fit on the values of BOTH frames so that a category receives the same
    # code in train and validation. Fitting each frame separately gave the
    # same category unrelated codes, so models trained on the train codes
    # were misreading the validation features.
    le.fit(pd.concat([encoded_train[column], encoded_val[column]]).values)
    encoded_train[column] = le.transform(encoded_train[column].values)
    encoded_val[column] = le.transform(encoded_val[column].values)

# NOTE(review): the feature set is every validation column, which still
# includes PassengerId and the encoded Name — presumably not meant as
# predictive features; confirm.
raw_X_train = encoded_train[preprocessed_val_df.columns]
y_train = encoded_train['Survived']

# Scale the features to improve convergence (StandardScaler() is an
# alternative); the scaler is fit on the training data and reused for validation.
sc_X = MinMaxScaler()
cols = preprocessed_val_df.columns
normalized_train_arr = sc_X.fit_transform(raw_X_train)
normalized_val_arr = sc_X.transform(encoded_val[cols])

# 1. A single ydf GradientBoostedTrees model with the tuned hyper-parameters,
#    trained on the unscaled, unencoded training frame.
model = ydf.GradientBoostedTreesLearner(
    label="Survived",
    num_trees=100,
    max_depth=6,
    shrinkage=0.325,
    subsample=0.9,
    num_candidate_attributes_ratio=0.375,
).train(preprocessed_train_df)
ydf_single_predictions = model.predict(preprocessed_val_df)
sub_path = os.path.join(os.path.abspath(os.getcwd()), "ydf_GBT_tuned.csv")
make_submission(prediction_to_kaggle_format(ydf_single_predictions), sub_path)

# 2. An ensemble of ydf GradientBoostedTrees models that differ only in their
#    random seed; their probabilities are averaged.
ensemble_size = 100
num_predictions = 0
ydf_ensemble_predictions = None
for i in range(ensemble_size):
    model = ydf.GradientBoostedTreesLearner(label="Survived", honest=True, random_seed=i).train(preprocessed_train_df)
    sub_predictions = model.predict(preprocessed_val_df)

    ydf_ensemble_predictions = (
        sub_predictions if ydf_ensemble_predictions is None
        else ydf_ensemble_predictions + sub_predictions
    )
    num_predictions += 1

ydf_ensemble_predictions = ydf_ensemble_predictions / num_predictions
sub_path = os.path.join(os.path.abspath(os.getcwd()), "ydf_GBT_ensemble_N_{}.csv".format(ensemble_size))
make_submission(prediction_to_kaggle_format(ydf_ensemble_predictions), sub_path)

def _fit_and_submit(estimator, file_name):
    """Fit ``estimator`` on the full scaled training data and export its submission.

    The positive-class probabilities for the scaled validation data are
    converted with ``prediction_to_kaggle_format`` and written to
    ``file_name`` in the current working directory.

    Returns the probabilities and the path of the written file.
    """
    estimator.fit(normalized_train_arr, y_train)
    probabilities = estimator.predict_proba(normalized_val_arr)[:, 1]
    target = os.path.join(os.path.abspath(os.getcwd()), file_name)
    make_submission(prediction_to_kaggle_format(probabilities), target)
    return probabilities, target


# 3. Logistic regression; GridSearchCV selects the model parameters.
pipeline = Pipeline([('logisticregression', LogisticRegression(max_iter=100))])
param_grid = {'logisticregression__penalty' : ['l2'],
              'logisticregression__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'logisticregression__solver' : ['liblinear']}
model = GridSearchCV(pipeline, param_grid, cv =None)
y_pred_logistic_predictions, sub_path = _fit_and_submit(model, "logistic_regression.csv")


# 4. RandomForestClassifier from sklearn, also grid-searched.
pipeline = Pipeline([('RandomForest', RandomForestClassifier())])
param_grid = {
  'RandomForest__min_samples_leaf': [1, 2, 4],
  'RandomForest__min_samples_split': [2, 3, 4],
  'RandomForest__n_estimators': [100, 200, 300]}
model = GridSearchCV(pipeline, param_grid, cv =None)
y_pred_forest_predictions, sub_path = _fit_and_submit(model, "RandomForestClassifier.csv")

# 5. XGBClassifier with default parameters.
model = XGBClassifier()
y_pred_XGB_predictions, sub_path = _fit_and_submit(model, "XGB_Classifier.csv")

# 6. k-nearest-neighbours classifier with k = 3.
model = KNeighborsClassifier(n_neighbors=3)
y_pred_KNN_predictions, sub_path = _fit_and_submit(model, "KNN_Classifier.csv")

# 7. SVM (SVC) classifier with the default rbf kernel.
model = SVC(probability=True)
y_pred_SVC_predictions, sub_path = _fit_and_submit(model, "SVC_RBF_Classifier.csv")

# 8. SVM classifier with a linear kernel.
model = SVC(kernel='linear',probability=True)
y_pred_LSVC_predictions, sub_path = _fit_and_submit(model, "LinearSVC_Classifier.csv")

# 9. LightGBM classifier with default parameters.
model = LGBMClassifier()
y_pred_LGBM_predictions, sub_path = _fit_and_submit(model, "LGBM_Classifier.csv")

# 10. Ensemble of all classifiers used above: the plain mean of their nine
#     probability vectors, exported as one more submission.
multi_model_ensemble_predictions = (
    ydf_single_predictions + ydf_ensemble_predictions + y_pred_forest_predictions
    + y_pred_logistic_predictions + y_pred_KNN_predictions + y_pred_SVC_predictions
    + y_pred_LSVC_predictions + y_pred_XGB_predictions + y_pred_LGBM_predictions
) / 9

sub_path = os.path.join(os.path.abspath(os.getcwd()), "multi_model_ensemble.csv")
make_submission(prediction_to_kaggle_format(multi_model_ensemble_predictions), sub_path)
