In [None]:
import pandas as pd
import numpy as np
import statistics
import ydf

from sklearn.preprocessing import LabelEncoder

print(f"Found YDF {ydf.__version__}")

# Load the data: both CSV splits are read from ./data, relative to the working directory
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# NOTE(review): this expression is not the last statement of the cell, so presumably
# nothing is rendered for it — confirm whether an explicit display() was intended.
train_df.head(5)

def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Candidates are tried in the order given, so earlier entries win when several
    of them occur. ``np.nan`` is returned when none of them is found.
    """
    hits = (candidate for candidate in substrings if big_string.find(candidate) >= 0)
    return next(hits, np.nan)


# Data preprocessing
# 1. Tokenise the names
# 2. Extract the ticket number when possible (currently disabled, see below)
def preprocess(df):
    """Return a copy of ``df`` whose ``Name`` column has punctuation stripped per token.

    The frame passed in is left untouched. The ticket helpers are defined but only
    referenced by the commented-out assignments at the end of the function.
    """
    stripped_chars = ",()[].\"'"

    def normalize_name(raw_name):
        # Split on single spaces and trim the punctuation from both ends of each piece.
        pieces = []
        for piece in raw_name.split(" "):
            pieces.append(piece.strip(stripped_chars))
        return " ".join(pieces)

    def is_int(text):
        # An optional leading sign followed by digits only.
        # NOTE: indexes text[0], so an empty string raises IndexError.
        digits = text[1:] if text[0] in ('-', '+') else text
        return digits.isdigit()

    def ticket_number(ticket):
        # Last space-separated field as an int, or 0 when it is not an integer.
        last_field = ticket.split(" ")[-1]
        if is_int(last_field):
            return int(last_field)
        return 0

    def ticket_item(ticket):
        # Everything before the last field joined with "_", or "NONE" when there is no prefix.
        *prefix, _last = ticket.split(" ")
        return "_".join(prefix) if prefix else "NONE"

    result = df.copy()
    result["Name"] = result["Name"].apply(normalize_name)
    # result["Ticket_number"] = result["Ticket"].apply(ticket_number)
    # result["Ticket_item"] = result["Ticket"].apply(ticket_item)
    return result


# Data cleaning, step 1
def phase1clean(df):
    """Impute missing values and derive a grouped ``Title`` column from ``Name``.

    Works on a copy, so the frame passed in is never modified.

    Steps:
      * ``Fare``: a fare of 0 is treated as missing; missing fares are replaced
        by the mean of the known fares.
      * ``Cabin``: missing values become the string ``'Unknown'``.
      * ``Age``: missing values are replaced by the mean age.
      * ``Embarked``: missing values are replaced by the most frequent value
        (the first one in sorted order on a tie); left as is when every value
        is missing.
      * ``Title``: the first whole word of ``Name`` that is a known title,
        mapped to its group (``Mr``, ``Mrs``, ``Miss``, ``Master``,
        ``Officer``, ``Royal``); ``NaN`` when the name holds no known title.

    Args:
        df: DataFrame with ``Fare``, ``Cabin``, ``Age``, ``Embarked`` and
            ``Name`` columns.

    Returns:
        A new DataFrame with the imputed columns and the added ``Title`` column.
    """
    df = df.copy()

    # Zero fares are treated as unknown and imputed together with the NaN fares.
    known_fare = df["Fare"].mask(df["Fare"] == 0)
    #df.Fare = df.Fare.replace({ 512.3292 : 7.25})
    df["Fare"] = known_fare.fillna(known_fare.mean())

    # Missing cabins become "Unknown"
    df["Cabin"] = df["Cabin"].fillna('Unknown')

    # Missing ages become the mean age
    df["Age"] = df["Age"].fillna(df["Age"].mean())

    # Missing ports become the most frequent port. Series.mode ignores NaN and
    # returns whole values, so this also works for multi-character labels.
    embarked_modes = df["Embarked"].mode(dropna=True)
    if not embarked_modes.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_modes.iloc[0])

    # Known title words and the group each one is folded into.
    title_groups = {
        "Capt":       "Officer",
        "Col":        "Officer",
        "Major":      "Officer",
        "Jonkheer":   "Royal",
        "Don":        "Royal",
        "Sir":        "Royal",
        "Dr":         "Master",
        "Rev":        "Master",
        "Countess":   "Royal",   # appears in names as "the Countess"
        "Dona":       "Royal",
        "Mme":        "Mrs",
        "Mlle":       "Miss",
        "Ms":         "Mrs",
        "Mr":         "Mr",
        "Mrs":        "Mrs",
        "Miss":       "Miss",
        "Master":     "Master",
        "Lady":       "Royal",
    }

    def _title_group(name):
        # Match whole words in reading order: a substring search would let the
        # "Mrs" in a quoted alias win over the real title earlier in the name,
        # and would find "Mr" or "Don" inside longer words.
        if not isinstance(name, str):
            return np.nan
        for token in name.split():
            group = title_groups.get(token.strip(",()[].\"'"))
            if group is not None:
                return group
        return np.nan

    # Build the "Title" column from "Name"
    df["Title"] = df["Name"].map(_title_group)

    # Build the "Deck" column from "Cabin" (disabled)
    #cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    #df['Deck'] = df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

    return df

# Data cleaning, step 2
def phase2clean(df):
    """Drop the ``Ticket`` column and integer-encode the categorical columns.

    ``Sex``, ``Embarked``, ``Name`` and ``Title`` are each replaced by the codes a
    ``LabelEncoder`` assigns when fitted on that column of ``df`` alone, so the
    codes depend on the values present in the frame that is passed in.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    # Disabled feature experiments, kept for reference:
    # family size from the indirect fields
    # df['Family_Size'] = df['SibSp'] + df['Parch']
    # fare per person
    # df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size']+1)
    # age * class
    # df['Age*Class'] = df['Age'] * df['Pclass']

    # Drop the columns that are not needed (drop returns a new frame)
    df = df.drop(columns=["Ticket"])
    #df = df.drop(['Ticket_item'], axis=1)
    #df = df.drop(['Cabin'], axis=1)

    encoder = LabelEncoder()
    # The encoder is refitted for every column, in this order.
    # ("Deck" was encoded here too while that column existed.)
    for column in ("Sex", "Embarked", "Name", "Title"):
        df[column] = encoder.fit_transform(df[column].values)

    return df

def clean_data(train_data, test_data):
    """Run both cleaning phases on the train and test frames and report the result.

    Each frame goes through ``phase1clean`` and then ``phase2clean``
    independently. For each cleaned frame the remaining NaN count per column
    and the ``DataFrame.info()`` summary are printed.

    Args:
        train_data: training frame accepted by ``phase1clean``.
        test_data: test frame accepted by ``phase1clean``.

    Returns:
        ``[train_data, test_data]`` — a two-element list with the cleaned frames,
        in that order.
    """
    # Suspicious passengers (NaN in the fare)
    # train_data = train_data[(train_data.PassengerId != 259) & (train_data.PassengerId != 680) & (train_data.PassengerId != 738)]

    cleaned = []
    for label, frame in (("train", train_data), ("test", test_data)):
        frame = phase2clean(phase1clean(frame))

        # A bare `frame.isna().sum()` inside a function is evaluated and thrown
        # away, so the report has to be printed explicitly.
        print(f"Remaining NaN values per column ({label}):")
        print(frame.isna().sum())
        frame.info()

        cleaned.append(frame)

    return cleaned


# Select the feature columns that will be used for training
# NOTE(review): no column selection happens in this cell; the flag below only
# decides whether clean_data() runs after preprocess().
do_data_cleaning = True

# preprocess() returns copies, so train_df / test_df keep the raw data.
preprocessed_train_df = preprocess(train_df)
preprocessed_test_df = preprocess(test_df)

# Optionally, also clean/filter the data in the dataset
if do_data_cleaning:
    preprocessed_train_df, preprocessed_test_df = clean_data(preprocessed_train_df, preprocessed_test_df)

# Last expression of the cell: shows up to the first 800 rows of the training frame.
preprocessed_train_df.head(800)


In [None]:
from sklearn.preprocessing import MinMaxScaler


# Whether to scale the features (not always applicable)
do_feature_scaling = False
if do_feature_scaling:
    # Scale only the numeric feature columns present in both frames. The label
    # and the row id must keep their values, string columns cannot be scaled,
    # and both frames must stay DataFrames because the later cells select
    # columns by name.
    scaled_columns = [
        column
        for column in preprocessed_train_df.select_dtypes(include="number").columns
        if column not in ("Survived", "PassengerId") and column in preprocessed_test_df.columns
    ]

    sc_X = MinMaxScaler()
    # Work on copies so the frames produced by the previous cell are not modified in place.
    preprocessed_train_df = preprocessed_train_df.copy()
    preprocessed_test_df = preprocessed_test_df.copy()
    # Fit on the training split only, then apply the same ranges to the test split.
    preprocessed_train_df[scaled_columns] = sc_X.fit_transform(preprocessed_train_df[scaled_columns])
    preprocessed_test_df[scaled_columns] = sc_X.transform(preprocessed_test_df[scaled_columns])

In [None]:

# Hyperparameter templates of the learner; `tmpl` is not used again in this cell.
tmpl = ydf.GradientBoostedTreesLearner.hyperparameter_templates()

# Random-search space over five hyperparameters.
# NOTE: the tuner is only referenced by the commented-out `tuner=tuner` argument
# below, so as written the model is trained without it.
tuner = ydf.RandomSearchTuner(num_trials=150)
tuner.choice("shrinkage", [0.325, 0.3, 0.275])
tuner.choice("subsample", [1.0, 0.95, 0.9])
tuner.choice("max_depth", [3, 4, 5, 6, 7])
tuner.choice("num_candidate_attributes_ratio", [0.425, 0.4, 0.375, 0.35])
tuner.choice("num_trees", [10, 50, 100, 300])

# Train with only the label specified; every other argument is commented out.
model = ydf.GradientBoostedTreesLearner(label="Survived",
                                        #num_trees=100,
                                        #tuner=tuner,
                                        #max_depth=6,
                                        #shrinkage=0.325,
                                        #subsample=0.9,
                                        #num_candidate_attributes_ratio=0.375,
                                        ).train(preprocessed_train_df)
model.describe()
# The model is evaluated on the same frame it was trained on, so these numbers
# describe the fit to the training data, not performance on held-out data.
tuned_self_evaluation = model.evaluate(preprocessed_train_df)
print(f"Accuracy: {tuned_self_evaluation.accuracy} Loss:{tuned_self_evaluation.loss}")


In [None]:
import os

def prediction_to_kaggle_format(model, threshold=0.5, test_df=None):
    """Turn a model's predictions into a Kaggle submission frame.

    Args:
        model: object exposing ``predict(frame)`` that returns one score per
            row of ``frame`` (used here as the score for ``Survived == 1``).
        threshold: scores greater than or equal to this value become 1,
            all others 0.
        test_df: frame to predict on; must contain a ``PassengerId`` column.
            Defaults to the notebook-level ``preprocessed_test_df`` so existing
            calls keep working.

    Returns:
        DataFrame with the columns ``PassengerId`` and ``Survived`` (0/1 ints).
    """
    if test_df is None:
        test_df = preprocessed_test_df

    # Coerce to an ndarray so list-like results can be thresholded too and the
    # scores are paired with the ids by position.
    proba_survive = np.asarray(model.predict(test_df))
    return pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": (proba_survive >= threshold).astype(int),
    })

def make_submission(kaggle_predictions, sub_path):
    """Write ``kaggle_predictions`` to ``sub_path`` as CSV without the index column.

    Prints the destination path once the file has been written.
    """
    kaggle_predictions.to_csv(sub_path, index=False)
    print(f"Submission exported to {sub_path}")

# Predict with the single model trained above and export the result to the
# current working directory.
kaggle_predictions = prediction_to_kaggle_format(model)

sub_path = os.path.abspath("submission_def.csv")
make_submission(kaggle_predictions, sub_path)
!head $sub_path


In [None]:
import os

predictions = None   # running sum of the per-model predictions; None until the first model
num_predictions = 0  # number of models accumulated so far
ensemble_size = 100  # number of models to train

# Apply an ensemble of classifiers: the models differ only in random_seed and
# their predictions on the test frame are averaged.
for i in range(ensemble_size):
    print(f"Make prediction: {i}")

    # GradientBoostedTreesModel / RandomForestModel / IsolationForestLearner
    # NOTE: rebinds the notebook-level name `model` on every iteration, so after
    # this cell `model` refers to the last ensemble member.
    model = ydf.GradientBoostedTreesLearner(label="Survived", honest=True, random_seed=i).train(preprocessed_train_df)
    sub_predictions = model.predict(preprocessed_test_df)

    if predictions is None:
        predictions = sub_predictions
    else:
        predictions += sub_predictions
    num_predictions += 1

# Turn the sum into the mean prediction.
# NOTE: with ensemble_size == 0 `predictions` is still None here and this line raises.
predictions/=num_predictions

# Same 0.5 cut-off and column layout as prediction_to_kaggle_format().
kaggle_predictions = pd.DataFrame({
    "PassengerId": preprocessed_test_df["PassengerId"],
    "Survived": (predictions >= 0.5).astype(int)
})

sub_path = os.path.join(os.path.abspath(os.getcwd()), "ensemble_GBTL_100_ft_clean_v3.csv")
make_submission(kaggle_predictions, sub_path)
!head $sub_path