In [102]:
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.simplefilter("ignore")

In [91]:
# Read the labelled training data and the separate test file from disk.
# NOTE(review): test_df is not referenced again in the cells visible here.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Separate the 'Survived' label from the predictors, then hold out 20% of the
# rows for evaluation; random_state pins the shuffle so the split is repeatable.
features = train_df.drop(columns=['Survived'])
target = train_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [92]:
# Columns that are discarded before any feature engineering happens.
unused_columns = ['PassengerId', 'Ticket', 'Cabin', 'Embarked']
X_train = X_train.drop(columns=unused_columns)
X_test = X_test.drop(columns=unused_columns)


def extract_title(full_name):
    """Take the second ', '-separated piece of the name and keep what precedes its first '.'."""
    after_comma = full_name.split(', ')[1]
    return after_comma.split('.')[0]


# Overwrite the 'Name' column with just that extracted piece in both splits.
X_train['Name'] = X_train['Name'].apply(extract_title)
X_test['Name'] = X_test['Name'].apply(extract_title)

In [93]:
def categorize_titles(row):
    """Map a row to a numeric category code based on its age and title.

    Reads ``row['Age']`` and ``row['Name']`` (the latter is compared against
    bare titles such as 'Mr' or 'Miss').

    Returns:
        2 when the age is present and below 18, or when the title is 'Master';
        0 for titles in the first group below; 1 for titles in the second
        group; None for any title that is not listed.
    """
    age = row['Age']
    # A known age under 18 takes precedence over whatever the title says.
    if pd.notna(age) and age < 18:
        return 2

    title = row['Name']
    if title in ('Mr', 'Don', 'Rev', 'Dr', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer'):
        return 0
    if title in ('Mrs', 'Miss', 'Mme', 'Ms', 'Lady', 'Mlle', 'the Countess', 'Dona'):
        return 1
    if title == 'Master':
        return 2
    # Unlisted titles get no category.
    return None


# Derive the category code for every row of both splits.
X_train['Category'] = X_train.apply(categorize_titles, axis=1)
X_test['Category'] = X_test.apply(categorize_titles, axis=1)

# Mean known age per category, learned from the training split only.
mean_ages_train = X_train.groupby('Category')['Age'].mean()

# The held-out split is imputed with the training statistics, following the
# same fit-on-train / apply-to-test discipline the StandardScaler cell uses.
# Computing the means from X_test itself would leak evaluation data into the
# preprocessing and could leave a category without a usable mean when the
# held-out split is small.
mean_ages_test = mean_ages_train

# Helper that substitutes a missing age with the mean age of the row's category
def fill_missing_age(row, mean_ages):
    """Return the row's age, or its category's mean age when the age is missing.

    Args:
        row: mapping-like object providing 'Age' and 'Category'.
        mean_ages: lookup indexed by the value stored in ``row['Category']``.

    Returns:
        ``row['Age']`` unchanged when it is not missing, otherwise
        ``mean_ages[row['Category']]``.
    """
    age = row['Age']
    if not pd.isna(age):
        return age
    return mean_ages[row['Category']]

# Fill the missing ages in each split using the mean-age table prepared for it.
X_train['Age'] = X_train.apply(fill_missing_age, axis=1, args=(mean_ages_train,))
X_test['Age'] = X_test.apply(fill_missing_age, axis=1, args=(mean_ages_test,))

# Remove the raw columns that are not passed on to the models.
dropped_features = ['Name', 'Sex', 'SibSp', 'Parch', 'Fare']
X_train = X_train.drop(columns=dropped_features)
X_test = X_test.drop(columns=dropped_features)
X_train

Unnamed: 0,Pclass,Age,Category
331,1,45.500000,0
733,2,23.000000,0
382,3,32.000000,0
704,3,26.000000,0
813,3,6.000000,2
...,...,...,...
106,3,21.000000,1
270,1,33.864353,0
860,3,41.000000,0
435,1,14.000000,2


In [94]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so the held-out data does not influence it.
# Note: this cell rebinds X_train / X_test, so re-running it alone scales twice.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [104]:
# Build and fit the model.
# l1_ratio was removed: per the scikit-learn documentation it is only used
# with penalty='elasticnet' (which liblinear does not support), so alongside
# penalty='l2' it did nothing except trigger a warning.
logreg = LogisticRegression(C=0.01, max_iter=100, penalty='l2', solver='liblinear')
logreg.fit(X_train, y_train)

# Predict on the held-out split and report the accuracy.
y_pred_logreg = logreg.predict(X_test)
print(f'Accuracy (Logistic Regression): {accuracy_score(y_test, y_pred_logreg):.2f}')

Accuracy (Logistic Regression): 0.77


In [106]:
# Build and fit the model.
# max_features='log2' makes each split consider a randomly drawn subset of the
# features, so random_state is pinned (same seed as the train/test split) to
# keep the fitted tree and the reported accuracy reproducible across runs.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=50,
    max_features='log2',
    min_samples_leaf=4,
    min_samples_split=10,
    splitter='best',
    random_state=42,
)
tree.fit(X_train, y_train)

# Predict on the held-out split and report the accuracy.
y_pred_tree = tree.predict(X_test)
print(f'Accuracy (Decision Tree): {accuracy_score(y_test, y_pred_tree):.2f}')

Accuracy (Decision Tree): 0.80


In [97]:
# Build and fit the model.
# The hyperparameter values were left blank in this cell, which is a
# SyntaxError. The values below are scikit-learn's documented defaults, spelled
# out as explicit placeholders until tuned values are chosen. random_state pins
# the bootstrap and feature sampling so the reported accuracy is reproducible.
forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
)
forest.fit(X_train, y_train)

# Predict on the held-out split and report the accuracy.
y_pred_forest = forest.predict(X_test)
print(f'Accuracy (Random Forest): {accuracy_score(y_test, y_pred_forest):.2f}')

Accuracy (Random Forest): 0.78
