## Загрузка данных и предобработка

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression  # Добавлен этот импорт
from sklearn.metrics import f1_score, classification_report
from sklearn.dummy import DummyClassifier

# Load the raw training table (expects train.csv in the working directory).
df = pd.read_csv('train.csv')

# Drop the identifier / free-text columns that are not used as features below.
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

X = df.drop('Survived', axis=1)
y = df['Survived']

# Split BEFORE imputing: the fill values must be learned from the training
# rows only, otherwise statistics of the held-out rows leak into training.
# The split itself is unchanged (same random_state, same number of rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Imputation statistics estimated on the training split only.
fill_values = {
    'Age': X_train['Age'].median(),
    'Embarked': X_train['Embarked'].mode()[0],
}

# fillna returns new frames, so the split results are not mutated in place
# (avoids SettingWithCopyWarning on the slices returned by the splitter).
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

# Keep df and X free of missing Age/Embarked values as before, in case later
# cells use them; they are filled with the same training-split statistics.
df = df.fillna(fill_values)
X = X.fillna(fill_values)

## Выбор метрики качества
**Метрика:** F1-score

**Обоснование:**

Классы умеренно несбалансированы (примерно 62% погибших и 38% выживших).

F1-score учитывает и precision и recall, что важно для задач с дисбалансом классов.

В качестве альтернативы можно использовать ROC-AUC, но F1 проще интерпретировать: он оценивает итоговые предсказания классов при фиксированном пороге, а не качество ранжирования вероятностей.

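## Бейзлайн (константное предсказание)

Бейзлайн всегда предсказывает самый частый класс обучающей выборки (0 — погиб), поэтому ни один выживший не находится и F1 для класса 1 равен 0.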

In [6]:
# Constant baseline: always predicts the most frequent label seen in training.
# fit() returns the estimator itself, so construction and fitting are chained.
baseline = DummyClassifier(strategy='most_frequent', random_state=42).fit(
    X_train, y_train
)
y_pred_baseline = baseline.predict(X_test)

# F1 of the constant prediction on the held-out split.
print(
    "F1-score бейзлайна:",
    f1_score(y_true=y_test, y_pred=y_pred_baseline),
)

F1-score бейзлайна: 0.0


## ML-модель (Logistic Regression)

In [7]:
# Continuous columns are standardized; the remaining columns are treated as
# categories and one-hot encoded.
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category value that was absent from the
        # training rows (plausible for rare SibSp / Parch counts) is encoded
        # as all zeros instead of raising an error at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Preprocessing and classifier are bundled so that the scaler and encoder are
# fitted only on the data passed to model.fit().
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

In [8]:
# Fit the whole pipeline on the training split, then predict the held-out split.
# Pipeline.fit() returns the pipeline, so the two calls are chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# F1 of the model on the held-out split (compare with the baseline value).
print(
    "F1-score модели:",
    f1_score(y_true=y_test, y_pred=y_pred),
)

F1-score модели: 0.7375886524822695


In [9]:
# Per-class precision / recall / F1 and support on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       105
           1       0.78      0.70      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

