In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the Titanic training data from CSV and preview the first 10 rows.
data = pd.read_csv('../data/titanic_train.csv')
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Use PassengerId as the row index instead of keeping it as a column.
# NOTE(review): the cell rebinds `data` from itself, so running it a second
# time will presumably fail because the 'PassengerId' column is gone — confirm.
data = data.set_index('PassengerId')
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# List the remaining column names now that PassengerId is the index.
data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Separate the label column from the feature columns.
target = 'Survived'
x = data.drop(columns=[target])
y = data[target]
# Column dtypes and non-null counts show where values are missing.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


## Изучение качества данных и их очистка

In [6]:
# Name and Ticket are not used as features;
# Cabin is dropped because most of its values are missing.
x = x.drop(columns=['Name', 'Ticket', 'Cabin'])
x

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,male,22.0,1,0,7.2500,S
2,1,female,38.0,1,0,71.2833,C
3,3,female,26.0,0,0,7.9250,S
4,1,female,35.0,1,0,53.1000,S
5,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
887,2,male,27.0,0,0,13.0000,S
888,1,female,19.0,0,0,30.0000,S
889,3,female,,1,2,23.4500,S
890,1,male,26.0,0,0,30.0000,C


In [7]:
# Age has missing values; they will be replaced with the column mean.
# NOTE(review): the mean is taken over all rows, before the train/validation
# split performed later in the notebook.
mean_age = x['Age'].mean()
mean_age

29.69911764705882

In [8]:
# Fill the missing ages with the mean and list the resulting distinct values.
x['Age'] = x['Age'].fillna(mean_age)
x['Age'].unique()

array([22.        , 38.        , 26.        , 35.        , 29.69911765,
       54.        ,  2.        , 27.        , 14.        ,  4.        ,
       58.        , 20.        , 39.        , 55.        , 31.        ,
       34.        , 15.        , 28.        ,  8.        , 19.        ,
       40.        , 66.        , 42.        , 21.        , 18.        ,
        3.        ,  7.        , 49.        , 29.        , 65.        ,
       28.5       ,  5.        , 11.        , 45.        , 17.        ,
       32.        , 16.        , 25.        ,  0.83      , 30.        ,
       33.        , 23.        , 24.        , 46.        , 59.        ,
       71.        , 37.        , 47.        , 14.5       , 70.5       ,
       32.5       , 12.        ,  9.        , 36.5       , 51.        ,
       55.5       , 40.5       , 44.        ,  1.        , 61.        ,
       56.        , 50.        , 36.        , 45.5       , 20.5       ,
       62.        , 41.        , 52.        , 63.        , 23.5 

In [9]:
# Embarked is missing two values (889 of 891 are non-null);
# they will be replaced with the mode. First, look at the category counts.
x['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
# Most frequent Embarked value; mode() returns a Series, so take its first entry.
embarked_mode = x['Embarked'].mode().iloc[0]
embarked_mode

'S'

In [11]:
# Replace the missing Embarked values with the mode and re-check the counts.
x['Embarked'] = x['Embarked'].fillna(embarked_mode)
x['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [12]:
# Re-check the non-null counts after the imputation steps.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 55.7+ KB


## Работа с категориями

In [13]:
# Sex: look at the distribution of values before encoding.
x['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [14]:
# Encode Sex as a binary indicator: 1 for 'female', 0 otherwise.
# The values are read from the untouched `data` frame (same PassengerId index
# as `x`) rather than from `x['Sex']` itself, so re-running this cell gives the
# same result instead of comparing already-encoded integers with 'female'
# and silently turning the whole column into zeros.
x['Sex'] = (data['Sex'] == 'female').astype(int)
x['Sex'].value_counts()

0    577
1    314
Name: Sex, dtype: int64

In [15]:
# One-hot encode Embarked.
# The column is named explicitly so that only Embarked is expanded; without
# `columns=` get_dummies would also expand any other object/category column
# that happens to be present in `x`.
x = pd.get_dummies(x, columns=['Embarked'])
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Sex         891 non-null    int32  
 2   Age         891 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   Embarked_C  891 non-null    uint8  
 7   Embarked_Q  891 non-null    uint8  
 8   Embarked_S  891 non-null    uint8  
dtypes: float64(2), int32(1), int64(3), uint8(3)
memory usage: 47.9 KB


In [16]:
# Preview the feature table after encoding Sex and Embarked.
x.head(10)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.25,0,0,1
2,1,1,38.0,1,0,71.2833,1,0,0
3,3,1,26.0,0,0,7.925,0,0,1
4,1,1,35.0,1,0,53.1,0,0,1
5,3,0,35.0,0,0,8.05,0,0,1
6,3,0,29.699118,0,0,8.4583,0,1,0
7,1,0,54.0,0,0,51.8625,0,0,1
8,3,0,2.0,3,1,21.075,0,0,1
9,3,1,27.0,0,2,11.1333,0,0,1
10,2,1,14.0,1,0,30.0708,1,0,0


In [17]:
# Pclass: inspect the distinct class values.
x['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [18]:
# One-hot encode Pclass.
# Naming the column in `columns=` makes get_dummies expand it even though it
# is stored as integers, so no in-place cast to 'category' is needed and no
# other column can be expanded by accident.
x = pd.get_dummies(x, columns=['Pclass'])
x.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Pclass_1', 'Pclass_2', 'Pclass_3'],
      dtype='object')

In [19]:
x.head(10)

Unnamed: 0_level_0,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,22.0,1,0,7.25,0,0,1,0,0,1
2,1,38.0,1,0,71.2833,1,0,0,1,0,0
3,1,26.0,0,0,7.925,0,0,1,0,0,1
4,1,35.0,1,0,53.1,0,0,1,1,0,0
5,0,35.0,0,0,8.05,0,0,1,0,0,1
6,0,29.699118,0,0,8.4583,0,1,0,0,0,1
7,0,54.0,0,0,51.8625,0,0,1,1,0,0
8,0,2.0,3,1,21.075,0,0,1,0,0,1
9,1,27.0,0,2,11.1333,0,0,1,0,0,1
10,1,14.0,1,0,30.0708,1,0,0,0,1,0


## Разбиение на тренировочный и валидационный датасеты

In [20]:
# Hold out 25% of the rows for validation; random_state fixes the shuffle
# so the split is reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.25, random_state=42)

## Построение модели

In [21]:
# Check the size and dtypes of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 299 to 103
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex         668 non-null    int32  
 1   Age         668 non-null    float64
 2   SibSp       668 non-null    int64  
 3   Parch       668 non-null    int64  
 4   Fare        668 non-null    float64
 5   Embarked_C  668 non-null    uint8  
 6   Embarked_Q  668 non-null    uint8  
 7   Embarked_S  668 non-null    uint8  
 8   Pclass_1    668 non-null    uint8  
 9   Pclass_2    668 non-null    uint8  
 10  Pclass_3    668 non-null    uint8  
dtypes: float64(2), int32(1), int64(2), uint8(6)
memory usage: 32.6 KB


In [22]:
# Fit a logistic regression classifier with default hyperparameters
# on the training split.
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [23]:
# Predict class labels for the validation split.
y_pred = lr.predict(x_valid)
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0], dtype=int64)

## Метрика - доля правильных ответов

In [24]:
# Accuracy: the share of validation rows whose predicted label equals the true label.
# `accuracy_score` is imported in the notebook's top import cell.
accuracy_score(y_valid, y_pred)

0.8026905829596412

## Вычисление вероятности событий

In [25]:
# The first column is the probability of class 0, the second of class 1.
y_proba = lr.predict_proba(x_valid)
y_proba

array([[0.88803847, 0.11196153],
       [0.7306994 , 0.2693006 ],
       [0.87015651, 0.12984349],
       [0.08991937, 0.91008063],
       [0.25062015, 0.74937985],
       [0.07826308, 0.92173692],
       [0.33446957, 0.66553043],
       [0.9052244 , 0.0947756 ],
       [0.24542473, 0.75457527],
       [0.10340605, 0.89659395],
       [0.69433675, 0.30566325],
       [0.93487212, 0.06512788],
       [0.62484707, 0.37515293],
       [0.84761359, 0.15238641],
       [0.75859308, 0.24140692],
       [0.07898047, 0.92101953],
       [0.72748705, 0.27251295],
       [0.3343849 , 0.6656151 ],
       [0.70247546, 0.29752454],
       [0.70651163, 0.29348837],
       [0.88386345, 0.11613655],
       [0.64286509, 0.35713491],
       [0.39821922, 0.60178078],
       [0.8694494 , 0.1305506 ],
       [0.89833287, 0.10166713],
       [0.92821101, 0.07178899],
       [0.56597441, 0.43402559],
       [0.72832999, 0.27167001],
       [0.91442869, 0.08557131],
       [0.42638414, 0.57361586],
       [0.

In [26]:
## Сохранение данных

In [27]:
# Persist the train and validation splits as pickle files under ../data.
splits = {
    '../data/titanic_x_train.pkl': x_train,
    '../data/titanic_y_train.pkl': y_train,
    '../data/titanic_x_valid.pkl': x_valid,
    '../data/titanic_y_valid.pkl': y_valid,
}
for path, part in splits.items():
    part.to_pickle(path)