# Подключение библиотек и загрузка данных

In [2]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import requests
from typing import List
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
# Suppress every warning for the rest of the session. No category is given,
# so the 'ignore' filter applies to all of them.
warnings.filterwarnings('ignore')

# Number of iterations each experiment loop below runs (range(EXPERIMENTS)).
EXPERIMENTS = 100

In [3]:
# Download the Titanic training set, cache it as data.csv and load it.
# NOTE: despite its name, `data_url` holds the HTTP response, not a URL.
data_url = requests.get(
  'https://raw.githubusercontent.com/sikoraaxd/Homework/main/datasets/titanic_train.csv',
  timeout=30,  # do not hang forever if the host is unreachable
)
# Fail loudly on an HTTP error instead of saving an error page as data.csv.
data_url.raise_for_status()
# Write the raw bytes: the file then stays byte-for-byte what the server sent,
# independent of the platform's default text encoding and newline translation.
with open('data.csv', 'wb') as f:
  f.write(data_url.content)

data = pd.read_csv('data.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Display the dtype of every column of the loaded frame.
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Предобработка данных: избавление от отсутствующих значений

In [5]:
# Impute every column that contains missing values:
#   * non-object columns get their mean,
#   * object columns get their most frequent value.
columns_with_null = [name for name in data.columns if data[name].isnull().any()]
print('Столбцы, в которых отсутствуют значения', columns_with_null)
for column in columns_with_null:
  if data[column].dtype != object:
    fill_value = data[column].mean()
  else:
    fill_value = data[column].value_counts().idxmax()
  data[column] = data[column].fillna(fill_value)

data

Столбцы, в которых отсутствуют значения ['Age', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


# Предобработка данных: извлечение фактов из столбца "Name"

In [6]:
# Collect the distinct titles found in the "Name" column: the segment after
# the first comma, cut at its first period and stripped of whitespace.
titles = {name.split(',')[1].split('.')[0].strip() for name in data['Name']}
print(titles)

{'Dr', 'Mlle', 'Mme', 'Col', 'Miss', 'Ms', 'Major', 'Don', 'Rev', 'Sir', 'Mrs', 'Jonkheer', 'Lady', 'Mr', 'the Countess', 'Capt', 'Master'}


In [7]:
def _extract_title(name):
  """Return the segment after the first comma of *name*, cut at its first period and stripped."""
  after_comma = name.split(',')[1]
  return after_comma.split('.')[0].strip()


# Store the extracted title of every passenger in a new "Title" column.
data['Title'] = data['Name'].map(_extract_title)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,B96 B98,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,B96 B98,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,B96 B98,S,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,B96 B98,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,Mr


**Категориальные данные**

In [8]:
# Columns this notebook treats as categorical.
categorical_columns = ['Pclass', 'Sex', 'Embarked', 'Title']
categorical = data[categorical_columns]
categorical

Unnamed: 0,Pclass,Sex,Embarked,Title
0,3,male,S,Mr
1,1,female,C,Mrs
2,3,female,S,Miss
3,1,female,S,Mrs
4,3,male,S,Mr
...,...,...,...,...
886,2,male,S,Rev
887,1,female,S,Miss
888,3,female,S,Miss
889,1,male,C,Mr


**Числовые данные**

In [9]:
# Columns this notebook treats as numerical.
numerical_columns = ['Age', 'Fare', 'SibSp', 'Parch']
numerical = data[numerical_columns]
numerical

Unnamed: 0,Age,Fare,SibSp,Parch
0,22.000000,7.2500,1,0
1,38.000000,71.2833,1,0
2,26.000000,7.9250,0,0
3,35.000000,53.1000,1,0
4,35.000000,8.0500,0,0
...,...,...,...,...
886,27.000000,13.0000,0,0
887,19.000000,30.0000,0,0
888,29.699118,23.4500,1,2
889,26.000000,30.0000,0,0


# Функции One-Hot Encoding и softmax

In [10]:
def OneHotEncoding(df: pd.DataFrame, column_names: List[str]) -> pd.DataFrame:
  """Replace each listed column with one 0.0/1.0 indicator column per distinct value.

  The indicator columns are named ``"<column>_<value>"`` and appended after the
  remaining columns, in order of first appearance of each value. The input
  frame is not modified; a new frame is returned.

  Args:
    df: Source frame.
    column_names: Names of the columns to encode; each must exist in ``df``.

  Returns:
    A frame with the same rows and index as ``df`` in which every listed
    column has been replaced by its float indicator columns.

  Note:
    Values are matched with ``==``, so a missing value (NaN) produces an
    indicator column that is all zeros.
  """
  for column_name in column_names:
    column = df[column_name]
    unique_values = column.unique()
    indicators = np.zeros((len(column), len(unique_values)))

    for position, value in enumerate(unique_values):
      indicators[:, position] = (column == value).to_numpy()

    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows and inject NaN whenever df has been
    # filtered, shuffled or otherwise carries a non-default index.
    encoded = pd.DataFrame(
      indicators,
      columns=[f"{column_name}_{value}" for value in unique_values],
      index=df.index,
    )

    df = pd.concat([df.drop(column_name, axis=1), encoded], axis=1)
  return df


def Softmax(data):
  """Apply a row-wise softmax to a 2-D array of scores.

  Args:
    data: 2-D array-like of shape (n_samples, n_classes).

  Returns:
    ``numpy.ndarray`` of the same shape whose rows are non-negative and sum to 1.
  """
  scores = np.asarray(data, dtype=float)
  # Subtracting the row maximum leaves the result mathematically unchanged but
  # keeps np.exp from overflowing to inf (which produced inf / inf = nan).
  # `initial` keeps the reduction valid for rows with zero columns.
  row_max = scores.max(axis=1, keepdims=True, initial=-np.inf)
  exponentials = np.exp(scores - row_max)
  return exponentials / exponentials.sum(axis=1, keepdims=True)

# Предобработка данных: кодирование категориальных столбцов методом One-Hot Encoding

In [11]:
# One-hot encode the string-valued categorical columns of the dataset.
data_encoded = OneHotEncoding(data, ['Sex', 'Embarked', 'Title'])
data_encoded

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Mme,Title_Ms,Title_Major,Title_Lady,Title_Sir,Title_Mlle,Title_Col,Title_Capt,Title_the Countess,Title_Jonkheer
0,1,0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.000000,0,0,211536,13.0000,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,888,1,1,"Graham, Miss. Margaret Edith",19.000000,0,0,112053,30.0000,B42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",29.699118,1,2,W./C. 6607,23.4500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,890,1,1,"Behr, Mr. Karl Howell",26.000000,0,0,111369,30.0000,C148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Предобработка данных: избавление от столбцов со строковыми данными и разделение выборки на features и target

In [12]:
# Every encoded column except the target and the remaining string columns
# becomes a feature.
exclude_columns = ['Survived', 'Name', 'Ticket', 'Cabin']
X_columns = data_encoded.columns.difference(exclude_columns)
print('Features для обучения:\n', X_columns.tolist())

# NOTE(review): 'PassengerId' is not excluded, so a row identifier is used as
# a feature — confirm this is intended.
X = data_encoded[X_columns]
y = data['Survived']

Features для обучения:
 ['Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'Sex_female', 'Sex_male', 'SibSp', 'Title_Capt', 'Title_Col', 'Title_Don', 'Title_Dr', 'Title_Jonkheer', 'Title_Lady', 'Title_Major', 'Title_Master', 'Title_Miss', 'Title_Mlle', 'Title_Mme', 'Title_Mr', 'Title_Mrs', 'Title_Ms', 'Title_Rev', 'Title_Sir', 'Title_the Countess']


In [13]:
def data_split(X, y, train_size=0.6, random_state=None):
  """Shuffle the samples and split them into training and test parts.

  Args:
    X: Feature matrix, passed through to ``train_test_split``.
    y: Target values, passed through to ``train_test_split``.
    train_size: Share of the samples placed in the training part. Defaults
      to 0.6, the value that used to be hard-coded.
    random_state: Optional seed forwarded to ``train_test_split`` so a split
      can be reproduced. ``None`` (default) keeps the previous unseeded
      behaviour.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``.
  """
  X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=train_size,
    shuffle=True,
    random_state=random_state,
  )
  return X_train, X_test, y_train, y_test

# Не предобработанные данные

In [14]:
# Test accuracy of each run on the unscaled features; filled by the loop below.
non_scaled_accuracy = []

In [15]:
# Train and evaluate a fresh logistic regression EXPERIMENTS times, each on a
# new random split, and record the test accuracy of every run.
for _ in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model1 = LogisticRegression()
  model1.fit(X_train, y_train)
  y_logits = model1.predict_log_proba(X_test)
  # Softmax of log-probabilities gives the class probabilities back. They are
  # deliberately not rounded: rounding to 2 decimals turned near-0.5 pairs
  # into exact ties, which argmax always resolved in favour of column 0.
  probabilities = Softmax(y_logits)
  # argmax yields column indices; map them to the labels the model was fitted on.
  predicts = model1.classes_[probabilities.argmax(axis=1)]
  # accuracy_score takes (y_true, y_pred).
  accuracy = accuracy_score(y_test, predicts)
  non_scaled_accuracy.append(accuracy)

Предобработанные данные

# Предобработанные данные

In [16]:
# Preprocessing chain: standardise each feature, then rescale it with MinMaxScaler.
# NOTE(review): the scalers are fitted on the full X before the train/test
# split done in the experiment loop, so test rows influence the scaling.
pipeline = Pipeline(steps=[
  ('scaling', StandardScaler()),
  ('normalizing', MinMaxScaler()),
])

# X is overwritten with the transformed features used by the next experiment.
X = pipeline.fit_transform(X)

In [17]:
# Test accuracy of each run on the scaled features; filled by the loop below.
scaled_accuracy = []

In [18]:
# Train and evaluate a fresh logistic regression EXPERIMENTS times, each on a
# new random split of the scaled features, and record every test accuracy.
for _ in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model2 = LogisticRegression()
  model2.fit(X_train, y_train)
  y_logits = model2.predict_log_proba(X_test)
  # Softmax of log-probabilities gives the class probabilities back. They are
  # deliberately not rounded: rounding to 2 decimals turned near-0.5 pairs
  # into exact ties, which argmax always resolved in favour of column 0.
  probabilities = Softmax(y_logits)
  # argmax yields column indices; map them to the labels the model was fitted on.
  predicts = model2.classes_[probabilities.argmax(axis=1)]
  # accuracy_score takes (y_true, y_pred).
  accuracy = accuracy_score(y_test, predicts)
  scaled_accuracy.append(accuracy)

# Числовые данные

In [19]:
# Switch the feature matrix to the numerical columns only, as a NumPy array.
X = numerical.values

In [20]:
# Test accuracy of each run on the numerical-only features; filled by the loop below.
numeric_accuracy = []

In [21]:
# Train and evaluate a fresh logistic regression EXPERIMENTS times, each on a
# new random split of the numerical-only features, and record every test accuracy.
for _ in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model3 = LogisticRegression()
  model3.fit(X_train, y_train)
  y_logits = model3.predict_log_proba(X_test)
  # Softmax of log-probabilities gives the class probabilities back. They are
  # deliberately not rounded: rounding to 2 decimals turned near-0.5 pairs
  # into exact ties, which argmax always resolved in favour of column 0.
  probabilities = Softmax(y_logits)
  # argmax yields column indices; map them to the labels the model was fitted on.
  predicts = model3.classes_[probabilities.argmax(axis=1)]
  # accuracy_score takes (y_true, y_pred).
  accuracy = accuracy_score(y_test, predicts)
  numeric_accuracy.append(accuracy)

# Результаты

In [22]:
# Gather the per-run accuracies of the three experiments into one table.
# The dict gets its own name so the `data` DataFrame loaded earlier is not
# overwritten, and the run index follows EXPERIMENTS instead of a literal 100,
# so it stays in step with the loops when the constant changes.
results = {
    'Не предобработанные': non_scaled_accuracy,
    'Обработанные': scaled_accuracy,
    'Числовые': numeric_accuracy,
    'x': np.arange(EXPERIMENTS),
}

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Не предобработанные,Обработанные,Числовые,x
0,0.829132,0.851541,0.694678,0
1,0.817927,0.843137,0.635854,1
2,0.834734,0.798319,0.697479,2
3,0.820728,0.809524,0.677871,3
4,0.798319,0.820728,0.669468,4
...,...,...,...,...
95,0.817927,0.834734,0.711485,95
96,0.823529,0.837535,0.700280,96
97,0.801120,0.857143,0.669468,97
98,0.801120,0.823529,0.700280,98


In [23]:
# Plot one accuracy line per experiment against the run index stored in 'x'.
fig = go.Figure()

for series_name in results_df.columns:
    if series_name == 'x':
        continue  # 'x' is the horizontal axis, not a series
    trace = go.Scatter(
        x=results_df['x'],
        y=results_df[series_name],
        name=series_name,
    )
    fig.add_trace(trace)

fig.update_layout(xaxis_title='x', yaxis_title='accuracy')

fig.show()