# Подключение библиотек и загрузка данных

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import requests
from typing import List
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

EXPERIMENTS = 100

In [None]:
# Download the Titanic training set and cache it locally as data.csv.
# NOTE: despite its name, `data_url` holds the HTTP response object, not a URL.
data_url = requests.get(
    'https://raw.githubusercontent.com/sikoraaxd/Homework/main/datasets/titanic_train.csv',
    timeout=30,  # do not hang the notebook forever if the host is unreachable
)
# Fail loudly on 4xx/5xx instead of silently saving an error page as "CSV".
data_url.raise_for_status()

# Write the raw bytes: a text-mode write would re-encode the payload with the
# platform default encoding and translate newlines, which can corrupt or
# reject non-ASCII content on some systems.
with open('data.csv', 'wb') as f:
  f.write(data_url.content)

data = pd.read_csv('data.csv')
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Show the dtype of every column of the loaded frame.
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Предобработка данных: избавление от отсутствующих значений

In [None]:
# Impute missing values in place: numeric columns get their mean, every other
# column gets its most frequent value.
columns_with_null = data.columns[data.isnull().any()].tolist()
print('Столбцы, в которых отсутствуют значения', columns_with_null)
for column in columns_with_null:
  # Test for "numeric" explicitly rather than "not object": text columns may
  # carry a non-object dtype (dedicated string dtype, category), and calling
  # .mean() on them would raise.
  if pd.api.types.is_numeric_dtype(data[column]):
    fill_value = data[column].mean()
  else:
    # value_counts() skips NaN, so idxmax() is the most common real value.
    fill_value = data[column].value_counts().idxmax()
  data[column] = data[column].fillna(fill_value)

data

Столбцы, в которых отсутствуют значения ['Age', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


# Предобработка данных: извлечение фактов из столбца "Name"

In [None]:
# Collect every distinct title found in the "Name" column. The title is the
# text between the first comma and the following period,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
titles = {
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in data['Name']
}
print(titles)

{'Sir', 'Mrs', 'Capt', 'Miss', 'Don', 'Mme', 'Lady', 'Mlle', 'Dr', 'Mr', 'the Countess', 'Master', 'Ms', 'Col', 'Major', 'Jonkheer', 'Rev'}


In [None]:
def _extract_title(full_name):
  """Return the title part of a name formatted as "Surname, Title. Given names"."""
  after_surname = full_name.split(',')[1]
  return after_surname.split('.')[0].strip()


# Store the extracted title as a new "Title" column.
data['Title'] = data['Name'].map(_extract_title)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,B96 B98,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,B96 B98,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,B96 B98,S,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,B96 B98,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,Mr


**Категориальные данные**

In [None]:
# Columns treated as categorical in this notebook.
categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Title']
categorical = data.loc[:, categorical_columns]
categorical

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Title
0,3,male,1,0,S,Mr
1,1,female,1,0,C,Mrs
2,3,female,0,0,S,Miss
3,1,female,1,0,S,Mrs
4,3,male,0,0,S,Mr
...,...,...,...,...,...,...
886,2,male,0,0,S,Rev
887,1,female,0,0,S,Miss
888,3,female,1,2,S,Miss
889,1,male,0,0,C,Mr


**Числовые данные**

In [None]:
# Columns treated as continuous numeric features.
numerical_columns = ['Age', 'Fare']
numerical = data.loc[:, numerical_columns]
numerical

Unnamed: 0,Age,Fare
0,22.000000,7.2500
1,38.000000,71.2833
2,26.000000,7.9250
3,35.000000,53.1000
4,35.000000,8.0500
...,...,...
886,27.000000,13.0000
887,19.000000,30.0000
888,29.699118,23.4500
889,26.000000,30.0000


# Функции One-Hot Encoding и softmax

In [None]:
def OneHotEncoding(df: pd.DataFrame, column_names: List[str]) -> pd.DataFrame:
  """Replace each listed column with one 0.0/1.0 indicator column per value.

  Args:
    df: Source frame. It is not modified; a new frame is returned.
    column_names: Names of the columns to encode.

  Returns:
    A frame in which every column from ``column_names`` has been dropped and
    replaced by float columns named ``"<column>_<value>"``, appended on the
    right in order of first appearance of each value. The row index of ``df``
    is preserved.
  """
  for column_name in column_names:
    column = df[column_name]
    unique_values = column.unique()
    one_hot_encoded = np.zeros((len(column), len(unique_values)))

    for i, value in enumerate(unique_values):
      one_hot_encoded[:, i] = column == value

    # Reuse df's index: with a default RangeIndex the concat below would align
    # on labels and produce extra NaN-filled rows whenever df is not indexed
    # 0..n-1 (e.g. after filtering or shuffling).
    one_hot_encoded_df = pd.DataFrame(
        one_hot_encoded,
        columns=[f"{column_name}_{value}" for value in unique_values],
        index=df.index,
    )

    df = pd.concat([df, one_hot_encoded_df], axis=1).drop(column_name, axis=1)
  return df


def Softmax(data):
  """Apply a row-wise softmax to a 2-D array of scores.

  Args:
    data: Array-like of shape (n_rows, n_scores).

  Returns:
    A float ndarray of the same shape whose rows are non-negative and sum to 1.
  """
  data = np.asarray(data, dtype=float)
  # Subtracting the row maximum leaves the result mathematically unchanged but
  # keeps exp() from overflowing to inf (and the ratio from becoming NaN).
  shifted = data - np.max(data, axis=1, keepdims=True)
  exponents = np.exp(shifted)
  return exponents / np.sum(exponents, axis=1, keepdims=True)

# Предобработка данных: кодирование категориальных столбцов методом One-Hot Encoding

In [None]:
# One-hot encode the text-valued categorical columns of the cleaned frame.
columns_to_encode = ['Sex', 'Embarked', 'Title']
data_encoded = OneHotEncoding(df=data, column_names=columns_to_encode)
data_encoded

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Mme,Title_Ms,Title_Major,Title_Lady,Title_Sir,Title_Mlle,Title_Col,Title_Capt,Title_the Countess,Title_Jonkheer
0,1,0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.000000,0,0,211536,13.0000,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,888,1,1,"Graham, Miss. Margaret Edith",19.000000,0,0,112053,30.0000,B42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",29.699118,1,2,W./C. 6607,23.4500,B96 B98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,890,1,1,"Behr, Mr. Karl Howell",26.000000,0,0,111369,30.0000,C148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Предобработка данных: избавление от столбцов со строковыми данными и разделение выборки на features и target

In [None]:
# The target and the remaining text columns are kept out of the feature matrix.
# NOTE(review): PassengerId is still used as a feature; confirm this is intended.
exclude_columns = ['Survived', 'Name', 'Ticket', 'Cabin']
# Index.difference yields the remaining column names in sorted order.
X_columns = data_encoded.columns.difference(exclude_columns)
print('Features для обучения:\n', X_columns.tolist())

X = data_encoded[X_columns]
y = data['Survived']

Features для обучения:
 ['Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'Sex_female', 'Sex_male', 'SibSp', 'Title_Capt', 'Title_Col', 'Title_Don', 'Title_Dr', 'Title_Jonkheer', 'Title_Lady', 'Title_Major', 'Title_Master', 'Title_Miss', 'Title_Mlle', 'Title_Mme', 'Title_Mr', 'Title_Mrs', 'Title_Ms', 'Title_Rev', 'Title_Sir', 'Title_the Countess']


In [None]:
def data_split(X, y):
  """Shuffle and split features/target into a 60% train part and the rest.

  No random seed is fixed, so every call produces a different split.

  Returns:
    A tuple ``(X_train, X_test, y_train, y_test)``.
  """
  return tuple(train_test_split(X, y, train_size=0.6, shuffle=True))

# Не предобработанные данные

In [None]:
# Test-set accuracy of each run on the unscaled feature matrix.
non_scaled_accuracy = []

In [None]:
# Repeat train/evaluate on fresh random splits and record the test accuracy
# of every run.
for experiment in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model1 = LogisticRegression()
  model1.fit(X_train, y_train)
  # predict_log_proba returns log-probabilities; Softmax maps them back to
  # class probabilities.
  y_log_proba = model1.predict_log_proba(X_test)
  # Take argmax on the unrounded probabilities: rounding to 2 decimals first
  # turns near-ties (e.g. 0.496 / 0.504) into exact ties, which argmax then
  # always resolves to class 0. The labels are 0/1, so the argmax index is
  # the predicted label.
  predicts = Softmax(y_log_proba).argmax(axis=1)
  # accuracy_score expects (y_true, y_pred).
  accuracy = accuracy_score(y_test, predicts)
  non_scaled_accuracy.append(accuracy)

Предобработанные данные

# Предобработанные данные

In [None]:
# Preprocessing chain: standardise each feature, then rescale it with MinMaxScaler.
scaling_steps = [
  ('scaling', StandardScaler()),
  ('normalizing', MinMaxScaler()),
]
pipeline = Pipeline(scaling_steps)

# NOTE(review): the scalers are fitted on the full X before any train/test
# split, so test rows influence the scaling statistics. X is also overwritten
# here, so re-running earlier cells afterwards would see the scaled matrix.
X = pipeline.fit_transform(X)

In [None]:
# Test-set accuracy of each run on the scaled feature matrix.
scaled_accuracy = []

In [None]:
# Repeat train/evaluate on fresh random splits of the scaled features and
# record the test accuracy of every run.
for experiment in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model2 = LogisticRegression()
  model2.fit(X_train, y_train)
  # predict_log_proba returns log-probabilities; Softmax maps them back to
  # class probabilities.
  y_log_proba = model2.predict_log_proba(X_test)
  # Take argmax on the unrounded probabilities: rounding to 2 decimals first
  # turns near-ties (e.g. 0.496 / 0.504) into exact ties, which argmax then
  # always resolves to class 0. The labels are 0/1, so the argmax index is
  # the predicted label.
  predicts = Softmax(y_log_proba).argmax(axis=1)
  # accuracy_score expects (y_true, y_pred).
  accuracy = accuracy_score(y_test, predicts)
  scaled_accuracy.append(accuracy)

# Числовые данные

In [None]:
# Use only the numeric columns as the feature matrix for the third experiment.
X = numerical.to_numpy()

In [None]:
# Test-set accuracy of each run on the numeric-only feature matrix.
numeric_accuracy = []

In [None]:
# Repeat train/evaluate on fresh random splits of the numeric-only features
# and record the test accuracy of every run.
for experiment in range(EXPERIMENTS):
  X_train, X_test, y_train, y_test = data_split(X, y)

  model3 = LogisticRegression()
  model3.fit(X_train, y_train)
  # predict_log_proba returns log-probabilities; Softmax maps them back to
  # class probabilities.
  y_log_proba = model3.predict_log_proba(X_test)
  # Take argmax on the unrounded probabilities: rounding to 2 decimals first
  # turns near-ties (e.g. 0.496 / 0.504) into exact ties, which argmax then
  # always resolves to class 0. The labels are 0/1, so the argmax index is
  # the predicted label.
  predicts = Softmax(y_log_proba).argmax(axis=1)
  # accuracy_score expects (y_true, y_pred).
  accuracy = accuracy_score(y_test, predicts)
  numeric_accuracy.append(accuracy)

# Результаты

In [None]:
# Gather the per-run accuracies of the three experiments into one table.
# A dedicated name is used instead of `data` so the Titanic DataFrame is not
# overwritten and earlier cells stay re-runnable.
results = {
    'Не предобработанные': non_scaled_accuracy,
    'Обработанные': scaled_accuracy,
    'Числовые': numeric_accuracy,
    # Run index; tied to EXPERIMENTS so the column lengths keep matching when
    # the number of runs is changed.
    'x': np.arange(EXPERIMENTS),
}

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Не предобработанные,Обработанные,Числовые,x
0,0.778711,0.854342,0.635854,0
1,0.834734,0.817927,0.661064,1
2,0.831933,0.820728,0.647059,2
3,0.795518,0.834734,0.672269,3
4,0.829132,0.837535,0.619048,4
...,...,...,...,...
95,0.823529,0.789916,0.672269,95
96,0.795518,0.806723,0.635854,96
97,0.817927,0.795518,0.672269,97
98,0.798319,0.809524,0.658263,98


In [None]:
fig = go.Figure()

# One trace per accuracy series; the 'x' column only supplies the shared
# horizontal axis (run number).
series_names = [name for name in results_df.columns if name != 'x']
for series_name in series_names:
    trace = go.Scatter(
        x=results_df['x'],
        y=results_df[series_name],
        name=series_name,
    )
    fig.add_trace(trace)

fig.update_layout(xaxis_title='x', yaxis_title='accuracy')

fig.show()