In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
import os
import sys
sys.path.append(os.path.abspath('../modules'))
from MLP import MLP

# Датасет «Titanic»

## Подготовка данных

Импортируем датасет.

In [3]:
titanic_df = pd.read_csv('../data/titanic.csv')

In [4]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Заполняем пропуски средними значениями.

In [6]:
titanic_df = titanic_df.fillna(titanic_df.mean())

Отбрасываем ненужные признаки.

In [7]:
titanic_df = titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

Перекодируем признаки.

In [8]:
mapping = {'male': 0,
           'female': 1 }

titanic_df['Sex'] = titanic_df['Sex'].map(mapping)
titanic_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.000000,1,0,7.2500,S
1,1,1,1,38.000000,1,0,71.2833,C
2,1,3,1,26.000000,0,0,7.9250,S
3,1,1,1,35.000000,1,0,53.1000,S
4,0,3,0,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,S
887,1,1,1,19.000000,0,0,30.0000,S
888,0,3,1,29.699118,1,2,23.4500,S
889,1,1,0,26.000000,0,0,30.0000,C


In [9]:
titanic_df = pd.get_dummies(titanic_df, prefix=['c'], columns=['Embarked'])
titanic_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,c_C,c_Q,c_S
0,0,3,0,22.000000,1,0,7.2500,0,0,1
1,1,1,1,38.000000,1,0,71.2833,1,0,0
2,1,3,1,26.000000,0,0,7.9250,0,0,1
3,1,1,1,35.000000,1,0,53.1000,0,0,1
4,0,3,0,35.000000,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,0,0,1
887,1,1,1,19.000000,0,0,30.0000,0,0,1
888,0,3,1,29.699118,1,2,23.4500,0,0,1
889,1,1,0,26.000000,0,0,30.0000,1,0,0


In [10]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   c_C       891 non-null    uint8  
 8   c_Q       891 non-null    uint8  
 9   c_S       891 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 51.5 KB


In [11]:
X = titanic_df.iloc[:, 1:10].values

In [12]:
y = titanic_df.iloc[:, 0].values

Разбиваем исходные данные на обучающую и контрольную выборки. Обучающая выборка составляет 80% от объема исходных данных, а контрольная составляет, соответственно, 20%.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Объявляем *StandardScaler* для нормализации данных.

In [14]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Классификация с помощью модели MLP

In [15]:
mlp = MLP(X_train.shape[1], len(np.unique(titanic_df)))
mlp.fit(X_train, y_train)

In [16]:
y_pred = []
for i in range(len(X_test)):
    y_pred.append(np.argmax(mlp.predict(X_test[i])))

In [17]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[86 18]
 [32 43]]
              precision    recall  f1-score   support

           0       0.73      0.83      0.77       104
           1       0.70      0.57      0.63        75

    accuracy                           0.72       179
   macro avg       0.72      0.70      0.70       179
weighted avg       0.72      0.72      0.72       179



# Датасет «Iris»

## Подготовка данных

Импортируем датасет.

In [18]:
iris_df = datasets.load_iris()

In [19]:
X = iris_df.data
y = iris_df.target

Разбиваем исходные данные на обучающую и контрольную выборки. Обучающая выборка составляет 80% от объема исходных данных, а контрольная составляет, соответственно, 20%.

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Объявляем *StandardScaler* для нормализации данных.

In [21]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Классификация с помощью модели MLP

In [22]:
mlp = MLP(X_train.shape[1], len(np.unique(iris_df.target)))
mlp.fit(X_train, y_train)

In [23]:
y_pred = []
for i in range(len(X_test)):
    y_pred.append(np.argmax(mlp.predict(X_test[i])))

In [24]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[12  0  0]
 [ 0 11  1]
 [ 0  2  4]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.85      0.92      0.88        12
           2       0.80      0.67      0.73         6

    accuracy                           0.90        30
   macro avg       0.88      0.86      0.87        30
weighted avg       0.90      0.90      0.90        30



# Датасет «Balance Scale»

## Подготовка данных 

Импортируем датасет.

In [25]:
balance_scale_df = pd.read_csv('../data/balance_scale_preprocessed.csv')
balance_scale_df.head()

Unnamed: 0,Class_Name,Left-Weight,Left-Distance,Right-Weight,Right-Distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [26]:
balance_scale_df['Class_Name'] = LabelEncoder().fit_transform(balance_scale_df['Class_Name'].tolist())

In [27]:
X = balance_scale_df.iloc[:, 1:5].values
print(X)

[[1 1 1 1]
 [1 1 1 2]
 [1 1 1 3]
 ...
 [5 5 5 3]
 [5 5 5 4]
 [5 5 5 5]]


In [28]:
y = balance_scale_df.iloc[:, 0].values

Разбиваем исходные данные на обучающую и контрольную выборки. Обучающая выборка составляет 80% от объема исходных данных, а контрольная составляет, соответственно, 20%.

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Объявляем *StandardScaler* для нормализации данных.

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Модель MLP

In [31]:
mlp = MLP(X_train.shape[1], len(np.unique(balance_scale_df)))
mlp.fit(X_train, y_train)

In [32]:
y_pred = []
for i in range(len(X_test)):
    y_pred.append(np.argmax(mlp.predict(X_test[i])))

In [33]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 7  1  1]
 [ 0 60  0]
 [ 0  0 56]]
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.98      1.00      0.99        60
           2       0.98      1.00      0.99        56

    accuracy                           0.98       125
   macro avg       0.99      0.93      0.95       125
weighted avg       0.98      0.98      0.98       125



**Вывод**: реализованная модель протестирована на рекомендуемых датасетах, качество классификации довольно неплохое, но хуже, чем при использовании базовых моделей.