In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
import os
import sys
sys.path.append(os.path.abspath('../modules'))
from MLP import MLP, Sigmoid, Relu 

# Датасет «Titanic»

## Подготовка данных

Импортируем датасет.

In [3]:
titanic_df = pd.read_csv('../data/titanic.csv')

In [4]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Заполняем пропуски средними значениями.

In [6]:
titanic_df = titanic_df.fillna(titanic_df.mean())

Отбрасываем ненужные признаки.

In [7]:
titanic_df = titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

Перекодируем признаки.

In [8]:
mapping = {'male': 0,
           'female': 1 }

titanic_df['Sex'] = titanic_df['Sex'].map(mapping)
titanic_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.000000,1,0,7.2500,S
1,1,1,1,38.000000,1,0,71.2833,C
2,1,3,1,26.000000,0,0,7.9250,S
3,1,1,1,35.000000,1,0,53.1000,S
4,0,3,0,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,S
887,1,1,1,19.000000,0,0,30.0000,S
888,0,3,1,29.699118,1,2,23.4500,S
889,1,1,0,26.000000,0,0,30.0000,C


In [9]:
titanic_df = pd.get_dummies(titanic_df, prefix=['c'], columns=['Embarked'])
titanic_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,c_C,c_Q,c_S
0,0,3,0,22.000000,1,0,7.2500,0,0,1
1,1,1,1,38.000000,1,0,71.2833,1,0,0
2,1,3,1,26.000000,0,0,7.9250,0,0,1
3,1,1,1,35.000000,1,0,53.1000,0,0,1
4,0,3,0,35.000000,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,0,0,1
887,1,1,1,19.000000,0,0,30.0000,0,0,1
888,0,3,1,29.699118,1,2,23.4500,0,0,1
889,1,1,0,26.000000,0,0,30.0000,1,0,0


In [10]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   c_C       891 non-null    uint8  
 8   c_Q       891 non-null    uint8  
 9   c_S       891 non-null    uint8  
dtypes: float64(2), int64(5), uint8(3)
memory usage: 51.5 KB


In [11]:
X = titanic_df.iloc[:, 1:10].values

In [12]:
y = titanic_df.iloc[:, 0].values

Разбиваем исходные данные на обучающую и контрольную выборки. Обучающая выборка составляет 80% от объема исходных данных, а контрольная составляет, соответственно, 20%.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Объявляем *StandardScaler* для нормализации данных.

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Классификация с помощью модели MLP

In [15]:
mlp = MLP([3, 5, 3, 1], [Sigmoid, Sigmoid, Sigmoid, Sigmoid])
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

In [16]:
titanic_classes = []
for pred in y_pred:
    if pred >= 0.5:
        titanic_classes.append(1)
    else:
        titanic_classes.append(0)

In [17]:
print(confusion_matrix(y_test, titanic_classes))
print(classification_report(y_test, titanic_classes))

[[92  7]
 [22 58]]
              precision    recall  f1-score   support

           0       0.81      0.93      0.86        99
           1       0.89      0.72      0.80        80

    accuracy                           0.84       179
   macro avg       0.85      0.83      0.83       179
weighted avg       0.85      0.84      0.84       179



# Датасет «Iris»

## Подготовка данных

Импортируем датасет.

In [18]:
iris_df = datasets.load_iris()

In [19]:
X = iris_df.data
y = iris_df.target

Разбиваем исходные данные на обучающую и контрольную выборки. Обучающая выборка составляет 80% от объема исходных данных, а контрольная составляет, соответственно, 20%.

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Объявляем *StandardScaler* для нормализации данных.

In [21]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Классификация с помощью модели MLP

In [22]:
mlp = MLP([4, 10, 10, 1], [Sigmoid, Sigmoid, Sigmoid, Sigmoid])
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

In [23]:
iris_classes = []
for pred in y_pred:
    if pred < 0.5:
        iris_classes.append(0)
    elif pred <=1.5:
        iris_classes.append(1)
    elif pred <= 2.5:
        iris_classes.append(2)

In [24]:
print(confusion_matrix(y_test, iris_classes))
print(classification_report(y_test, iris_classes))

[[12  0  0]
 [ 0  6  1]
 [ 0  1 10]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.86      0.86      0.86         7
           2       0.91      0.91      0.91        11

    accuracy                           0.93        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.93      0.93      0.93        30



# Датасет «Balance Scale»

## Подготовка данных 

Импортируем датасет.

In [25]:
balance_scale_df = pd.read_csv('../data/balance_scale_preprocessed.csv')
balance_scale_df.head()

Unnamed: 0,Class_Name,Left-Weight,Left-Distance,Right-Weight,Right-Distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [26]:
balance_scale_df['Class_Name'] = LabelEncoder().fit_transform(balance_scale_df['Class_Name'].tolist())

In [27]:
X = balance_scale_df.iloc[:, 1:5].values
print(X)

[[1 1 1 1]
 [1 1 1 2]
 [1 1 1 3]
 ...
 [5 5 5 3]
 [5 5 5 4]
 [5 5 5 5]]


In [28]:
y = balance_scale_df.iloc[:, 0].values

Разбиваем исходные данные на обучающую и контрольную выборки. Обучающая выборка составляет 80% от объема исходных данных, а контрольная составляет, соответственно, 20%.

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Объявляем *StandardScaler* для нормализации данных.

In [30]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Модель MLP

In [31]:
mlp = MLP([14, 16, 16, 1], [Sigmoid, Sigmoid, Sigmoid, Sigmoid])
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

In [32]:
balance_classes = []
for pred in y_pred:
    if pred < 0.5:
        balance_classes.append(0)
    elif pred <=1.5:
        balance_classes.append(1)
    elif pred <= 2.5:
        balance_classes.append(2)

In [33]:
print(confusion_matrix(y_test, balance_classes))
print(classification_report(y_test, balance_classes))

[[10  2  0]
 [ 1 52  0]
 [ 2  0 58]]
              precision    recall  f1-score   support

           0       0.77      0.83      0.80        12
           1       0.96      0.98      0.97        53
           2       1.00      0.97      0.98        60

    accuracy                           0.96       125
   macro avg       0.91      0.93      0.92       125
weighted avg       0.96      0.96      0.96       125



# Датасет «Computer Hardware»

## Подготовка данных 

Импортируем датасет.

In [34]:
machine_df = pd.read_csv('../data/machine_preprocessed.csv')
machine_df.head()

Unnamed: 0,Vendor_Name,Model_Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


In [35]:
X = machine_df.iloc[:,2:9].values
print(X)

[[  125   256  6000 ...    16   128   198]
 [   29  8000 32000 ...     8    32   269]
 [   29  8000 32000 ...     8    32   220]
 ...
 [  125  2000  8000 ...     2    14    52]
 [  480   512  8000 ...     0     0    67]
 [  480  1000  4000 ...     0     0    45]]


In [36]:
y = machine_df.iloc[:,-1].values
print(y)

[ 199  253  253  253  132  290  381  381  749 1238   23   24   70  117
   15   64   23   29   22  124   35   39   40   45   28   21   28   22
   28   27  102  102   74   74  138  136   23   29   44   30   41   74
   74   74   54   41   18   28   36   38   34   19   72   36   30   56
   42   34   34   34   34   34   19   75  113  157   18   20   28   33
   47   54   20   23   25   52   27   50   18   53   23   30   73   20
   25   28   29   32  175   57  181  181   32   82  171  361  350  220
  113   15   21   35   18   20   20   28   45   18   17   26   28   28
   31   31   42   76   76   26   59   65  101  116   18   20   20   30
   44   44   82   82  128   37   46   46   80   88   88   33   46   29
   53   53   41   86   95  107  117  119  120   48  126  266  270  426
  151  267  603   19   21   26   35   41   47   62   78   80   80  142
  281  190   21   25   67   24   24   64   25   20   29   43   53   19
   22   31   41   47   99   67   81  149  183  275  382   56  182  227
  341 

Разбиваем исходные данные на обучающую и контрольную выборки. Обучающая выборка составляет 80% от объема исходных данных, а контрольная составляет, соответственно, 20%.

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Объявляем *StandardScaler* для нормализации данных.

In [38]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Модель MLP

In [46]:
mlp = MLP([6, 16, 16, 1], [Sigmoid, Sigmoid, Sigmoid, Sigmoid])
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

In [47]:
print(f"Коэффициент детерминации: {r2_score(y_test, y_pred)}")
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {mean_squared_error(y_test,y_pred, squared=False)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

Коэффициент детерминации: 0.9386683538548619
MSE: 373.5774888438096
RMSE: 19.328152753013143
MAE: 10.6264563296583


**Вывод**: реализованная модель протестирована на рекомендуемых датасетах, качество классификации довольно неплохое, но хуже, чем при использовании базовых моделей.