In [108]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
import tensorflow as tf

In [109]:
# Инициализируем все известные генераторы случаынйх чисел / Setting all known random seeds
my_code = "Олгашов"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit
os.environ['PYTHONHASHSEED']=str(my_seed)
random.seed(my_seed)
np.random.seed(my_seed)
tf.compat.v1.set_random_seed(my_seed)
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

In [110]:
example_data = pd.read_csv("datasets/russian_demography.csv")

In [111]:
example_data.head()

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,Republic of Adygea,1.9,14.2,12.3,84.66,52.42
1,1990,Altai Krai,1.8,12.9,11.1,80.24,58.07
2,1990,Amur Oblast,7.6,16.2,8.6,69.55,68.37
3,1990,Arkhangelsk Oblast,3.7,13.5,9.8,73.26,73.63
4,1990,Astrakhan Oblast,4.7,15.1,10.4,77.05,68.01


In [112]:
# Так как список регионов меняется от года к году, в данных есть строки без значений. Удалим их
example_data.dropna(inplace=True)

In [113]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(example_data))
print(val_test_size)

463


In [114]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(example_data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

1389 463 463


In [115]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale = ['year', 'npg', 'birth_rate', 'death_rate', 'gdw', 'urbanization']
ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['year', 'npg', 'birth_rate', 'death_rate',
                                  'gdw', 'urbanization'])])

In [116]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [117]:
# Устанавливаем названия столбцов
column_names = columns_to_scale + ['region']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [118]:
sc_train

Unnamed: 0,year,npg,birth_rate,death_rate,gdw,urbanization,region
0,0.814815,0.449631,0.364807,0.432558,0.466551,0.5619,Astrakhan Oblast
1,0.185185,0.208845,0.094421,0.595349,0.765943,0.505197,Oryol Oblast
2,0.037037,0.420147,0.236052,0.348837,0.666725,0.66557,Novosibirsk Oblast
3,0.111111,0.380835,0.2103,0.395349,0.72424,0.467044,Republic of Buryatia
4,0.814815,0.432432,0.300429,0.395349,0.372024,0.617156,Tomsk Oblast
...,...,...,...,...,...,...,...
1384,0.333333,0.746929,0.600858,0.125581,0.82172,0.237074,Republic of Ingushetia
1385,0.481481,0.304668,0.236052,0.562791,0.432493,0.575319,Astrakhan Oblast
1386,0.62963,0.19656,0.150215,0.688372,0.472632,0.514538,Kursk Oblast
1387,0.814815,0.395577,0.257511,0.418605,0.413901,1.0,Saint Petersburg


In [119]:
c_models = []
c_models.append(MultinomialNB(alpha=0.0))
c_models.append(MultinomialNB(alpha=0.5))
c_models.append(MultinomialNB(alpha=1.0))

In [120]:
# Выделим предикторы и метки классов
x_labels = column_names[0:-1]
y_labels = ['region']
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]
y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [121]:
# Обучаем модели
for model in c_models:
    model.fit(x_train, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


In [122]:
# Оценииваем качество работы моделей на валидационной выборке.
f1s = []
for model in c_models:
    val_pred = model.predict(x_val)
    f1 = f1_score(y_val, val_pred, average='weighted')
    f1s.append(f1)
    print(f1)

0.01605282717718498
0.011570052171880238
0.009130685286305925


In [123]:
clas = MLPClassifier(alpha=0.0, batch_size=16, epsilon=1e-07, max_iter=50)
clas.get_params()


{'activation': 'relu',
 'alpha': 0.0,
 'batch_size': 16,
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-07,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 50,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [124]:
clas.fit(x_train, y_train)




MLPClassifier(alpha=0.0, batch_size=16, epsilon=1e-07, max_iter=50)

In [125]:
pred_val = clas.predict(x_val)

In [127]:
f1_2 = f1_score(y_val, pred_val, average='weighted')
print(f1_2)

0.25309013916797407


In [129]:
print(f1s,f1_2)

[0.01605282717718498, 0.011570052171880238, 0.009130685286305925] 0.25309013916797407
