<a href="https://colab.research.google.com/github/ribalchusy/machine-learning/blob/main/MACHINE_LEARNING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# загрузка пакетов: инструменты -----------------------
# работа с массивами
import numpy as np
# фреймы данных
import pandas as pd
# графики
import matplotlib as mpl
# стили и шаблоны графиков на основе matplotlib
import seaborn as sns
# тест Шапиро-Уилка на нормальность распределения
from scipy.stats import shapiro
# тест Лиллиефорса на нормальность распределения
from statsmodels.stats.diagnostic import lilliefors
# загрузка пакетов: модели ----------------------------
# логистическая регрессия (ММП)
from sklearn.linear_model import LogisticRegression
# линейный дискриминантный анализ (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# квадратичный дискриминантный анализ (QDA)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# матрица неточностей
from sklearn.metrics import classification_report, confusion_matrix
# PPV (TP / (TP + FP))
from sklearn.metrics import precision_score
# расчёт TPR, SPC, F1
from sklearn.metrics import precision_recall_fscore_support
# подготовка матрицы X для модели регрессии
from statsmodels.api import add_constant
# модель логистической регрессии
from statsmodels.formula.api import logit
plt = mpl.pyplot
# дерево классификации
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
# перекрёстная проверка и метод проверочной выборки
from sklearn.model_selection import cross_val_score, train_test_split
# для перекрёстной проверки и сеточного поиска
from sklearn.model_selection import KFold, GridSearchCV
# бэггинг
from sklearn.ensemble import BaggingClassifier
# случайный лес
from sklearn.ensemble import RandomForestClassifier
# бустинг
from sklearn.ensemble import GradientBoostingClassifier
# сводка по точности классификации
from sklearn.metrics import classification_report

## Загрузка данных


In [2]:
URL = "https://raw.githubusercontent.com/avtararin/machine_learning/main/sem_rabota/thermopoints.csv"
DF = pd.read_csv(URL)
DF.head()

Unnamed: 0,dt,type_name,type_id,lon,lat
0,2012-03-13,Природный пожар,4,131.5866,47.8662
1,2012-03-13,Природный пожар,4,131.5885,47.8809
2,2012-03-13,Лесной пожар,3,131.9871,48.4973
3,2012-03-13,Природный пожар,4,131.9031,43.6277
4,2012-03-13,Природный пожар,4,131.5706,47.8581


## Преобразование данных
Выполнены следующие преобразования:
1.   Фиктивные перменные типов пожаров
2.   Фиктивные перменные для всех месяцев в году
3. Фиктивные перменные для дней недели



In [3]:
df_dummy = pd.get_dummies(DF[['type_name']])
DF = pd.concat([DF.reset_index(drop=True), df_dummy], axis=1)
DF['month'] = pd.DatetimeIndex(DF['dt']). month
DF['dow'] = pd.to_datetime(DF['dt']).dt.dayofweek
print(DF.shape)
DF.head()

(660254, 12)


Unnamed: 0,dt,type_name,type_id,lon,lat,type_name_Контролируемый пал,type_name_Лесной пожар,type_name_Неконтролируемый пал,type_name_Природный пожар,type_name_Торфяной пожар,month,dow
0,2012-03-13,Природный пожар,4,131.5866,47.8662,False,False,False,True,False,3,1
1,2012-03-13,Природный пожар,4,131.5885,47.8809,False,False,False,True,False,3,1
2,2012-03-13,Лесной пожар,3,131.9871,48.4973,False,True,False,False,False,3,1
3,2012-03-13,Природный пожар,4,131.9031,43.6277,False,False,False,True,False,3,1
4,2012-03-13,Природный пожар,4,131.5706,47.8581,False,False,False,True,False,3,1


In [4]:
dummy_month = pd.get_dummies(DF['month'])
DF = pd.concat([DF, dummy_month], axis=1)

In [5]:
DF = DF.rename(columns={'type_name_Контролируемый пал' : 'Контролируемый пожар', 'type_name_Лесной пожар' : 'Лесной пожар', 'type_name_Неконтролируемый пал' : 'Неконтролируемый пожар', 'type_name_Природный пожар' : 'Природный пожар', 'type_name_Торфяной пожар':'Торфяной пожар'})
DF['Контролируемый пожар'] = DF['Контролируемый пожар'].astype(int)
DF['Лесной пожар'] = DF['Лесной пожар'].astype(int)
DF['Неконтролируемый пожар'] = DF['Неконтролируемый пожар'].astype(int)
DF['Природный пожар'] = DF['Природный пожар'].astype(int)
DF['Торфяной пожар'] = DF['Торфяной пожар'].astype(int)
DF[1] = DF[1].astype(int)
DF[2] = DF[2].astype(int)
DF[3] = DF[3].astype(int)
DF[4] = DF[4].astype(int)
DF[5] = DF[5].astype(int)
DF[6] = DF[6].astype(int)
DF[7] = DF[7].astype(int)
DF[8] = DF[8].astype(int)
DF[9] = DF[9].astype(int)
DF[10] = DF[10].astype(int)
DF[11] = DF[11].astype(int)
DF[12] = DF[12].astype(int)
DF = DF.rename(columns={1 : 'Январь',  2: 'Февраль', 3 : 'Март', 4 : 'Апрель', 5:'Май', 6 : 'Июнь', 7 : 'Июль', 8 : 'Август', 9 : 'Сентябрь', 10 : 'Октябрь' , 11 : 'Ноябрь', 12 : 'Декабрь'})
DF.head()

Unnamed: 0,dt,type_name,type_id,lon,lat,Контролируемый пожар,Лесной пожар,Неконтролируемый пожар,Природный пожар,Торфяной пожар,...,Март,Апрель,Май,Июнь,Июль,Август,Сентябрь,Октябрь,Ноябрь,Декабрь
0,2012-03-13,Природный пожар,4,131.5866,47.8662,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,2012-03-13,Природный пожар,4,131.5885,47.8809,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,2012-03-13,Лесной пожар,3,131.9871,48.4973,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2012-03-13,Природный пожар,4,131.9031,43.6277,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,2012-03-13,Природный пожар,4,131.5706,47.8581,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
dummy_dow = pd.get_dummies(DF['dow'])
DF = pd.concat([DF, dummy_dow], axis=1)

In [7]:
DF[0] = DF[0].astype(int)
DF[1] = DF[1].astype(int)
DF[2] = DF[2].astype(int)
DF[3] = DF[3].astype(int)
DF[4] = DF[4].astype(int)
DF[5] = DF[5].astype(int)
DF[6] = DF[6].astype(int)
DF = DF.rename(columns={0: 'Понедельник', 1 : 'Вторник',  2: 'Среда', 3 : 'Четверг', 4 : 'Пятница', 5:'Суббота', 6 : 'Воскрсенье'})


In [8]:
DF.iloc[:7]

Unnamed: 0,dt,type_name,type_id,lon,lat,Контролируемый пожар,Лесной пожар,Неконтролируемый пожар,Природный пожар,Торфяной пожар,...,Октябрь,Ноябрь,Декабрь,Понедельник,Вторник,Среда,Четверг,Пятница,Суббота,Воскрсенье
0,2012-03-13,Природный пожар,4,131.5866,47.8662,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,2012-03-13,Природный пожар,4,131.5885,47.8809,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,2012-03-13,Лесной пожар,3,131.9871,48.4973,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2012-03-13,Природный пожар,4,131.9031,43.6277,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,2012-03-13,Природный пожар,4,131.5706,47.8581,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
5,2012-03-14,Природный пожар,4,131.5798,52.1321,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
6,2012-03-15,Природный пожар,4,20.4452,54.8597,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


## Анализ пропусков

In [9]:
print(DF.isna().sum())
print(DF.dtypes)

dt                        0
type_name                 0
type_id                   0
lon                       0
lat                       0
Контролируемый пожар      0
Лесной пожар              0
Неконтролируемый пожар    0
Природный пожар           0
Торфяной пожар            0
month                     0
dow                       0
Январь                    0
Февраль                   0
Март                      0
Апрель                    0
Май                       0
Июнь                      0
Июль                      0
Август                    0
Сентябрь                  0
Октябрь                   0
Ноябрь                    0
Декабрь                   0
Понедельник               0
Вторник                   0
Среда                     0
Четверг                   0
Пятница                   0
Суббота                   0
Воскрсенье                0
dtype: int64
dt                         object
type_name                  object
type_id                     int64
lon              

Пропусков не обнаружено

## Предварительный анализ данных
Разделение выборок на train и test, удаление ненужных перменных

In [10]:
DF = DF.drop(['Контролируемый пожар', 'Неконтролируемый пожар', 'Природный пожар', 'Торфяной пожар', 'type_name', 'type_id', 'dt', 'dow', 'month', 'month'], axis=1)
DF_train = DF.sample(frac = 0.8, random_state = 18)

DF_test = DF.drop(DF_train.index)
y_test = DF_test['Лесной пожар']
DF_test = DF_test.drop(['Лесной пожар'], axis = 1)
X_test = DF_test
DF_test.head()

Unnamed: 0,lon,lat,Январь,Февраль,Март,Апрель,Май,Июнь,Июль,Август,...,Октябрь,Ноябрь,Декабрь,Понедельник,Вторник,Среда,Четверг,Пятница,Суббота,Воскрсенье
2,131.9871,48.4973,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,131.5798,52.1321,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,125.8202,55.209,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11,132.262,44.0062,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
17,91.054,57.1111,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


Описательные статистики

In [11]:
DF_train.describe()

Unnamed: 0,lon,lat,Лесной пожар,Январь,Февраль,Март,Апрель,Май,Июнь,Июль,...,Октябрь,Ноябрь,Декабрь,Понедельник,Вторник,Среда,Четверг,Пятница,Суббота,Воскрсенье
count,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,...,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0,528203.0
mean,107.823182,57.054645,0.464231,0.001816,0.006781,0.064954,0.215722,0.109108,0.088627,0.222507,...,0.036092,0.014241,0.001812,0.135838,0.138345,0.139113,0.144954,0.148102,0.146211,0.147438
std,29.388175,6.306717,0.498719,0.042571,0.08207,0.246445,0.411323,0.311775,0.284205,0.41593,...,0.18652,0.118482,0.042527,0.342617,0.345262,0.346065,0.352055,0.355202,0.353318,0.354542
min,19.8622,41.3805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,90.8626,52.5459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,115.0273,56.4783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,130.6953,62.4902,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,178.4351,72.7414,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
corr_mat = DF.corr()
styled_corr = corr_mat.style.background_gradient(cmap='coolwarm').format("{:.2f}")
styled_corr

Unnamed: 0,lon,lat,Лесной пожар,Январь,Февраль,Март,Апрель,Май,Июнь,Июль,Август,Сентябрь,Октябрь,Ноябрь,Декабрь,Понедельник,Вторник,Среда,Четверг,Пятница,Суббота,Воскрсенье
lon,1.0,0.22,0.31,0.02,0.02,-0.03,-0.2,-0.11,0.17,0.12,0.1,-0.02,-0.07,0.02,0.0,0.01,-0.02,-0.01,0.01,-0.0,0.01,0.01
lat,0.22,1.0,0.5,-0.07,-0.14,-0.28,-0.4,-0.15,0.2,0.38,0.32,0.04,-0.18,-0.17,-0.06,-0.01,0.0,-0.01,-0.0,0.01,0.01,-0.0
Лесной пожар,0.31,0.5,1.0,-0.04,-0.07,-0.23,-0.37,-0.1,0.15,0.29,0.3,0.03,-0.14,-0.1,-0.04,-0.0,-0.01,0.0,-0.0,0.01,0.0,-0.0
Январь,0.02,-0.07,-0.04,1.0,-0.0,-0.01,-0.02,-0.01,-0.01,-0.02,-0.02,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.01,-0.0
Февраль,0.02,-0.14,-0.07,-0.0,1.0,-0.02,-0.04,-0.03,-0.03,-0.04,-0.04,-0.02,-0.02,-0.01,-0.0,0.0,0.01,0.01,0.0,-0.01,-0.0,-0.01
Март,-0.03,-0.28,-0.23,-0.01,-0.02,1.0,-0.14,-0.09,-0.08,-0.14,-0.12,-0.06,-0.05,-0.03,-0.01,0.01,-0.01,-0.0,-0.01,0.01,-0.01,0.0
Апрель,-0.2,-0.4,-0.37,-0.02,-0.04,-0.14,1.0,-0.18,-0.16,-0.28,-0.25,-0.13,-0.1,-0.06,-0.02,0.01,0.01,-0.0,0.0,-0.01,-0.0,-0.0
Май,-0.11,-0.15,-0.1,-0.01,-0.03,-0.09,-0.18,1.0,-0.11,-0.19,-0.17,-0.09,-0.07,-0.04,-0.01,-0.01,0.01,-0.01,0.0,-0.01,0.01,0.01
Июнь,0.17,0.2,0.15,-0.01,-0.03,-0.08,-0.16,-0.11,1.0,-0.17,-0.15,-0.08,-0.06,-0.04,-0.01,0.01,0.02,-0.01,-0.0,-0.01,0.01,-0.01
Июль,0.12,0.38,0.29,-0.02,-0.04,-0.14,-0.28,-0.19,-0.17,1.0,-0.25,-0.13,-0.1,-0.06,-0.02,-0.03,-0.0,0.02,0.01,0.02,-0.01,-0.02


## Логистическая регрессия
В качестве объясняемой перменной возьмем Лесной пожар, а объясняющие перменные - lon, lat, месяца и дни недели

In [13]:
X_train = DF_train.drop(['Лесной пожар'], axis = 1)
y_train = DF_train['Лесной пожар']
fit_LR_1 = LogisticRegression().fit(X=X_train, y = y_train)

print('Коэффициенты при объясняющих переменных: ', np.round(fit_LR_1.coef_, 4), '\n Константа', np.around(fit_LR_1.intercept_, 4))

Коэффициенты при объясняющих переменных:  [[ 0.0209  0.0414 -0.712  -1.719  -2.8247 -1.0207  0.2497  1.2613  1.5107
   1.8589  0.9186 -0.9616 -2.5565 -0.7124 -0.6583 -0.6755 -0.6327 -0.7059
  -0.5897 -0.7187 -0.7268]] 
 Константа [-4.7131]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
fit_LR_1.coef_

array([[ 0.02089763,  0.04135082, -0.71204223, -1.71896115, -2.82473361,
        -1.020707  ,  0.24974633,  1.26125306,  1.51073279,  1.85886791,
         0.91859366, -0.96157061, -2.55648429, -0.71235899, -0.65833375,
        -0.67547487, -0.63274869, -0.70590327, -0.58972151, -0.71868291,
        -0.72679911]])

In [15]:
fit_LR_1.intercept_


array([-4.71313982])

In [16]:
X_test.head()

Unnamed: 0,lon,lat,Январь,Февраль,Март,Апрель,Май,Июнь,Июль,Август,...,Октябрь,Ноябрь,Декабрь,Понедельник,Вторник,Среда,Четверг,Пятница,Суббота,Воскрсенье
2,131.9871,48.4973,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,131.5798,52.1321,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,125.8202,55.209,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11,132.262,44.0062,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
17,91.054,57.1111,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [17]:
# отчёт по точности на тестовой
y_prob_test_LR_1 = fit_LR_1.predict_proba(X_test).reshape(2, -1)[1]
y_hat_test = (y_prob_test_LR_1 > 0.5).astype(int)
print('Модель логистической регрессии от studentYes, balance ',
      'с порогом 0.5 : \n',
        classification_report(y_test, y_hat_test))

Модель логистической регрессии от studentYes, balance  с порогом 0.5 : 
               precision    recall  f1-score   support

           0       0.54      0.50      0.52     70706
           1       0.46      0.50      0.48     61345

    accuracy                           0.50    132051
   macro avg       0.50      0.50      0.50    132051
weighted avg       0.50      0.50      0.50    132051



# Градиентный бустинг

In [18]:
# разбиения для перекрёстной проверки
kfold = KFold(n_splits=5, random_state=15, shuffle=True)
# обучаем модель с параметрами по умолчанию
clf_tst = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
 max_depth=1, random_state=15)
cv = cross_val_score(clf_tst, X_train, y_train, cv=kfold, scoring='accuracy')
np.around(np.mean(cv), 3)

0.843

In [20]:
# настроим параметры бустинга с помощью сеточного поиска
param_grid = {'n_estimators' : [10, 20, 30, 40, 50],
 'learning_rate' : np.linspace(start=0.01, stop=0.25, num=15),
 'max_depth' : [1, 2]}
# таймер
clf = GridSearchCV(GradientBoostingClassifier(),
 param_grid, scoring='accuracy', cv=kfold)
boost_tree = clf.fit(X_train, y_train)


In [21]:
boost_tree.best_score_

0.8523086797386823

In [22]:
# параметры лучшей модели
print('n_estimators:',
 boost_tree.best_estimator_.get_params()['n_estimators'],
 '\nlearning_rate:',
 boost_tree.best_estimator_.get_params()['learning_rate'],
 '\nmax_depth:',
 boost_tree.best_estimator_.get_params()['max_depth'])

n_estimators: 50 
learning_rate: 0.25 
max_depth: 2


In [26]:
# строим прогноз
y_hat = boost_tree.best_estimator_.predict(X_test)
# характеристики точности
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86     70706
           1       0.83      0.85      0.84     61345

    accuracy                           0.85    132051
   macro avg       0.85      0.85      0.85    132051
weighted avg       0.85      0.85      0.85    132051

