# Логистическая регрессия

Практическая лекция по логистической регрессии.

На этом занятии мы будем работать с тем же набором данных, что и на занятии про kNN. Так что посмотреть, что мы делали с данными, можно там.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Parse the first CSV column as the index, then discard it in favour of a
# fresh 0..n-1 index.
data = (
    pd.read_csv('../data/Pokemon.csv', index_col=0)
    .reset_index(drop=True)
)

In [4]:
# Peek at the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [5]:
# Per-column dtypes and non-null counts; a quick way to spot columns with gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 12 columns):
Name          800 non-null object
Type 1        800 non-null object
Type 2        414 non-null object
Total         800 non-null int64
HP            800 non-null int64
Attack        800 non-null int64
Defense       800 non-null int64
Sp. Atk       800 non-null int64
Sp. Def       800 non-null int64
Speed         800 non-null int64
Generation    800 non-null int64
Legendary     800 non-null bool
dtypes: bool(1), int64(8), object(3)
memory usage: 69.6+ KB


In [6]:
# Names of the columns stored as 64-bit integers or floats, in frame order.
num_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [7]:
# Share of missing values per column: the mean of the boolean null mask.
data.isnull().mean()

Name          0.0000
Type 1        0.0000
Type 2        0.4825
Total         0.0000
HP            0.0000
Attack        0.0000
Defense       0.0000
Sp. Atk       0.0000
Sp. Def       0.0000
Speed         0.0000
Generation    0.0000
Legendary     0.0000
dtype: float64

In [8]:
# Missing secondary types become an explicit category of their own.
data['Type 2'] = data['Type 2'].fillna('NoneType')

## Преобразование категориальных признаков 

Линейным моделям не страшна размерность пространства признаков, так что для категориальных признаков добавим дамми-переменные. Но LabelEncoding тоже сохраним.

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace the type names with integer codes. The single encoder is refit for
# each column, so afterwards it only holds the mapping learned from 'Type 2'.
encoder = LabelEncoder()
for type_col in ('Type 1', 'Type 2'):
    data[type_col] = encoder.fit_transform(data[type_col])

In [10]:
# Indicator (dummy) columns for the integer-encoded types. drop_first=True
# omits the indicator of the first category, and the remaining columns are
# renumbered from 0, so the suffix is a running position rather than the
# encoded value itself.
type1 = pd.get_dummies(data['Type 1'], drop_first=True)
type1.columns = ['type1_{}'.format(pos) for pos in range(len(type1.columns))]
type2 = pd.get_dummies(data['Type 2'], drop_first=True)
type2.columns = ['type2_{}'.format(pos) for pos in range(len(type2.columns))]


In [11]:
# Remove the 'Name' column in place so it does not reach the model as a feature.
del data['Name']

In [12]:
# Append both sets of indicator columns side by side; rows are matched on the
# index.
data = pd.concat([data, type1, type2], axis='columns')

In [14]:
# Summary statistics of the frame after the indicator columns were appended.
data.describe()

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,...,type2_8,type2_9,type2_10,type2_11,type2_12,type2_13,type2_14,type2_15,type2_16,type2_17
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,...,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,9.4675,10.49125,435.1025,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375,...,0.03125,0.04375,0.0175,0.4825,0.005,0.0425,0.04125,0.0175,0.0275,0.0175
std,5.580356,3.750334,119.96304,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129,...,0.174101,0.204666,0.131207,0.500006,0.070578,0.201853,0.198992,0.131207,0.163637,0.131207
min,0.0,0.0,180.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,7.0,330.0,50.0,55.0,50.0,49.75,50.0,45.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.0,12.0,450.0,65.0,75.0,70.0,65.0,70.0,65.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14.0,12.0,515.0,80.0,100.0,90.0,95.0,90.0,90.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,17.0,18.0,780.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# First rows of the frame with the encoded types and their indicator columns.
data.head()

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,...,type2_8,type2_9,type2_10,type2_11,type2_12,type2_13,type2_14,type2_15,type2_16,type2_17
0,9,14,318,45,49,49,65,65,45,1,...,0,0,0,0,0,1,0,0,0,0
1,9,14,405,60,62,63,80,80,60,1,...,0,0,0,0,0,1,0,0,0,0
2,9,14,525,80,82,83,100,100,80,1,...,0,0,0,0,0,1,0,0,0,0
3,9,14,625,80,100,123,122,120,80,1,...,0,0,0,0,0,1,0,0,0,0
4,6,12,309,39,52,43,60,50,65,1,...,0,0,0,1,0,0,0,0,0,0


Добавим все попарные кросс-признаки: произведения пар признаков и их квадраты (полиномиальные признаки степени 2). Константный признак включать не будем.

In [15]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion of every feature column (the target is excluded);
# include_bias=False leaves out the constant column.
pol_feat = PolynomialFeatures(degree=2, include_bias=False)
# The expanded columns are numbered 0..k-1, so prefixing them yields the
# names pol_0 ... pol_{k-1}.
pol = (
    pd.DataFrame(pol_feat.fit_transform(data.drop('Legendary', axis=1)))
    .reset_index(drop=True)
    .add_prefix('pol_')
)

In [16]:
# Inspect the expanded feature columns.
pol.head()

Unnamed: 0,pol_0,pol_1,pol_2,pol_3,pol_4,pol_5,pol_6,pol_7,pol_8,pol_9,...,pol_1070,pol_1071,pol_1072,pol_1073,pol_1074,pol_1075,pol_1076,pol_1077,pol_1078,pol_1079
0,9.0,14.0,318.0,45.0,49.0,49.0,65.0,65.0,45.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9.0,14.0,405.0,60.0,62.0,63.0,80.0,80.0,60.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.0,14.0,525.0,80.0,82.0,83.0,100.0,100.0,80.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.0,14.0,625.0,80.0,100.0,123.0,122.0,120.0,80.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,12.0,309.0,39.0,52.0,43.0,60.0,50.0,65.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Attach the expanded columns next to the existing ones; rows are matched on
# the index.
# NOTE(review): the first columns of `pol` are the degree-1 terms, i.e. copies
# of the features already present in `data` -- confirm the duplication is
# intended.
data = pd.concat([data, pol], axis='columns')

In [18]:
# First rows of the final frame, now including the pol_* columns.
data.head()

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,...,pol_1070,pol_1071,pol_1072,pol_1073,pol_1074,pol_1075,pol_1076,pol_1077,pol_1078,pol_1079
0,9,14,318,45,49,49,65,65,45,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9,14,405,60,62,63,80,80,60,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9,14,525,80,82,83,100,100,80,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9,14,625,80,100,123,122,120,80,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,12,309,39,52,43,60,50,65,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Построение модели логистической регрессии

## Train Test Split

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible. Everything except 'Legendary' is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Legendary', axis=1),
    data['Legendary'],
    test_size=0.3,
    random_state=42,
)

## Training and Predicting

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Fit a logistic regression on the training split (C=2, up to 1000 solver
# iterations, fixed seed).
# NOTE(review): warm_start=True has no visible effect here -- the estimator is
# created and fit once in this cell, and the echoed solver is liblinear, for
# which scikit-learn documents warm_start as unused. Confirm it can be dropped.
logmodel = LogisticRegression(C=2, max_iter=1000, 
                              warm_start=True, random_state=42)
logmodel.fit(X_train, y_train)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=True)

In [23]:
# Keep only column 1 of predict_proba, i.e. the probability of the second
# entry of logmodel.classes_ (presumably True / legendary for this boolean
# target -- verify).
predictions = logmodel.predict_proba(X_test)[:, 1]

## Оценка качества модели

In [24]:
from sklearn.metrics import roc_auc_score

In [25]:
# ROC AUC on the held-out split, computed from the predicted probabilities.
print(roc_auc_score(y_test, predictions))

0.8503703703703703


## Cross Validation

In [26]:
# 0/1 integer copy of the boolean target, used by the cross-validation loop.
target = data['Legendary'].astype(int)

In [27]:
from sklearn.model_selection import KFold


In [28]:
# 10-fold cross-validation of the same model configuration, scored by ROC AUC.
# The per-fold scores are collected in `val_rate`.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Build the feature matrix once instead of re-dropping the target column
# twice in every fold.
X_all = data.drop('Legendary', axis=1)
val_rate = []
for tr_ind, val_ind in kf.split(X_all):
    # A fresh estimator per fold, so nothing carries over between folds.
    lr = LogisticRegression(C=2, max_iter=1000,
                            warm_start=True, random_state=42)
    # KFold.split yields positional row numbers, so select rows by position
    # (.iloc) rather than by index label (.loc / []); the label-based form
    # only works while the index happens to be 0..n-1.
    train = X_all.iloc[tr_ind]
    val = X_all.iloc[val_ind]

    target_train = target.iloc[tr_ind]
    target_val = target.iloc[val_ind]

    lr.fit(train, target_train)
    # Column 1 of predict_proba is the probability of the second class,
    # which is 1 for the 0/1 target.
    pred_i = lr.predict_proba(val)[:, 1]
    val_rate.append(roc_auc_score(target_val, pred_i))

In [29]:
# Mean and spread of the per-fold ROC AUC; np.std is called with its default
# settings (population standard deviation, ddof=0).
print('Среднее: {:.3f}\nCтандартное отклонение: {:.3f}'.format(np.mean(val_rate), np.std(val_rate)))

Среднее: 0.917
Cтандартное отклонение: 0.074
