In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour

import category_encoders as ce

plt.style.use('seaborn-colorblind')
%matplotlib inline

In [2]:
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]

data = pd.read_csv('/Users/muzalevskiy/Desktop/modules/titanic.csv', usecols=use_cols,sep='\t')
print(data.shape)
data.head(8)

(156, 6)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925
3,1,1,female,35.0,1,53.1
4,0,3,male,35.0,0,8.05
5,0,3,male,,0,8.4583
6,0,1,male,54.0,0,51.8625
7,0,3,male,2.0,3,21.075


## Пропущенные значения

Проверка пропущенных значений

In [3]:
def check_missing(data,output_path=None):    
    result = pd.concat([data.isnull().sum(),data.isnull().mean()],axis=1)
    result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
    if output_path is not None:
        result.to_csv(output_path+'missing.csv')
        print(output_path, 'missing.csv')
    return result

In [4]:
check_missing(data=data)

Unnamed: 0,total missing,proportion
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,30,0.192308
SibSp,0,0.0
Fare,0,0.0


Удаление пропущенных значений

In [5]:
def drop_missing(data,axis=0):
    data_copy = data.copy(deep=True)
    data_copy = data_copy.dropna(axis=axis,inplace=False)
    return data_copy

In [6]:
data2 = drop_missing(data=data)
data2.shape

(126, 6)

Добавление переменной оценки пропущенных значений

In [7]:
def add_var_denote_NA(data,NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(),1,0)
        else:
            warn("Нет пропущенных значений" % i)          
    return data_copy

In [8]:
data3 = add_var_denote_NA(data=data,NA_col=['Age'])
print(data3.Age_is_NA.value_counts())
data3.head(8)

0    126
1     30
Name: Age_is_NA, dtype: int64


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_is_NA
0,0,3,male,22.0,1,7.25,0
1,1,1,female,38.0,1,71.2833,0
2,1,3,female,26.0,0,7.925,0
3,1,1,female,35.0,1,53.1,0
4,0,3,male,35.0,0,8.05,0
5,0,3,male,,0,8.4583,1
6,0,1,male,54.0,0,51.8625,0
7,0,3,male,2.0,3,21.075,0


Заполнение пропусков выборочным значением

In [9]:
def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):  
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy

In [10]:
data4 = impute_NA_with_arbitrary(data=data,impute_value=-999,NA_col=['Age'])
data4.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_-999
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,-999.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


Заполнение пропущенных значений средним/медианой/модой

In [11]:
def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            if strategy=='mean':
                data_copy[i+'_impute_mean'] = data_copy[i].fillna(data[i].mean())
            elif strategy=='median':
                data_copy[i+'_impute_median'] = data_copy[i].fillna(data[i].median())
            elif strategy=='mode':
                data_copy[i+'_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy  

In [12]:
print(data.Age.median())
data5 = impute_NA_with_avg(data=data,strategy='median',NA_col=['Age'])
data5.head(8)

26.0


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_median
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,26.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


Заполнение пропусков значением из "хвоста" распределения

In [13]:
def impute_NA_with_end_of_distribution(data,NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_impute_end_of_distri'] = data_copy[i].fillna(data[i].mean()+3*data[i].std())
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy     

In [14]:
data6 = impute_NA_with_end_of_distribution(data=data,NA_col=['Age'])
data6.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_end_of_distri
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,71.983148
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


Заполнение пропусков случайными значениями

In [15]:
def impute_NA_with_random(data,NA_col=[],random_state=0):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_random'] = data_copy[i]
            random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
            random_sample.index = data_copy[data_copy[i].isnull()].index
            data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
        else:
            warn("Нет пропущенных значений" % i)
    return data_copy 

In [16]:
data7 = impute_NA_with_random(data=data,NA_col=['Age'])
data7.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_random
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,14.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


## Выбросы

Детекция с помощью выборочных значений

In [17]:
def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
    para = (upper_fence, lower_fence)
    tmp = pd.concat([data[col]>upper_fence,data[col]<lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))    
    return outlier_index, para

In [18]:
index,para = outlier_detect_arbitrary(data=data,col='Fare',upper_fence=100,lower_fence=5)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 4
Доля выбросов: 0.025641025641
Верхняя граница: 100 
Нижняя граница: 5


In [19]:
data.loc[index,'Fare'].sort_values()

31     146.5208
118    247.5208
27     263.0000
88     263.0000
Name: Fare, dtype: float64

Интерквартильное расстояние

In [20]:
def outlier_detect_IQR(data,col,threshold=3):    
    IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
    Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
    Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
    para = (Upper_fence, Lower_fence)
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [21]:
index,para = outlier_detect_IQR(data=data,col='Fare',threshold=5)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 4
Доля выбросов: 0.025641025641
Верхняя граница: 142.21535 
Нижняя граница: -103.84035


In [22]:
data.loc[index,'Fare'].sort_values()

31     146.5208
118    247.5208
27     263.0000
88     263.0000
Name: Fare, dtype: float64

Среднее-среднеквадратичное отклонение

In [23]:
def outlier_detect_mean_std(data,col,threshold=3):
    Upper_fence = data[col].mean() + threshold * data[col].std()
    Lower_fence = data[col].mean() - threshold * data[col].std()   
    para = (Upper_fence, Lower_fence)   
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [24]:
index,para = outlier_detect_mean_std(data=data,col='Fare',threshold=3)
print('Верхняя граница:',para[0],'\nНижняя граница:',para[1])

Количество выбросов в данных: 4
Доля выбросов: 0.025641025641
Верхняя граница: 146.31272726896492 
Нижняя граница: -90.09355290999055


In [25]:
data.loc[index,'Fare'].sort_values()

31     146.5208
118    247.5208
27     263.0000
88     263.0000
Name: Fare, dtype: float64

Медианы абсолютного отклонения (MAD) 

In [26]:
def outlier_detect_MAD(data,col,threshold=3.5):
    median = data[col].median()
    median_absolute_deviation = np.median([np.abs(y - median) for y in data[col]])
    modified_z_scores = pd.Series([0.6745 * (y - median) / median_absolute_deviation for y in data[col]])
    outlier_index = np.abs(modified_z_scores) > threshold
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index

In [27]:
index = outlier_detect_MAD(data=data,col='Fare',threshold=3.5)

Количество выбросов в данных: 25
Доля выбросов: 0.160256410256


Замена выброса выборочным значением

In [28]:
def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
    data_copy = data.copy(deep=True)
    for i in col:
        data_copy.loc[outlier_index,i] = value
    return data_copy

In [29]:
data2 = impute_outlier_with_arbitrary(data=data,outlier_index=index,
                                         value=-999,col=['Fare'])
data2[25:35]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
25,1,3,female,38.0,1,31.3875
26,0,3,male,,0,7.225
27,0,1,male,19.0,3,-999.0
28,1,3,female,,0,7.8792
29,0,3,male,,0,7.8958
30,0,1,male,40.0,0,27.7208
31,1,1,female,,1,-999.0
32,1,3,female,,0,7.75
33,0,2,male,66.0,0,10.5
34,0,1,male,28.0,1,-999.0


Виндзоризация

{92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, −40, 101, 86, 85, 15, 89, 89, 28, −5, 41}  

{92, 19, 101, 58, 101, 91, 26, 78, 10, 13, −5, 101, 86, 85, 15, 89, 89, 28, −5, 41} 

In [30]:
def windsorization(data,col,para,strategy='both'):
    data_copy = data.copy(deep=True)  
    if strategy == 'both':
        data_copy.loc[data_copy[col]>para[0],col] = para[0]
        data_copy.loc[data_copy[col]<para[1],col] = para[1]
    elif strategy == 'top':
        data_copy.loc[data_copy[col]>para[0],col] = para[0]
    elif strategy == 'bottom':
        data_copy.loc[data_copy[col]<para[1],col] = para[1]  
    return data_copy

In [31]:
data3 = windsorization(data=data,col='Fare',para=para,strategy='both')
data3[25:35]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
25,1,3,female,38.0,1,31.3875
26,0,3,male,,0,7.225
27,0,1,male,19.0,3,146.312727
28,1,3,female,,0,7.8792
29,0,3,male,,0,7.8958
30,0,1,male,40.0,0,27.7208
31,1,1,female,,1,146.312727
32,1,3,female,,0,7.75
33,0,2,male,66.0,0,10.5
34,0,1,male,28.0,1,82.1708


Удаление выбросов

In [32]:
def drop_outlier(data,outlier_index):
    data_copy = data[~outlier_index]
    return data_copy

In [33]:
data4 = drop_outlier(data=data,outlier_index=index)
print(data4.Fare.max())
print(data4.Fare.min())

47.1
6.75


Замена выбросов средним/медианой/модой

In [34]:
def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
    data_copy = data.copy(deep=True)
    if strategy=='mean':
        data_copy.loc[outlier_index,col] = data_copy[col].mean()
    elif strategy=='median':
        data_copy.loc[outlier_index,col] = data_copy[col].median()
    elif strategy=='mode':
        data_copy.loc[outlier_index,col] = data_copy[col].mode()[0]           
    return data_copy

In [35]:
data5 = impute_outlier_with_avg(data=data,col='Fare',
                                   outlier_index=index,strategy='mean')
data5[25:35]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
25,1,3,female,38.0,1,31.3875
26,0,3,male,,0,7.225
27,0,1,male,19.0,3,28.109587
28,1,3,female,,0,7.8792
29,0,3,male,,0,7.8958
30,0,1,male,40.0,0,27.7208
31,1,1,female,,1,28.109587
32,1,3,female,,0,7.75
33,0,2,male,66.0,0,10.5
34,0,1,male,28.0,1,28.109587


##  Шкалирование данных

In [36]:
X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((124, 6), (32, 6))

Нормализация данных

In [37]:
ss = StandardScaler().fit(X_train[['Fare']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[['Fare']])
print(X_train_copy.head(6))

     Survived  Pclass     Sex   Age  SibSp     Fare  Fare_zscore
132         0       3  female  47.0      1  14.5000    -0.342483
97          1       1    male  23.0      0  63.3583     0.949028
22          1       3  female  15.0      0   8.0292    -0.513531
80          0       3    male  22.0      0   9.0000    -0.487869
101         0       3    male   NaN      0   7.8958    -0.517057
102         0       1    male  21.0      0  77.2875     1.317229


In [38]:
print(X_train_copy['Fare_zscore'].mean())
print(X_train_copy['Fare_zscore'].std())

-3.4918304806758956e-17
1.004056811789459


Мин-Макс шкалирование

In [39]:
mms = MinMaxScaler().fit(X_train[['Fare']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['Fare_minmax'] = mms.transform(X_train_copy[['Fare']])
print(X_train_copy.head(6))

     Survived  Pclass     Sex   Age  SibSp     Fare  Fare_minmax
132         0       3  female  47.0      1  14.5000     0.030244
97          1       1    male  23.0      0  63.3583     0.220910
22          1       3  female  15.0      0   8.0292     0.004992
80          0       3    male  22.0      0   9.0000     0.008780
101         0       3    male   NaN      0   7.8958     0.004471
102         0       1    male  21.0      0  77.2875     0.275268


In [40]:
print(X_train_copy['Fare_minmax'].max())
print(X_train_copy['Fare_minmax'].min())

1.0
0.0


Робустное шкалирование

In [41]:
rs = RobustScaler().fit(X_train[['Fare']])
X_train_copy = X_train.copy(deep=True)
X_train_copy['Fare_robust'] = rs.transform(X_train_copy[['Fare']])
print(X_train_copy.head(6))

     Survived  Pclass     Sex   Age  SibSp     Fare  Fare_robust
132         0       3  female  47.0      1  14.5000     0.002050
97          1       1    male  23.0      0  63.3583     2.291278
22          1       3  female  15.0      0   8.0292    -0.301136
80          0       3    male  22.0      0   9.0000    -0.255649
101         0       3    male   NaN      0   7.8958    -0.307386
102         0       1    male  21.0      0  77.2875     2.943922


## Энкодинг переменных 

One-Hot encoding

In [42]:
data1 = pd.get_dummies(data,drop_first=True)

In [43]:
data1.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Fare,Sex_male
0,0,3,22.0,1,7.25,1
1,1,1,38.0,1,71.2833,0
2,1,3,26.0,0,7.925,0
3,1,1,35.0,1,53.1,0
4,0,3,35.0,0,8.05,1


Энкодинг значением

In [44]:
ord_enc = ce.OrdinalEncoder(cols=['Sex']).fit(X_train,y_train)

In [45]:
data4 = ord_enc.transform(data)
print(data4.head(5))

   Survived  Pclass  Sex   Age  SibSp     Fare
0         0       3    2  22.0      1   7.2500
1         1       1    1  38.0      1  71.2833
2         1       3    1  26.0      0   7.9250
3         1       1    1  35.0      1  53.1000
4         0       3    2  35.0      0   8.0500


Таргет энкодинг

In [46]:
target_enc = ce.TargetEncoder(cols=['Sex']).fit(X_train,y_train)

In [47]:
data2 = target_enc.transform(data)
data2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,0.164557,22.0,1,7.25
1,1,1,0.755556,38.0,1,71.2833
2,1,3,0.755556,26.0,0,7.925
3,1,1,0.755556,35.0,1,53.1
4,0,3,0.164557,35.0,0,8.05


WOE энкодинг

In [48]:
woe_enc = ce.WOEEncoder(cols=['Sex']).fit(X_train,y_train)

In [49]:
data3 = woe_enc.transform(data)
data3.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,-1.088008,22.0,1,7.25
1,1,1,1.548069,38.0,1,71.2833
2,1,3,1.548069,26.0,0,7.925
3,1,1,1.548069,35.0,1,53.1
4,0,3,-1.088008,35.0,0,8.05


## Feature Selection

In [50]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
data = pd.DataFrame(np.c_[data['data'], data['target']],
                  columns= np.append(data['feature_names'], ['target']))

In [51]:
data.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [52]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), 
                                                    data.target, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((455, 30), (114, 30))

Константные значения

In [53]:
def constant_feature_detect(data,threshold=0.98):    
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        predominant = (data_copy[feature].value_counts() / np.float(
                      len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature),'константные переменные')    
    return quasi_constant_feature

In [54]:
quasi_constant_feature = constant_feature_detect(data=X_train,threshold=0.9)

0 константные переменные


In [55]:
X_train['dummy'] = np.floor(X_train['worst smoothness']*10)
X_train.dummy.value_counts() / np.float(len(X_train))

1.0    0.923077
0.0    0.068132
2.0    0.008791
Name: dummy, dtype: float64

In [56]:
quasi_constant_feature = constant_feature_detect(data=X_train,threshold=0.9)
quasi_constant_feature

1 константные переменные


['dummy']

In [57]:
X_train.drop(labels=quasi_constant_feature,axis=1,inplace=True)
print(X_train.shape)

(455, 30)


Корреляционная фильтрация

In [58]:
def corr_feature_detect(data,threshold=0.8):
    
    corrmat = data.corr()
    corrmat = corrmat.abs().unstack() 
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = corrmat[corrmat < 1] 
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']
   
    grouped_feature_ls = []
    correlated_groups = []
    
    for feature in corrmat.feature1.unique():
        if feature not in grouped_feature_ls:
    
            correlated_block = corrmat[corrmat.feature1 == feature]
            grouped_feature_ls = grouped_feature_ls + list(
                correlated_block.feature2.unique()) + [feature]
    
            correlated_groups.append(correlated_block)
    return correlated_groups

In [59]:
corr = corr_feature_detect(data=X_train,threshold=0.9)
for i in corr:
    print(i,'\n')

          feature1         feature2      corr
0   mean perimeter      mean radius  0.998185
6   mean perimeter        mean area  0.986692
14  mean perimeter  worst perimeter  0.970507
19  mean perimeter     worst radius  0.969520
33  mean perimeter       worst area  0.941920 

           feature1      feature2      corr
12  perimeter error  radius error  0.978323
30  perimeter error    area error  0.944995 

          feature1             feature2      corr
36  mean concavity  mean concave points  0.914627 

        feature1       feature2      corr
38  mean texture  worst texture  0.908182 

                feature1             feature2      corr
40  worst concave points  mean concave points  0.906312 



Взаимная информация

In [60]:
def mutual_info(X,y,select_k=10):

    if select_k >= 1:
        sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X,y)
        col = X.columns[sel_.get_support()]
        
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(mutual_info_classif, percentile=select_k*100).fit(X,y)
        col = X.columns[sel_.get_support()]   
        
    else:
        raise ValueError("select_k должно быть положительным значением")
    
    return col

In [61]:
mi = mutual_info(X=X_train,y=y_train,select_k=3)
print(mi)

Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')


In [62]:
mi = mutual_info(X=X_train,y=y_train,select_k=0.2)
print(mi)

Index(['mean perimeter', 'mean concave points', 'worst radius',
       'worst perimeter', 'worst area', 'worst concave points'],
      dtype='object')


Хи-квадрат тест

In [63]:
def chi_square_test(X,y,select_k=10):

    if select_k >= 1:
        sel_ = SelectKBest(chi2, k=select_k).fit(X,y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(chi2, percentile=select_k*100).fit(X,y)
        col = X.columns[sel_.get_support()]   
    else:
        raise ValueError("select_k должно быть положительным значением")  
    
    return col

In [64]:
chi = chi_square_test(X=X_train,y=y_train,select_k=3)
print(chi)

Index(['mean area', 'area error', 'worst area'], dtype='object')


In [65]:
chi = chi_square_test(X=X_train,y=y_train,select_k=0.2)
print(chi)

Index(['mean perimeter', 'mean area', 'area error', 'worst radius',
       'worst perimeter', 'worst area'],
      dtype='object')


Одномерный ROC-AUC или MSE анализ

In [66]:
def univariate_roc_auc(X_train,y_train,X_test,y_test,threshold):

    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))
    print(len(roc_values[roc_values > threshold]), len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col

In [67]:
uni_roc_auc = univariate_roc_auc(X_train=X_train,y_train=y_train,
                                   X_test=X_test,y_test=y_test,threshold=0.8)
print(uni_roc_auc)

worst perimeter            0.917275
worst area                 0.895840
worst radius               0.893458
worst concave points       0.863131
mean concavity             0.856939
mean radius                0.849000
mean area                  0.839314
worst concavity            0.831375
mean perimeter             0.829628
mean concave points        0.826453
area error                 0.812321
worst compactness          0.742299
radius error               0.740235
mean compactness           0.734360
perimeter error            0.680534
worst texture              0.647666
worst fractal dimension    0.640997
concavity error            0.640203
worst symmetry             0.620991
concave points error       0.618133
compactness error          0.607336
mean symmetry              0.591775
mean texture               0.573357
texture error              0.568593
worst smoothness           0.565100
mean smoothness            0.557637
fractal dimension error    0.542077
smoothness error           0

In [68]:
def univariate_mse(X_train,y_train,X_test,y_test,threshold):

    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values)
    mse_values.index = X_train.columns
    print(mse_values.sort_values(ascending=False))
    print(len(mse_values[mse_values > threshold]), len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col   

In [69]:
uni_mse = univariate_mse(X_train=X_train,y_train=y_train,
                            X_test=X_test,y_test=y_test,threshold=0.4)
print(uni_mse)

mean fractal dimension     0.491228
symmetry error             0.480750
fractal dimension error    0.456140
smoothness error           0.449561
texture error              0.412281
worst smoothness           0.403265
mean smoothness            0.399123
mean texture               0.396930
mean symmetry              0.363060
compactness error          0.361842
concave points error       0.357456
worst fractal dimension    0.355263
worst symmetry             0.350877
worst texture              0.333333
concavity error            0.333333
perimeter error            0.300439
mean compactness           0.258772
worst compactness          0.254386
radius error               0.245614
area error                 0.179825
mean perimeter             0.166667
mean concave points        0.166667
worst concavity            0.162281
mean radius                0.146930
mean concavity             0.142544
mean area                  0.140351
worst concave points       0.123782
worst area                 0

## Oversampling/undersampling 

In [70]:
len(y_train)

455

In [71]:
sm = SMOTE(random_state=42)

In [72]:
X_res, y_res = sm.fit_resample(X_train, y_train)

In [73]:
len(y_res)

580

In [74]:
cn = CondensedNearestNeighbour(random_state=42)

In [75]:
X_res, y_res = cn.fit_resample(X_train, y_train)

In [76]:
len(y_res)

205