### Строим логистическую регрессию - угадываем пол спортсмена по признакам

https://www.kaggle.com/rio2016/olympic-games

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the athletes table from the Kaggle Rio 2016 dataset linked above
# and preview its first rows.
data = pd.read_csv( 'athletes.csv' )
data.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11538 entries, 0 to 11537
Data columns (total 11 columns):
id             11538 non-null int64
name           11538 non-null object
nationality    11538 non-null object
sex            11538 non-null object
dob            11537 non-null object
height         11208 non-null float64
weight         10879 non-null float64
sport          11538 non-null object
gold           11538 non-null int64
silver         11538 non-null int64
bronze         11538 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 991.6+ KB


#### Попробуем угадать пол на основе роста, веса и вида спорта

Посмотрим, много ли в наших признаках пустых значений

In [4]:
data[ pd.isnull( data['height'] ) ].head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
12,258556239,Abbas Qali,IOA,male,10/11/92,,,aquatics,0,0,0
47,469953606,Abdoullah Bamoussa,ITA,male,6/8/86,,,athletics,0,0,0
50,325809293,Abdul Omar,GHA,male,10/3/93,,,boxing,0,0,0
52,262868423,Abdulaziz Alshatti,IOA,male,10/30/90,,,fencing,0,0,0
56,897549624,Abdullah Hel Baki,BAN,male,8/1/89,,,shooting,0,0,0


In [5]:
# Report how many rows lack each candidate feature, plus the total row count.
print ('Для height пустых строк {}'.format( data['height'].isnull().sum() ))
print ('Для weight пустых строк {}'.format( data['weight'].isnull().sum() ))
print ('Для sport пустых строк {}'.format( data['sport'].isnull().sum() ))
print ('Всего строк в наборе {}'.format( data.shape[0] ))

Для height пустых строк 330
Для weight пустых строк 659
Для sport пустых строк 0
Всего строк в наборе 11538


In [6]:
data['height'].unique()

array([1.72, 1.68, 1.98, 1.83, 1.81, 1.8 , 2.05, 1.93, 1.65, 1.7 , 1.75,
        nan, 1.61, 1.78, 1.76, 2.1 , 1.73, 1.85, 1.77, 1.9 , 1.86, 1.74,
       1.6 , 2.07, 1.88, 1.66, 1.62, 1.87, 2.03, 1.69, 1.82, 1.89, 1.94,
       1.95, 1.71, 1.84, 1.91, 1.67, 2.02, 1.58, 1.63, 1.79, 1.97, 1.56,
       1.55, 1.57, 1.46, 1.92, 1.64, 1.53, 1.99, 1.96, 2.  , 2.04, 1.47,
       1.52, 2.01, 1.51, 1.59, 2.08, 1.37, 1.5 , 1.45, 2.06, 1.54, 2.11,
       1.43, 1.49, 1.33, 1.48, 1.44, 2.13, 2.09, 2.21, 2.18, 1.21, 1.38,
       1.34, 2.15, 2.17, 1.42, 1.4 , 2.14])

In [7]:
# Drop the rows where height or weight is missing.
# (Rows are removed, not zero-filled; the original row index is kept.)
data = data.dropna( subset=['height', 'weight'] )

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10858 entries, 0 to 11537
Data columns (total 11 columns):
id             10858 non-null int64
name           10858 non-null object
nationality    10858 non-null object
sex            10858 non-null object
dob            10858 non-null object
height         10858 non-null float64
weight         10858 non-null float64
sport          10858 non-null object
gold           10858 non-null int64
silver         10858 non-null int64
bronze         10858 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 1017.9+ KB


In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
# The target variable (the `sex` column) is categorical.
# Encode its values as integers, keeping it as a single column.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [11]:
le.fit( data['sex'] )

LabelEncoder()

In [12]:
le.classes_

array(['female', 'male'], dtype=object)

In [13]:
# Example of encoding `sex` labels: 'female' maps to 0 and 'male' to 1.

le.transform( [ 'male', 'female', 'male' ] )

array([1, 0, 1])

In [14]:
# Overwrite the `sex` column in place with its integer encoding
# (the target `y` is taken from this column further down).
data['sex'] = le.transform( data['sex'] ) 

In [15]:
# Keep only the rows that have both height and weight. The original cell
# filtered on height twice; weight is the evident intended second column.
data = data.dropna( subset=['height', 'weight'] )

In [16]:
def get_woe_v1(df_train, col, target_col):
    """Add a weight-of-evidence (WoE) encoding of a categorical column.

    For every category of ``col`` the WoE is
    ``log((n_pos_cat / n_pos_total) / (n_neg_cat / n_neg_total))``, where
    "pos" rows have ``target_col == 1`` and "neg" rows have
    ``target_col == 0``. A category that occurs in only one of the two
    classes gets NaN, because the per-class counts are aligned by category
    before the division.

    The result is written to a new column named ``col + '_woe'``.

    Args:
        df_train: DataFrame holding ``col`` and ``target_col``; it is
            modified in place.
        col: Name of the categorical column to encode.
        target_col: Name of the binary (0/1) target column.

    Returns:
        The same ``df_train`` object, with the extra ``*_woe`` column.
    """
    # Split the column's values by target class once and reuse the pieces.
    positive_values = df_train.loc[df_train[target_col] == 1, col]
    negative_values = df_train.loc[df_train[target_col] == 0, col]
    n_positive = len(positive_values)
    n_negative = len(negative_values)

    # Per-category count ratio, then normalised by the class sizes.
    count_ratio = positive_values.value_counts() / negative_values.value_counts()
    normalised_odds = count_ratio / n_positive * n_negative

    woe_by_category = np.log(normalised_odds).to_dict()
    df_train[col + '_woe'] = df_train[col].apply(woe_by_category.get)
    return df_train

In [17]:
data.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,1,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,0,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,1,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,1,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,1,11/26/90,1.81,71.0,cycling,0,0,0


In [18]:
# Categorical columns to be encoded as weight-of-evidence (WoE) features.
# NOTE(review): `name` and `id` look unique per athlete, and their WoE is NaN
# in the preview rows below, so confirm they are worth encoding at all.
col_get_prob = ['nationality', 'dob', 'name', 'sport', 'id']

In [19]:
# Add a `<col>_woe` column to `data` for each listed categorical column.
# get_woe_v1 modifies `data` in place, so its return value is not needed.
for col in col_get_prob:
    get_woe_v1(data, col, 'sex')

In [20]:
data.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,nationality_woe,dob_woe,name_woe,sport_woe,id_woe
0,736041664,A Jesus Garcia,ESP,1,10/17/69,1.72,64.0,athletics,0,0,0,-0.030642,,,-0.091239,
1,532037425,A Lam Shin,KOR,0,9/23/86,1.68,56.0,fencing,0,0,0,-0.208643,,,-0.184344,
2,435962603,Aaron Brown,CAN,1,5/27/92,1.98,79.0,athletics,0,0,1,-0.536483,0.245612,,-0.091239,
3,521041435,Aaron Cook,MDA,1,1/2/91,1.83,80.0,taekwondo,0,0,0,0.28198,-0.159853,,-0.143853,
4,33922579,Aaron Gate,NZL,1,11/26/90,1.81,71.0,cycling,0,0,0,-0.179085,,,0.310798,


In [21]:
# Keep every column except the raw categorical ones listed in col_get_prob
# (their `*_woe` counterparts have different names and are retained).
correct_cols = [column for column in data.columns.tolist() if column not in col_get_prob]

In [22]:
data = data[correct_cols]

In [23]:
# Replace the remaining NaN values (the WoE columns contain them) with 0.
data = data.fillna(0)

In [25]:
X = data.drop('sex', axis=1)

In [26]:
y = data['sex']

In [27]:
model = LogisticRegression()

In [28]:
# Fit the model on the full dataset and get class probabilities for the same rows.

model.fit( X, y )
predictions = model.predict_proba( X )

In [29]:
predictions[:5]

array([[0.61794728, 0.38205272],
       [0.82427447, 0.17572553],
       [0.20376629, 0.79623371],
       [0.18702571, 0.81297429],
       [0.36465154, 0.63534846]])

In [30]:
# Compare the actual labels with the predictions.
# (Evaluating on the training data itself is not ideal; a train/test split would be better.)
tr = 0.5
# A row counts as correct when the class-1 probability falls on the
# actual label's side of the threshold.
counter = sum(
    1
    for probability, actual in zip( predictions[:, 1], y )
    if ( actual == 1 and probability > tr ) or ( actual == 0 and probability < tr )
)
counter/len(y)

0.8004236507644134

In [31]:
model.score(X, y)

0.8004236507644134

In [32]:
X.head()

Unnamed: 0,height,weight,gold,silver,bronze,nationality_woe,dob_woe,name_woe,sport_woe,id_woe
0,1.72,64.0,0,0,0,-0.030642,0.0,0.0,-0.091239,0.0
1,1.68,56.0,0,0,0,-0.208643,0.0,0.0,-0.184344,0.0
2,1.98,79.0,0,0,1,-0.536483,0.245612,0.0,-0.091239,0.0
3,1.83,80.0,0,0,0,0.28198,-0.159853,0.0,-0.143853,0.0
4,1.81,71.0,0,0,0,-0.179085,0.0,0.0,0.310798,0.0


In [33]:
from sklearn.model_selection import train_test_split


In [34]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# NOTE(review): the `*_woe` features in X were computed from `sex` over all
# rows before this split, so the test rows' targets leak into the features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [35]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
model.coef_

array([[ 3.81485155,  0.08309778, -0.30743889, -0.33312894, -0.27904592,
         1.03552867,  0.93446244,  0.        ,  0.44346181,  0.        ]])

In [38]:
predictions = model.predict_proba(X_test)

In [41]:
from sklearn.metrics import roc_auc_score

In [43]:
roc_auc_score( y_test, predictions[:, 1] )

0.8800854905720678