### Строим логистическую регрессию - угадываем пол спортсмена по признакам

https://www.kaggle.com/rio2016/olympic-games

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the athletes table from the local CSV file (dataset: see the Kaggle link above)
data = pd.read_csv( 'athletes.csv' )
# Preview the first rows to check that the file was parsed as expected
data.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0


In [3]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11538 entries, 0 to 11537
Data columns (total 11 columns):
id             11538 non-null int64
name           11538 non-null object
nationality    11538 non-null object
sex            11538 non-null object
dob            11537 non-null object
height         11208 non-null float64
weight         10879 non-null float64
sport          11538 non-null object
gold           11538 non-null int64
silver         11538 non-null int64
bronze         11538 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 991.6+ KB


#### Попробуем угадать пол на основе роста, веса и вида спорта

Посмотрим, много ли в наших признаках пустых значений

In [3]:
# Peek at the first few rows whose height is missing
data[ data['height'].isnull() ].head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
12,258556239,Abbas Qali,IOA,male,10/11/92,,,aquatics,0,0,0
47,469953606,Abdoullah Bamoussa,ITA,male,6/8/86,,,athletics,0,0,0
50,325809293,Abdul Omar,GHA,male,10/3/93,,,boxing,0,0,0
52,262868423,Abdulaziz Alshatti,IOA,male,10/30/90,,,fencing,0,0,0
56,897549624,Abdullah Hel Baki,BAN,male,8/1/89,,,shooting,0,0,0


In [5]:
# Report how many values are missing in each candidate feature,
# followed by the total number of rows for comparison
print('Для height пустых строк {}'.format( data['height'].isnull().sum() ))
print('Для weight пустых строк {}'.format( data['weight'].isnull().sum() ))
print('Для sport пустых строк {}'.format( data['sport'].isnull().sum() ))
print('Всего строк в наборе {}'.format( data.shape[0] ))

Для height пустых строк 330
Для weight пустых строк 659
Для sport пустых строк 0
Всего строк в наборе 11538


In [7]:
# Distinct height values present in the data (NaN shows up among them when values are missing)
data['height'].unique()

array([1.72, 1.68, 1.98, 1.83, 1.81, 1.8 , 2.05, 1.93, 1.65, 1.7 , 1.75,
        nan, 1.61, 1.78, 1.76, 2.1 , 1.73, 1.85, 1.77, 1.9 , 1.86, 1.74,
       1.6 , 2.07, 1.88, 1.66, 1.62, 1.87, 2.03, 1.69, 1.82, 1.89, 1.94,
       1.95, 1.71, 1.84, 1.91, 1.67, 2.02, 1.58, 1.63, 1.79, 1.97, 1.56,
       1.55, 1.57, 1.46, 1.92, 1.64, 1.53, 1.99, 1.96, 2.  , 2.04, 1.47,
       1.52, 2.01, 1.51, 1.59, 2.08, 1.37, 1.5 , 1.45, 2.06, 1.54, 2.11,
       1.43, 1.49, 1.33, 1.48, 1.44, 2.13, 2.09, 2.21, 2.18, 1.21, 1.38,
       1.34, 2.15, 2.17, 1.42, 1.4 , 2.14])

In [8]:
# Drop every row where height or weight is missing.
# Note: the rows are removed entirely, they are not filled with zeros;
# the surviving rows keep their original index labels.
data = data.dropna( subset = [ 'height', 'weight' ] )

In [9]:
# Re-check the non-null counts after the rows with missing height/weight were removed
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10858 entries, 0 to 11537
Data columns (total 11 columns):
id             10858 non-null int64
name           10858 non-null object
nationality    10858 non-null object
sex            10858 non-null object
dob            10858 non-null object
height         10858 non-null float64
weight         10858 non-null float64
sport          10858 non-null object
gold           10858 non-null int64
silver         10858 non-null int64
bronze         10858 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 1017.9+ KB


In [10]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Pick a few candidate features together with the target column
selectedColumns = data[ [ 'height', 'sport', 'weight', 'sex' ] ]

# `sport` is a categorical variable: one-hot encode it into 0/1 indicator
# columns named after each sport.
# `sex` is the target variable, so it is dropped from the feature matrix.
X = pd.get_dummies( selectedColumns, columns = [ 'sport' ] ).drop( 'sex', axis = 1 )
X.head()

Unnamed: 0,height,weight,sport_aquatics,sport_archery,sport_athletics,sport_badminton,sport_basketball,sport_canoe,sport_cycling,sport_equestrian,...,sport_rugby sevens,sport_sailing,sport_shooting,sport_table tennis,sport_taekwondo,sport_tennis,sport_triathlon,sport_volleyball,sport_weightlifting,sport_wrestling
0,1.72,64.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.68,56.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.98,79.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.83,80.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.81,71.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# The target variable (the sex column) is categorical as well:
# convert its values to numbers, keeping it as a single column

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [14]:
# Learn the set of distinct labels in the sex column
le.fit( data['sex'] )

LabelEncoder()

In [15]:
# Labels known to the encoder; a label's position in this array is its integer code
le.classes_

array(['female', 'male'], dtype=object)

In [16]:
# Example of how sex labels are encoded as integers

le.transform( [ 'male', 'female', 'male' ] )

array([1, 0, 1])

In [17]:
# Store the encoded sex column in y.
# The row labels of `data` are reused as the index: rows were dropped from
# `data` earlier, so a fresh 0..n-1 index would no longer match the index of
# X and any index-aligned pandas operation between X and y would misalign.

y = pd.Series( data = le.transform( data['sex'] ), index = data.index )
y.head()

0    1
1    0
2    1
3    1
4    1
dtype: int64

In [19]:
# Logistic-regression classifier with the library's default hyperparameters
model = LogisticRegression()

In [20]:
# Train the model, then ask the fitted model for class probabilities
# on the very same rows it was trained on

predictions = model.fit( X, y ).predict_proba( X )



In [21]:
# First five rows of predicted probabilities; one column per class,
# in encoded class order (0 = female, 1 = male)
predictions[:5]

array([[0.55159553, 0.44840447],
       [0.88639828, 0.11360172],
       [0.0511122 , 0.9488878 ],
       [0.16373912, 0.83626088],
       [0.19629001, 0.80370999]])

In [24]:
# Compare the actual labels with the predicted probability of class 1
# (yes, evaluating on the training data itself is not great; splitting
# into training and test sets would have been better)

for class1_probability, actual in zip( predictions[:, 1], y ):
    print( ( class1_probability, actual ) )

(0.44840447301439834, 1)
(0.11360171820430219, 0)
(0.9488878007147108, 1)
(0.8362608847233862, 1)
(0.8037099884268356, 1)
(0.7403861088305197, 1)
(0.944861312434139, 1)
(0.9782963600043338, 1)
(0.5213871518887905, 0)
(0.16069978477712754, 0)
(0.3936043709406274, 1)
(0.5451541645202024, 1)
(0.08264513868849951, 0)
(0.4108410405781959, 0)
(0.3666567545934491, 0)
(0.25174789481427773, 1)
(0.4718234081355465, 1)
(0.5725360930476742, 0)
(0.8945543089171308, 1)
(0.2985901370247487, 1)
(0.9018336525313799, 1)
(0.5507400291091301, 1)
(0.8349374821768653, 1)
(0.8477711242258394, 1)
(0.5701266852982113, 1)
(0.5573109451815865, 1)
(0.8348029580234873, 1)
(0.7641686354912572, 1)
(0.8834061555450411, 1)
(0.6396546363193424, 1)
(0.9422285975622462, 1)
(0.9980117439620466, 1)
(0.24213826657227272, 1)
(0.7135046916253567, 1)
(0.4204100023881466, 1)
(0.9519208388739957, 1)
(0.5856993927500936, 1)
(0.3211104247961725, 1)
(0.3251483135436404, 1)
(0.16069978477712754, 1)
(0.20505325895503979, 1)
(0.931504

(0.8042049232847165, 1)
(0.6001132685419858, 1)
(0.49002297032166303, 1)
(0.9264287157443328, 1)
(0.7739132629160818, 1)
(0.18172139920593672, 1)
(0.9846132626845847, 1)
(0.9782052241963863, 1)
(0.9685128801829663, 1)
(0.8930477303444826, 1)
(0.6873999409471612, 1)
(0.47176153892217626, 1)
(0.9178338039532207, 1)
(0.2431947598518303, 1)
(0.9366010179615473, 1)
(0.8066751046854407, 1)
(0.5589806002944133, 1)
(0.8507179976124661, 1)
(0.8043626596992032, 1)
(0.9600069123051946, 1)
(0.6169076151196616, 1)
(0.566070989397015, 1)
(0.7987456491983974, 1)
(0.9789152267176768, 1)
(0.9988525552722098, 1)
(0.8140610721545196, 1)
(0.6599763478717413, 1)
(0.7327107990694517, 1)
(0.9176752045507129, 1)
(0.6135674539772331, 1)
(0.6395434238162866, 1)
(0.9409887431745882, 1)
(0.8484544934701806, 1)
(0.7414895660941656, 1)
(0.9405983415285835, 1)
(0.9276184906824653, 1)
(0.5339496667752809, 1)
(0.5936681848904161, 1)
(0.8700685693355502, 1)
(0.08264513868849951, 0)
(0.5715350199967731, 1)
(0.0352280161

(0.98248913510563, 1)
(0.08298892132617501, 0)
(0.32604622825492713, 0)
(0.32603618884958546, 0)
(0.2039129178087555, 0)
(0.7396146954422397, 1)
(0.9170956754143124, 1)
(0.4983270365798092, 1)
(0.5318959357408172, 1)
(0.9259872469160658, 1)
(0.9409862770829296, 1)
(0.7485638912724926, 1)
(0.26886658755857, 0)
(0.3734201300393676, 0)
(0.11317338087885855, 0)
(0.2376336126550889, 0)
(0.3042917322186633, 0)
(0.9469107392799971, 1)
(0.8453407352327443, 1)
(0.2858446172681744, 1)
(0.5282740232522092, 1)
(0.2939453547288222, 1)
(0.4098626669404079, 1)
(0.9767206948685829, 1)
(0.8233967199201533, 1)
(0.09004471341536602, 0)
(0.9787349952967228, 1)
(0.9533065601104501, 1)
(0.22367031885879404, 1)
(0.01573856378552691, 0)
(0.4714555055230467, 1)
(0.4661959799870272, 0)
(0.7924586546996142, 1)
(0.43063811311453326, 1)
(0.24539695778072873, 0)
(0.3958366369022059, 1)
(0.3225781152492735, 0)
(0.25687299885571296, 1)
(0.6148727644704906, 1)
(0.5075680297899711, 1)
(0.35092061864836627, 0)
(0.491459

(0.3310209073477129, 0)
(0.7039342160168018, 0)
(0.4208252517743267, 0)
(0.7586376357751379, 1)
(0.3697139158571663, 1)
(0.6988581959997515, 1)
(0.9719594353607298, 1)
(0.30388687744167603, 0)
(0.6875430662644899, 1)
(0.33630495617168765, 0)
(0.8751844355776036, 1)
(0.03830214259730398, 0)
(0.5368206636714022, 0)
(0.8066751046854407, 1)
(0.9879806442255055, 1)
(0.8457545602907284, 1)
(0.11703498738021866, 1)
(0.9296987531158224, 1)
(0.5742397038767547, 1)
(0.6424794319324543, 1)
(0.7109886337966053, 1)
(0.7490787261590753, 1)
(0.05402235200350447, 0)
(0.09194290091767883, 0)
(0.05591335688859477, 0)
(0.7150742704618058, 1)
(0.9989034012080781, 1)
(0.9848689651724555, 1)
(0.4304456530416553, 1)
(0.22443973686379884, 0)
(0.7308353154569389, 1)
(0.7790052469599309, 1)
(0.9986978257149816, 1)
(0.011583291984014425, 0)
(0.6265098600827403, 0)
(0.783805614539463, 1)
(0.31110469337628804, 1)
(0.411454686525612, 0)
(0.8491921795640865, 1)
(0.6460412119847798, 0)
(0.8700311114533406, 0)
(0.3412

(0.3710545120529384, 1)
(0.8475509253727653, 1)
(0.669795172289429, 1)
(0.19976957651667446, 0)
(0.4965713505286173, 1)
(0.27519237189691165, 1)
(0.13423897698826295, 0)
(0.801348119523694, 1)
(0.7026420111432133, 1)
(0.09767061090858026, 0)
(0.5961321636188283, 1)
(0.1615889908216806, 1)
(0.2011267825967654, 0)
(0.33778498802547763, 0)
(0.298687935639584, 0)
(0.7327107990694517, 1)
(0.9403882883776591, 1)
(0.4543153248420578, 0)
(0.07220520293850775, 0)
(0.1612458994504531, 1)
(0.22876036081200232, 0)
(0.4013500395457164, 0)
(0.5578467807322223, 0)
(0.9399653480044934, 1)
(0.3214723089118646, 1)
(0.20697855220557443, 0)
(0.4892311849622233, 0)
(0.5111337025156116, 0)
(0.3467436773865617, 0)
(0.09623881245990408, 1)
(0.10797394882017713, 0)
(0.6239197609351809, 0)
(0.628082017139666, 1)
(0.050415070860014226, 0)
(0.2512707458117258, 0)
(0.744747369198015, 1)
(0.1854159594848917, 0)
(0.5111337025156116, 1)
(0.5452374570551457, 0)
(0.29555582341619585, 1)
(0.7730447433049663, 0)
(0.05656

In [25]:
# Accuracy measured on the same data the model was trained on (no held-out set here)
model.score(X, y)

0.817277583348683

In [26]:
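## Second attempt: add nationality and medal counts as features, scale them, and evaluate on a held-out test set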

In [27]:
# Look at the filtered table again before choosing a wider set of features
data.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0


In [28]:
# List the available column names
data.columns

Index(['id', 'name', 'nationality', 'sex', 'dob', 'height', 'weight', 'sport',
       'gold', 'silver', 'bronze'],
      dtype='object')

In [29]:
# Narrow the table to candidate features plus the target (leaves out id, name and dob)
df = data[['nationality', 'sex', 'height', 'weight', 'sport', 'gold', 'silver', 'bronze']]

In [31]:
# Confirm the selected columns contain no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10858 entries, 0 to 11537
Data columns (total 8 columns):
nationality    10858 non-null object
sex            10858 non-null object
height         10858 non-null float64
weight         10858 non-null float64
sport          10858 non-null object
gold           10858 non-null int64
silver         10858 non-null int64
bronze         10858 non-null int64
dtypes: float64(2), int64(3), object(3)
memory usage: 763.5+ KB


In [32]:
# Feature matrix: every selected column except the target
X = data[['nationality', 'height', 'weight', 'sport', 'gold', 'silver', 'bronze']]
# Target: the raw sex labels
y = data['sex']

In [33]:
# Preview the raw feature matrix before encoding the categorical columns
X.head()

Unnamed: 0,nationality,height,weight,sport,gold,silver,bronze
0,ESP,1.72,64.0,athletics,0,0,0
1,KOR,1.68,56.0,fencing,0,0,0
2,CAN,1.98,79.0,athletics,0,0,1
3,MDA,1.83,80.0,taekwondo,0,0,0
4,NZL,1.81,71.0,cycling,0,0,0


In [34]:
# One-hot encode the two categorical features; numeric columns are left as they are
X = pd.get_dummies(X, columns=['nationality', 'sport'])

In [35]:
# Preview the encoded feature matrix (one indicator column per nationality and per sport)
X.head()

Unnamed: 0,height,weight,gold,silver,bronze,nationality_AFG,nationality_ALB,nationality_ALG,nationality_AND,nationality_ANG,...,sport_rugby sevens,sport_sailing,sport_shooting,sport_table tennis,sport_taekwondo,sport_tennis,sport_triathlon,sport_volleyball,sport_weightlifting,sport_wrestling
0,1.72,64.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.68,56.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.98,79.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.83,80.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.81,71.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [37]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split, and therefore the train/test
# scores computed from it, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Learn the scaling parameters from the training split only,
# then apply that same transformation to both splits
sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [39]:
# Fresh logistic-regression classifier (default hyperparameters) for the second experiment
lr = LogisticRegression()

In [40]:
# Train on the scaled training split only
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
# Accuracy on the training split
lr.score(X_train, y_train)

0.8469951646327424

In [43]:
# Accuracy on the held-out test split
lr.score(X_test, y_test)

0.8259668508287292