## TITANIC - логистическая регрессия с пайплайном

    Цель — поработать с tf.data.
    У нас есть csv-файл с кучей всего; хотим сделать из него датасет с числовыми и категориальными признаками
    (некий минимум).
    Потом на этом датасете обучить руками логистическую регрессию и посмотреть результат на тесте.

In [104]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score


In [105]:
# Load the training CSV with pandas to look at its columns; file_path is reused
# further down when the tf.data dataset is built.
file_path = './data/titanic-train.csv'
df = pd.read_csv(file_path)
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [106]:
# Summary statistics of the numeric columns. The Age and Fare mean / std printed
# here are the constants hard-coded in the feature-column normalizers further down.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [107]:
def get_dataset(file_path, batch_size=16):
    """Build a batched tf.data dataset from the Titanic CSV.

    Args:
        file_path: path (or file pattern) of the CSV file to read.
        batch_size: number of rows per batch. Defaults to 16, the value that
            used to be hard-coded, so existing calls behave as before.

    Returns:
        A dataset whose elements are (features, label) pairs: features maps
        each of the selected columns Pclass, Sex, Age and Fare to a batch of
        values, and label is the matching batch of Survived values.
        num_epochs=1 means one pass over the dataset reads the file once.
    """
    ds = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name='Survived',
        num_epochs=1,
        select_columns=['Pclass', 'Sex', 'Age', 'Fare', 'Survived']
    )
    return ds

In [108]:
ds = get_dataset(file_path)
# Each element of the dataset is a (features, label) pair; features is a dict-like
# mapping (ordered dict) from column name to the array of that column's values in the batch.
# A dataset has to be iterated: `batch, label = ds.take(1)` does not work.

In [109]:
# Pull a single batch out of the dataset to inspect it.
batch, label = next(iter(ds))

print(label.numpy())
# Values in the batch are tensors looked up by column name (one-dimensional for now).
print(batch['Pclass'].numpy())
print(batch['Sex'].numpy())

[0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0]
[3 2 2 3 3 2 2 1 1 1 3 2 3 1 1 2]
[b'male' b'male' b'female' b'male' b'male' b'female' b'male' b'male'
 b'male' b'female' b'female' b'male' b'female' b'male' b'male' b'male']


In [110]:
# Keep it simple: build the categorical and the numeric columns one by one,
# then pack all of them into a single DenseFeatures layer.
CATEGORIES = {
    'sex': ['male', 'female'],
    'class' : [1, 2, 3],
}


def _standardizer(mean, std):
    """Return a normalizer_fn that computes (x - mean) / std."""
    return lambda x: (x-mean)/ std


def _one_hot_column(key, vocabulary):
    """Return an indicator (one-hot) column for feature `key` over `vocabulary`."""
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        key, vocabulary_list=vocabulary)
    return tf.feature_column.indicator_column(categorical)


# The constants are the Age / Fare mean and std reported by df.describe().
# NOTE(review): the -2.044479 entries in the printed batch equal
# (0 - 29.699118) / 14.526497, so rows without an Age apparently reach this
# column as 0 - confirm, and consider imputing them explicitly.
age = tf.feature_column.numeric_column(
    'Age', normalizer_fn=_standardizer(29.699118, 14.526497), shape=[1])
fare = tf.feature_column.numeric_column(
    'Fare', normalizer_fn=_standardizer(32.204208, 49.693429), shape=[1])

sex = _one_hot_column('Sex', CATEGORIES['sex'])
cls = _one_hot_column('Pclass', CATEGORIES['class'])

input_layer = tf.keras.layers.DenseFeatures((sex, age, fare, cls))

In [112]:
# A layer is what a batch is passed through: calling it on the feature dict
# yields one dense matrix with a row per example.
print(input_layer(batch).numpy(), label.numpy()) 

[[-2.044479    0.7515238   0.          0.          1.          1.
   0.        ]
 [-2.044479   -0.6480577   0.          1.          0.          1.
   0.        ]
 [-0.87420374 -0.43676215  0.          1.          0.          0.
   1.        ]
 [ 0.8467893  -0.47373885  0.          0.          1.          1.
   0.        ]
 [-0.66768456 -0.47373885  0.          0.          1.          1.
   0.        ]
 [-0.7365243  -0.12484968  0.          1.          0.          0.
   1.        ]
 [-0.32348594 -0.3864537   0.          1.          0.          1.
   0.        ]
 [ 0.36491126 -0.11378183  1.          0.          0.          1.
   0.        ]
 [ 1.2598276   0.39835835  1.          0.          0.          1.
   0.        ]
 [ 0.57143044  0.78640366  1.          0.          0.          0.
   1.        ]
 [-1.9067996  -0.08661525  0.          0.          1.          0.
   1.        ]
 [-2.044479   -0.3864537   0.          1.          0.          1.
   0.        ]
 [-1.4249215   0.04368368  0

## Простая модель на Keras без выкрутасов 
    просто чтобы понять, как этот слой работает

In [113]:
# A small Keras MLP, used only to see how the DenseFeatures layer plugs into a
# model: the layer turns the feature dict into a dense matrix, the Dense stack
# ends in a single sigmoid unit for the binary Survived label.
model = tf.keras.models.Sequential([
    input_layer,
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


model.compile(loss='binary_crossentropy',
              # `learning_rate` is the documented argument name; `lr` is only a
              # deprecated alias of it.
              optimizer=RMSprop(learning_rate=0.001),
              metrics=['accuracy'])

# The dataset already yields (features, label) batches, so it is passed as is.
history = model.fit(ds, epochs=2)

Epoch 1/2
Epoch 2/2


In [114]:
# Peek at the test CSV. Note that this rebinds file_path and df to the test file.
file_path = './data/titanic-test.csv'
df = pd.read_csv(file_path)
df.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [13]:
# the test set does not have to be turned into a dataset - it just has to be fed to the model somehow

## Ок, теперь просто хочу подготовить данные и сделать руками Л.Р. 

In [115]:
# Start over with plain pandas (this rebinds `ds` to a DataFrame): keep the
# label plus the four feature columns and drop every row that has a missing
# value in any of them.
feature_names = ['Pclass', 'Sex', 'Age', 'Fare']

ds = pd.read_csv('data/titanic-train.csv')
ds = ds[['Survived'] + feature_names].dropna()

label = ds['Survived']
data = ds[feature_names]

data.head(2)


Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833


In [116]:
# Pclass and Sex are one-hot encoded. The fitted encoders are kept so that the
# test set can later be transformed with exactly the same categories.
# NOTE(review): `sparse=` and `get_feature_names()` belong to the older
# scikit-learn API this notebook was run with; newer releases are believed to
# have renamed them - check before upgrading.
pclass_encoder = OneHotEncoder(sparse=False)
sex_encoder = OneHotEncoder(sparse=False)

pclass = pclass_encoder.fit_transform(data[['Pclass']])
sex = sex_encoder.fit_transform(data[['Sex']])

pclass_frame = pd.DataFrame(pclass, index=data.index,
                            columns=pclass_encoder.get_feature_names())
sex_frame = pd.DataFrame(sex, index=data.index,
                         columns=sex_encoder.get_feature_names())

# Append the indicator columns, then drop the raw categorical ones.
data_one_hot = data.join(pclass_frame).join(sex_frame)
data_one_hot = data_one_hot.drop(columns=['Pclass', 'Sex'])


# Standardize the numeric features; the statistics are kept in age_mean,
# fare_mean, age_std and fare_std so they can be applied to the test set too.
numeric_stats = data_one_hot[['Age', "Fare"]]
age_mean, fare_mean = numeric_stats.mean()
age_std, fare_std = numeric_stats.std()

data_one_hot['Age'] = (data_one_hot['Age'] - age_mean) / age_std
data_one_hot['Fare'] = (data_one_hot['Fare'] - fare_mean) / fare_std

data_one_hot.head()

Unnamed: 0,Age,Fare,x0_1,x0_2,x0_3,x0_female,x0_male
0,-0.530005,-0.518614,0.0,0.0,1.0,0.0,1.0
1,0.57143,0.691412,1.0,0.0,0.0,1.0,0.0
2,-0.254646,-0.505859,0.0,0.0,1.0,1.0,0.0
3,0.364911,0.347805,1.0,0.0,0.0,1.0,0.0
4,0.364911,-0.503497,0.0,0.0,1.0,0.0,1.0


In [117]:
# Sanity check: features and labels must have the same number of rows.
data_one_hot.shape, label.shape

((714, 7), (714,))

## Модель
    Делаем и обучаем регрессию. Всё по-простому, никаких батчей:
    просто делаем градиентный спуск по всей выборке.

In [227]:
class MyRegression():
    """Binary logistic regression trained by plain full-batch gradient descent.

    The model is ``sigmoid(w @ X^T + b)``. ``w`` and ``b`` are ``tf.Variable``s
    that are updated by hand from ``tf.GradientTape`` gradients (no optimizer
    object, no mini-batches).

    Attributes:
        w: weights, created with the ``shape`` given to ``__init__``.
        b: bias of shape (1, 1).
        costs: the cost of every epoch run so far by ``fit`` / ``fit__``, in
            order. The list keeps growing across calls.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before the logarithms are
    # taken, so a saturated sigmoid cannot produce log(0) and a NaN cost.
    _EPS = 1e-7

    def __init__(self, shape=(1, 7)):
        """Create small random weights of ``shape`` and a zero bias.

        Args:
            shape: shape of ``w``. The forward pass computes ``w @ X^T``, so
                its second entry must equal the number of feature columns.
        """
        self.w = tf.Variable(np.random.randn(*shape) * 0.01, dtype=tf.float32, name='w')
        self.b = tf.Variable(tf.zeros((1, 1)), dtype=tf.float32, name='b')
        self.costs = []

    def forward_pass_(self, X):
        """Return the predicted probabilities ``sigmoid(w @ X^T + b)``."""
        return tf.math.sigmoid(self.w @ tf.transpose(X) + self.b)

    def cost_(self, X, y):
        """Return ``(cost, y_hat)`` for features ``X`` and 0/1 labels ``y``.

        ``cost`` is the binary cross-entropy summed over the samples and
        divided by ``X.shape[0]``; ``y_hat`` are the squeezed (unclipped)
        probabilities from the forward pass.
        """
        y_hat = tf.squeeze(self.forward_pass_(X))
        # Clip only inside the loss; callers still get the raw probabilities.
        p = tf.clip_by_value(y_hat, self._EPS, 1. - self._EPS)
        cost = -1. / X.shape[0] * tf.reduce_sum(
            tf.math.log(1. - p) * (1. - y) + tf.math.log(p) * y)
        return cost, y_hat

    def report_(self, epoch, cost, y_hat, y):
        """Print the cost and the training accuracy of one epoch."""
        y_hat_rnd = np.round(y_hat.numpy())
        # accuracy_score expects (y_true, y_pred), in that order.
        acc = accuracy_score(np.asarray(y), y_hat_rnd)
        print('epoch: {} cost: {} accuracy: {}'.format(epoch, cost.numpy(), acc))

    def fit(self, X, y, epochs=10000, lr=0.005):
        """Train ``w`` and ``b`` in place with full-batch gradient descent.

        Args:
            X: float feature tensor with one row per sample.
            y: float tensor of 0/1 labels, one per sample.
            epochs: number of gradient steps.
            lr: learning rate.

        Progress is printed every 1000 epochs; the printed cost and accuracy
        are the ones computed before that epoch's update. Every epoch's cost
        is appended to ``self.costs``.
        """
        for epoch in range(epochs):
            with tf.GradientTape() as tape:
                # As it turned out, the tape copes fine with part of the
                # computation living in other methods.
                cost, y_hat = self.cost_(X, y)
            grads = tape.gradient(cost, [self.w, self.b])
            self.w.assign_sub(lr * grads[0])
            self.b.assign_sub(lr * grads[1])
            self.costs.append(float(cost.numpy()))

            if epoch % 1000 == 0:
                self.report_(epoch, cost, y_hat, y)

    def fit__(self, X, y, epochs=300, lr=0.025):
        """Variant of ``fit`` with the whole computation written inline.

        It was written separately because of a suspicion that GradientTape
        does not work across function calls, so the forward pass and the cost
        are deliberately not delegated to the other methods. It starts from
        freshly drawn local weights (same shape as ``self.w``) and copies them
        into ``self.w`` / ``self.b`` once training is done. Progress is
        printed every 100 epochs; every epoch's cost goes to ``self.costs``.
        """
        # Use the shape of the model's own weights instead of a fixed (1, 7),
        # so that models built with a different ``shape`` work here as well.
        w = tf.Variable(np.random.randn(*self.w.shape.as_list()) * 0.1,
                        dtype=tf.float32, name='wl')
        b = tf.Variable(tf.zeros((1, 1)), dtype=tf.float32, name='bl')

        for epoch in range(epochs):
            with tf.GradientTape() as tape:
                y_hat = tf.squeeze(tf.math.sigmoid(w @ tf.transpose(X) + b))
                p = tf.clip_by_value(y_hat, self._EPS, 1. - self._EPS)
                cost = -1. / X.shape[0] * tf.reduce_sum(
                    tf.math.log(1. - p) * (1. - y) + tf.math.log(p) * y)
            grads = tape.gradient(cost, [w, b])
            self.costs.append(float(cost.numpy()))

            if epoch % 100 == 0:
                self.report_(epoch, cost, y_hat, y)

            w.assign_sub(lr * grads[0])
            b.assign_sub(lr * grads[1])
        self.w.assign(w)
        self.b.assign(b)

    def predict(self, X):
        """Return 0/1 predictions for ``X`` as an int32 column (one row per sample)."""
        y_hat = self.forward_pass_(X)
        return np.round(tf.transpose(y_hat).numpy()).astype(np.int32)

In [228]:
# Rebinds `model` (until now the Keras network) to the hand-written regression.
model =  MyRegression()

In [229]:
# Train on the whole prepared frame at once; features and labels are converted to float32 tensors.
model.fit(tf.constant(data_one_hot, dtype=tf.float32), tf.constant(label, dtype=tf.float32))

epoch: 0 cost: 0.6903665661811829 accuracy: 0.6442577030812325
epoch: 1000 cost: 0.5273525714874268 accuracy: 0.7829131652661064
epoch: 2000 cost: 0.4912027418613434 accuracy: 0.7899159663865546
epoch: 3000 cost: 0.47661858797073364 accuracy: 0.788515406162465
epoch: 4000 cost: 0.4693169891834259 accuracy: 0.7871148459383753
epoch: 5000 cost: 0.46505624055862427 accuracy: 0.788515406162465
epoch: 6000 cost: 0.4622699022293091 accuracy: 0.788515406162465
epoch: 7000 cost: 0.4602976143360138 accuracy: 0.7899159663865546
epoch: 8000 cost: 0.4588284492492676 accuracy: 0.7913165266106442
epoch: 9000 cost: 0.45769938826560974 accuracy: 0.7955182072829131


## Готовим тестовый датасет


In [230]:
# Look at the raw test set first (it has no Survived column).
ds_test = pd.read_csv('data/titanic-test.csv')
ds_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [231]:
# Build the test features. Rows cannot be dropped here (every passenger needs
# a prediction), so missing Age / Fare values are filled with the training
# means: after standardization they become 0, a neutral value. Filling with a
# literal 0 would present them as age 0 / fare 0, far from anything typical,
# while the training rows with missing values were simply dropped.
ds_test = pd.read_csv('data/titanic-test.csv')
ds_test = ds_test[['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare']].fillna(
    {'Age': age_mean, 'Fare': fare_mean})
# Passenger ids are kept aside for the result file.
ds_res =  ds_test[['PassengerId']]
data_test = ds_test[['Pclass', 'Sex', 'Age', 'Fare']]

data_test.head(2)

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0


In [232]:
# Encode the test set with the encoders fitted on the training data
# (transform only, no refit), so the indicator columns line up with training.
pclass = pclass_encoder.transform(data_test[['Pclass']])
sex = sex_encoder.transform(data_test[['Sex']])

pclass_frame = pd.DataFrame(pclass, index=data_test.index,
                            columns=pclass_encoder.get_feature_names())
sex_frame = pd.DataFrame(sex, index=data_test.index,
                         columns=sex_encoder.get_feature_names())

# Append the indicator columns, then drop the raw categorical ones.
data_one_hot_test = data_test.join(pclass_frame).join(sex_frame)
data_one_hot_test = data_one_hot_test.drop(columns=['Pclass', 'Sex'])


# Standardize the numeric features with the training mean / std.
data_one_hot_test['Age'] = (data_one_hot_test['Age'] - age_mean) / age_std
data_one_hot_test['Fare'] = (data_one_hot_test['Fare'] - fare_mean) / fare_std

data_one_hot_test.head()

Unnamed: 0,Age,Fare,x0_1,x0_2,x0_3,x0_female,x0_male
0,0.330491,-0.507669,0.0,0.0,1.0,0.0,1.0
1,1.190988,-0.523339,0.0,0.0,1.0,1.0,0.0
2,2.223584,-0.472553,0.0,1.0,0.0,0.0,1.0
3,-0.185807,-0.491923,0.0,0.0,1.0,0.0,1.0
4,-0.530005,-0.423422,0.0,0.0,1.0,1.0,0.0


In [233]:
# Predict the test set; the result is a column of 0/1 labels, one row per passenger.
y_res = model.predict(tf.constant(data_one_hot_test, dtype=tf.float32))
y_res.shape

(418, 1)

In [234]:
# Attach the predictions to the passenger ids and write the result file
# (index=False keeps the row index out of the CSV).
result =  ds_res.join(pd.DataFrame(y_res,index = ds_res.index, columns=['Survived']))
result.to_csv('data/titanic-res.csv', index=False)

In [235]:
# Quick look at the first rows that were written.
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
