In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
import torch.nn as nn
import torch

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [5]:
# Load the training split from the working directory.
dataset = pd.read_csv('train.csv')

In [6]:
# Row count of the training frame.
len(dataset)

891

In [7]:
# Count the rows whose label is a valid 0/1 value. Taking len() of the boolean
# mask would always equal the row count, whatever the labels are.
int(dataset['Survived'].isin([0,1]).sum())

891

In [8]:
# Preview the first rows to see the available columns.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Extract the Survived column as the label array before it is dropped from the frame.
y_train = dataset['Survived'].values

In [10]:
# Number of labels; should match the row count of `dataset`.
len(y_train)

891

In [11]:
# Drop the label column in place so it is not used as a feature.
# Not idempotent: re-running this cell raises KeyError.
del dataset['Survived']

In [12]:
# The two deletions below are disabled, so PassengerId and Ticket stay in the
# feature set; only Name is removed.
# del dataset['PassengerId']

# del dataset['Ticket']

del  dataset['Name']

In [13]:
# Use every remaining row and column of `dataset` as the raw feature frame.
x_train = dataset.iloc[:,:]

In [14]:
def differentiate_features(x_train):
    """Split the columns of a DataFrame into discrete and continuous names.

    A column is continuous when its dtype is any signed integer, unsigned
    integer or floating-point type (dtype kind ``i``, ``u`` or ``f``). Every
    other dtype (object, bool, datetime, ...) is treated as discrete. Checking
    the dtype kind rather than exact ``int64``/``float64`` equality keeps
    narrower numeric columns such as ``int32`` or ``float32`` from being
    label-encoded by mistake.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    (list, list)
        ``(discrete_features, continuous_features)``, each in column order.
    """
    numeric_kinds = ('i', 'u', 'f')
    discrete_features = []
    continuous_features = []
    for column, dtype in zip(x_train, x_train.dtypes):
        if dtype.kind in numeric_kinds:
            continuous_features.append(column)
        else:
            discrete_features.append(column)
    return discrete_features,continuous_features

In [15]:
# Partition the column names by dtype; both lists are reused for the test split later.
discrete_features,continuous_features = differentiate_features(x_train)

In [16]:
def get_imputer(x_continuous):
    """Fit and return a column-wise mean imputer for the continuous features.

    NaN entries are the values treated as missing; the column means learned
    from ``x_continuous`` are what a later ``transform`` call fills in.
    """
    mean_imputer = Imputer(strategy = 'mean', missing_values = np.nan, axis = 0)
    mean_imputer.fit(x_continuous)
    return mean_imputer

In [17]:
# Fit the imputer on the raw values of the continuous columns only.
x_continuous = dataset.filter(continuous_features).values
imputer = get_imputer(x_continuous)



In [18]:
def get_data_from_imputer(imputer,x_continuous):
    """Apply an already-fitted imputer and return the filled data.

    Thin wrapper around ``imputer.transform`` so the train and test paths
    share one call site.
    """
    return imputer.transform(x_continuous)

In [19]:
# Replace missing continuous values with the fitted column means.
x_continuous = get_data_from_imputer(imputer,x_continuous)

In [20]:
# Sanity check: (rows, continuous columns).
x_continuous.shape

(891, 6)

In [21]:
def get_label_encoder(discrete_features,x_discrete):
    """Fit one LabelEncoder per discrete column and return them keyed by column.

    Side effect: missing values in each listed column of ``x_discrete`` are
    replaced with the empty string in place. The empty string is always part
    of the fitted vocabulary, so it can stand in for unknown values later.
    """
    encoders = {}
    for column in discrete_features:
        x_discrete[column] = x_discrete[column].fillna('')
        vocabulary = x_discrete[column].astype(str).unique()
        if '' not in vocabulary:
            # Reserve '' as the catch-all ("UNK") category.
            vocabulary = np.append(vocabulary, [''])
        encoder = LabelEncoder()
        encoder.fit(vocabulary)
        encoders[column] = encoder
    return encoders

In [22]:
def get_discrete_unique_features(discrete_features, data=None):
    """Collect the distinct values of each discrete column.

    Parameters
    ----------
    discrete_features : iterable of str
        Column names to inspect.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``dataset`` so existing single-argument calls keep working; passing it
        explicitly avoids depending on hidden global state.

    Returns
    -------
    dict
        Maps each column name to the array returned by ``Series.unique()``.
    """
    if data is None:
        data = dataset
    return {col: data[col].unique() for col in discrete_features}

In [23]:
# Vocabulary of each discrete column, read from `dataset` as loaded
# (missing values, if any, are still NaN here rather than '').
unique_discrete_features =  get_discrete_unique_features(discrete_features)

In [24]:
# Keep only the discrete columns for encoding.
x_discrete = dataset.filter(discrete_features)

In [25]:
# Fit one encoder per discrete column. This call also fills NaN in x_discrete with ''.
label_encoder_dict = get_label_encoder(discrete_features,x_discrete)

In [26]:
def get_discrete_encoder(label_encoder_dict,x_discrete):
    """Replace each encoded column of ``x_discrete`` with its integer codes.

    The frame is modified in place and also returned. Only columns that have
    an entry in ``label_encoder_dict`` are touched.
    """
    for column, encoder in label_encoder_dict.items():
        x_discrete[column] = encoder.transform(x_discrete[column])
    return x_discrete

In [27]:
# Turn the string categories into integer codes (x_discrete is updated in place).
x_discrete = get_discrete_encoder(label_encoder_dict,x_discrete)

In [28]:
# Number of discrete columns; falls back to 0 when x_discrete has no second axis.
try:
    n_discrete = x_discrete.shape[1]
except (AttributeError, IndexError):
    # Only "no shape" / "no column axis" mean "no discrete features".
    # Any other error (e.g. x_discrete not defined) should surface.
    n_discrete = 0
n_discrete

4

In [29]:
def generate_onehot_encoder(n_discrete, data=None):
    """Fit a OneHotEncoder over the first ``n_discrete`` columns.

    Parameters
    ----------
    n_discrete : int
        Number of leading columns to treat as categorical.
    data : array-like, optional
        Label-encoded discrete matrix to fit on. Defaults to the notebook-level
        ``x_discrete`` so existing single-argument calls keep working; passing
        it explicitly avoids depending on hidden global state.

    Returns
    -------
    OneHotEncoder
        The fitted encoder.
    """
    if data is None:
        data = x_discrete
    onehotencoder = OneHotEncoder(categorical_features = list(range(n_discrete)))
    onehotencoder.fit(data)
    return onehotencoder

In [30]:
# Fit the one-hot encoder on the label-encoded training columns.
onehotencoder = generate_onehot_encoder(n_discrete)



In [31]:
def get_onehot_encoder(onehotencoder,x_continuous,x_discrete):
    """One-hot encode the discrete block and append it to the continuous block.

    The encoder output is densified with ``toarray()``. The returned matrix
    has the continuous columns first, followed by the one-hot columns.
    """
    dense_discrete = onehotencoder.transform(x_discrete).toarray()
    return np.concatenate((x_continuous, dense_discrete), axis=1)

In [32]:
# Full training matrix: continuous columns followed by one-hot columns.
x = get_onehot_encoder(onehotencoder,x_continuous,x_discrete)

In [33]:
def get_standardscaler(x):
    """Fit a StandardScaler on ``x`` and return the fitted scaler."""
    scaler = StandardScaler()
    scaler.fit(x)
    return scaler

In [34]:
# Learn the scaling statistics from the training matrix only.
sc_X = get_standardscaler(x)

In [35]:
def get_normalized(sc_X,x):
    """Scale ``x`` with an already-fitted scaler and return the result."""
    return sc_X.transform(x)

In [36]:
# x_train is rebound here: it now holds the scaled matrix, not the DataFrame assigned earlier.
x_train = get_normalized(sc_X,x)

In [37]:
# Prepend a column of ones so the first weight acts as the bias term.
# Not idempotent: re-running this cell adds another ones column.
x_train = np.concatenate([np.ones((len(x_train),1)),x_train],axis=1)

In [38]:
# Width of the design matrix (bias column included); sets the length of the weight vector.
num_features = x_train.shape[1]
num_features

842

In [39]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Delegates to ``torch.sigmoid``. The hand-written form overflows in
    ``exp(-x)`` for large negative inputs; the forward value still comes out
    as 0, but the backward pass multiplies 0 by inf and yields NaN gradients.
    The built-in is stable in both directions.

    Parameters
    ----------
    x : torch.Tensor
        Logits of any shape.

    Returns
    -------
    torch.Tensor
        Probabilities in [0, 1], same shape as ``x``.
    """
    return torch.sigmoid(x)

In [40]:
def loss_fn(preds, actual, weights=None, reg=None, eps=1e-7):
    """Binary cross-entropy plus an L2 penalty on the weights.

    The data term is the summed cross-entropy divided by ``2 * len(preds)``;
    the penalty is ``reg * sum(weights**2) / (2 * len(weights))``. Both match
    the scaling used originally.

    Predictions are clamped to ``[eps, 1 - eps]`` before taking logs. Without
    this, a saturated prediction of exactly 0 or 1 produces ``log(0)`` and
    the loss becomes inf or NaN.

    Parameters
    ----------
    preds : torch.Tensor
        Predicted probabilities.
    actual : torch.Tensor
        Ground-truth labels (0 or 1), same shape as ``preds``.
    weights : torch.Tensor, optional
        Weights to regularise. Defaults to the notebook-level ``w``.
    reg : float, optional
        Regularisation strength. Defaults to the notebook-level ``rp``.
    eps : float, optional
        Clamp margin for the probabilities.

    Returns
    -------
    torch.Tensor
        Scalar loss.
    """
    if weights is None:
        weights = w
    if reg is None:
        reg = rp
    preds = preds.clamp(eps, 1 - eps)
    cross_entropy = ((-actual*(torch.log(preds)))+(-(1-actual)*torch.log(1-preds))).sum()
    data_term = cross_entropy/(2*len(preds))
    penalty = reg*torch.pow(weights,2).sum().div(2*len(weights))
    return data_term + penalty
        

In [41]:
# Row count of the design matrix; should still match the number of labels.
len(x_train)

891

In [42]:
# Training hyper-parameters.
epochs = 100   # full-batch gradient steps
SEED = 0       # fixes the initial weights so a Restart & Run All reproduces the loss curve
# A dedicated RandomState keeps the draw reproducible without touching NumPy's global RNG.
w = np.random.RandomState(SEED).uniform(low=-0.1,high=0.1,size=(num_features))
# Alternative initialisation, left disabled:
# w = np.random.randn((num_features))*np.sqrt(2/(1+num_features))
lr = 3e-1      # learning rate
rp = 1e-5      # L2 regularisation strength used by loss_fn

In [43]:
# Move everything to torch. `w` is rebound from the NumPy array to a Parameter
# so autograd tracks it; the data tensors are plain (no gradient needed).
w = nn.Parameter(torch.Tensor(w))
x_trn = torch.Tensor(x_train)
y_trn = torch.Tensor(y_train)

In [44]:
def fit():
    """Train the global weight vector ``w`` with full-batch gradient descent.

    Runs ``epochs`` steps over ``x_trn`` / ``y_trn``, printing the loss every
    tenth step. Each step back-propagates the loss, moves ``w`` by
    ``lr * w.grad`` in place, then clears the gradient.
    """
    for step in range(epochs):
        probabilities = sigmoid(x_trn@w)
        loss = loss_fn(probabilities,y_trn)
        should_report = (step%10==0)
        if should_report:
            print(loss.item())
        loss.backward()
        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            step_delta = lr*w.grad
            w.sub_(step_delta)
            w.grad.zero_()

In [45]:
def predict(data):
    """Return predicted probabilities for ``data`` using the global weights ``w``.

    Runs without gradient tracking, so the result carries no autograd graph.
    """
    with torch.no_grad():
        logits = data@w
        return sigmoid(logits)

In [46]:
def accuracy(y,preds):
    """Fraction of predictions that match the labels after thresholding at 0.5.

    Parameters
    ----------
    y : torch.Tensor
        Ground-truth labels (0. or 1.).
    preds : torch.Tensor or sequence of float
        Predicted probabilities; values >= 0.5 count as class 1.

    Returns
    -------
    torch.Tensor
        Scalar mean of the element-wise matches.
    """
    if not torch.is_tensor(preds):
        # Accept plain sequences as before; float64 keeps Python-float precision at the threshold.
        preds = torch.as_tensor(preds, dtype=torch.float64)
    # Threshold in one vectorised step instead of a Python loop over elements.
    # The result is flattened to one label per prediction.
    predicted = (preds.reshape(-1) >= 0.5).to(torch.float32)
    return (y==predicted).float().mean()

In [47]:
def precison_recall(tp,tn,fp,fn):
    """Per-class precision and recall from binary confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : int
        True-positive, true-negative, false-positive and false-negative counts.

    Returns
    -------
    tuple of float
        ``(precision_1, recall_1, precision_0, recall_0)``, where the class-0
        figures treat the negative class as the one being retrieved.

    A ratio whose denominator is zero (for example, no positive predictions at
    all) is reported as 0.0. Previously this raised ZeroDivisionError for
    Python ints and produced NaN for NumPy ints.
    """
    def _ratio(numerator, denominator):
        # Guard against an empty row or column of the confusion matrix.
        return numerator/denominator if denominator else 0.0

    return _ratio(tp,tp+fp),_ratio(tp,tp+fn),_ratio(tn,tn+fn),_ratio(tn,tn+fp)

In [48]:
# Train. The values printed below are the loss at every tenth epoch.
fit()

0.5061836838722229
0.22976742684841156
0.13888731598854065
0.09985268115997314
0.079193115234375
0.06664079427719116
0.05823798105120659
0.05221041664481163
0.0476636104285717
0.04410196468234062


In [49]:
# Training-set metrics.
pred_y = predict(x_trn)
# Threshold the probabilities at 0.5 in one vectorised step.
pred_labels = (pred_y >= 0.5).float().numpy()
# Passing labels explicitly pins the matrix to 2x2, so the four-way unpack
# below still works when one class is absent from the labels and predictions.
tn, fp, fn, tp = confusion_matrix(y_trn,pred_labels,labels=[0., 1.]).ravel()
label_1p,label_1r,label_0p,label_0r = precison_recall(tp,tn,fp,fn)
print('Label - 1')
print('Precision:-',label_1p)
print('Recall:-',label_1r)
print('Label - 0')
print('Precision:-',label_0p)
print('Recall:-',label_0r)
print('\nAccuracy:-',accuracy(y_trn,pred_y).item())

Label - 1
Precision:- 0.9825581395348837
Recall:- 0.9883040935672515
Label - 0
Precision:- 0.9926873857404022
Recall:- 0.9890710382513661

Accuracy:- 0.988776683807373


In [50]:
def replace_discrete_features(x_test_discrete,discrete_features,unique_discrete_features):
    """Blank out category values that were not present in the training data.

    For every column in ``discrete_features``, a value that is not found in
    ``unique_discrete_features[column]`` is replaced with the empty string.
    The frame is modified in place and also returned.
    """
    for column in discrete_features:

        def keep_if_known(value):
            # Look the vocabulary up per value, exactly as the membership test needs it.
            if value in unique_discrete_features[column]:
                return value
            return ''

        x_test_discrete[column] = x_test_discrete[column].apply(keep_if_known)
    return x_test_discrete

In [51]:
def get_normalized_test_data(x_test,
                             y_test,
                             continuous_features,
                             discrete_features,
                             imputer,
                             label_encoder_dict,
                             onehotencoder,
                             sc_X,
                             unique_discrete_features
                            ):
    """Run a held-out frame through the preprocessing fitted on the training data.

    Steps, in order: impute the continuous columns; fill missing discrete
    values with ''; blank out categories unseen in training; label-encode;
    one-hot encode and join with the continuous block; scale; prepend a bias
    column of ones.

    Returns ``(design_matrix, y_test)``; ``y_test`` is passed through untouched.
    """
    continuous_part = get_data_from_imputer(imputer, x_test.filter(continuous_features))

    discrete_part = x_test.filter(discrete_features).fillna('')
    discrete_part = replace_discrete_features(discrete_part, discrete_features, unique_discrete_features)
    discrete_part = get_discrete_encoder(label_encoder_dict, discrete_part)

    combined = get_onehot_encoder(onehotencoder, continuous_part, discrete_part)
    scaled = get_normalized(sc_X, combined)

    bias_column = np.ones((len(scaled), 1))
    design_matrix = np.concatenate([bias_column, scaled], axis=1)

    return design_matrix, y_test
    

In [52]:
# Load the held-out feature file from the working directory.
x_test = pd.read_csv('test.csv')

In [53]:
# Load the file whose 'Survived' column is used as the reference labels for the test rows.
y_test = pd.read_csv('gender_submission.csv')

In [54]:
# Preview the test frame; it has the training columns minus Survived.
x_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [55]:

# Mirror the training-side column handling: the two deletions below are
# disabled, so PassengerId and Ticket stay; only Name is removed.
# del x_test['PassengerId']

# del x_test['Ticket']

del  x_test['Name']

In [56]:
# y_test is rebound from the DataFrame to just its Survived column as an array.
# Not idempotent: re-running this cell fails because y_test is no longer a frame.
y_test = y_test['Survived'].to_numpy()

In [57]:
# Apply the preprocessing objects fitted on the training data to the test frame.
# x_test is rebound from the DataFrame to the resulting design matrix.
x_test, y_test = get_normalized_test_data(x_test,
                             y_test,
                             continuous_features,
                             discrete_features,
                             imputer,
                             label_encoder_dict,
                             onehotencoder,
                             sc_X,
                             unique_discrete_features
                            )

In [58]:
# Torch copies of the test matrix and labels for predict() / accuracy().
x_t = torch.Tensor(x_test)
y_t = torch.Tensor(y_test)

In [59]:
# Test-set metrics.
pred_y = predict(x_t)
# Threshold the probabilities at 0.5 in one vectorised step.
pred_labels = (pred_y >= 0.5).float().numpy()
# Passing labels explicitly pins the matrix to 2x2, so the four-way unpack
# below still works when one class is absent from the labels and predictions.
tn, fp, fn, tp = confusion_matrix(y_t,pred_labels,labels=[0., 1.]).ravel()
label_1p,label_1r,label_0p,label_0r = precison_recall(tp,tn,fp,fn)
print('Label - 1')
print('Precision:-',label_1p)
print('Recall:-',label_1r)
print('Label - 0')
print('Precision:-',label_0p)
print('Recall:-',label_0r)
print('\nAccuracy:-',accuracy(y_t,pred_y).item())

Label - 1
Precision:- 0.7904191616766467
Recall:- 0.868421052631579
Label - 0
Precision:- 0.9203187250996016
Recall:- 0.868421052631579

Accuracy:- 0.8684210777282715


In [60]:
# Loss on the test predictions. The result still carries a grad_fn because
# loss_fn adds the penalty on `w`, which requires grad.
loss_fn(pred_y,y_t)

tensor(0.1762, grad_fn=<AddBackward0>)

USING SKLEARN - LOGISTIC REGRESSION

In [61]:
from sklearn.linear_model import LogisticRegression

In [62]:
# Baseline: fit scikit-learn's logistic regression on the same training tensors.
# NOTE(review): x_trn already has a leading ones column and LogisticRegression
# fits its own intercept by default, so the bias looks duplicated here. Confirm.
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(x_trn, y_trn)

In [63]:
# Baseline score on the training data.
clf.score(x_trn,y_trn)

0.9910213243546577

In [64]:
# Baseline score on the test data.
clf.score(x_t,y_t)

0.9090909090909091