In [3]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [4]:
import numpy as np
import matplotlib.pyplot as plt

In [5]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [6]:
# Load the training split; the 'Survived' target column is separated out in later cells.
dataset = pd.read_csv('train.csv')

In [7]:
len(dataset)

891

In [8]:
# Count the rows whose label is a valid binary value (0 or 1).
# `len(...)` of the boolean mask would always equal the row count, whatever the
# labels are, so the mask is summed instead.
dataset['Survived'].isin([0,1]).sum()

891

In [9]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Pull the target labels out as a NumPy array before the column is removed from the frame.
y_train = dataset['Survived'].values

In [11]:
len(y_train)

891

In [12]:
# Remove the target in place so `dataset` holds only feature columns from here on.
# NOTE: not idempotent - re-running this cell raises KeyError once the column is gone.
del dataset['Survived']

In [13]:
# del dataset['PassengerId']

# del dataset['Ticket']

# del  dataset['Name']

In [14]:
dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
# `.iloc[:, :]` selects every row and every column, so x_train covers the whole frame.
# NOTE(review): PassengerId, Name and Ticket are still present because the drops
# above are commented out - confirm that these identifier-like columns are meant
# to be used as features.
x_train = dataset.iloc[:,:]

In [16]:
def differentiate_features(x_train):
    """Split the columns of a DataFrame into discrete and continuous groups.

    A column counts as continuous when its dtype is any signed integer,
    unsigned integer or floating-point type (dtype kind 'i', 'u' or 'f'),
    not only int64/float64. Every other dtype (object, bool, datetime,
    timedelta, categorical, ...) is treated as discrete.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple(list, list)
        ``(discrete_features, continuous_features)``: column labels, each list
        in the frame's original column order.
    """
    discrete_features = []
    continuous_features = []
    for column, dtype in zip(x_train.columns, x_train.dtypes):
        # `kind` is available on both NumPy dtypes and pandas extension dtypes.
        if dtype.kind in ('i', 'u', 'f'):
            continuous_features.append(column)
        else:
            discrete_features.append(column)
    return discrete_features,continuous_features

In [17]:
# Split the column names into discrete and continuous groups based on each column's dtype.
discrete_features,continuous_features = differentiate_features(x_train)

In [18]:
def get_imputer(x_continuous):
    """Create a mean-strategy imputer for NaN values and fit it on `x_continuous`.

    Returns the fitted imputer so the same object can be applied to other data.
    """
    mean_imputer = Imputer(strategy = 'mean', missing_values = np.nan, axis = 0)
    mean_imputer.fit(x_continuous)
    return mean_imputer

In [19]:
# Take the continuous columns as a plain ndarray and fit the imputer on the
# training data; the fitted object is reused for the test split later on.
x_continuous = dataset.filter(continuous_features).values
imputer = get_imputer(x_continuous)

In [20]:
def get_data_from_imputer(imputer,x_continuous):
    """Return `x_continuous` after passing it through the fitted imputer's `transform`."""
    return imputer.transform(x_continuous)

In [21]:
# Replace missing values in the continuous columns using the fitted imputer.
x_continuous = get_data_from_imputer(imputer,x_continuous)

In [22]:
x_continuous.shape

(891, 6)

In [23]:
def get_label_encoder(discrete_features,x_discrete):
    """Fit one LabelEncoder per discrete column and return them keyed by column name.

    Side effect: missing values in each listed column of `x_discrete` are
    replaced with '' in place. The empty string is always included in the
    fitted classes, so it can serve as a placeholder label later.
    """
    encoders = {}
    for name in discrete_features:
        x_discrete[name] = x_discrete[name].fillna('')
        classes = x_discrete[name].astype(str).unique()
        if '' not in classes:
            classes = np.append(classes,[''])
        encoder = LabelEncoder()
        encoder.fit(classes)
        encoders[name] = encoder
    return encoders

In [24]:
def get_discrete_unique_features(discrete_features, data=None):
    """Collect the distinct values of each discrete column.

    Parameters
    ----------
    discrete_features : iterable
        Column labels to inspect.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        `dataset` frame is used, which keeps existing one-argument calls working.

    Returns
    -------
    dict
        Maps each column label to the array returned by ``Series.unique()``.
    """
    if data is None:
        # Fall back to the notebook-level frame for backward compatibility.
        data = dataset
    return {col: data[col].unique() for col in discrete_features}

In [25]:
# Record the distinct training values of every discrete column; these are used
# later to map test values that were not seen in training to ''.
unique_discrete_features =  get_discrete_unique_features(discrete_features)

In [26]:
# Sub-frame holding only the discrete columns.
x_discrete = dataset.filter(discrete_features)

In [27]:
# Fit one LabelEncoder per discrete column.
# Side effect: missing values in x_discrete are replaced with '' in place.
label_encoder_dict = get_label_encoder(discrete_features,x_discrete)

In [28]:
def get_discrete_encoder(label_encoder_dict,x_discrete):
    """Replace every column that has a fitted encoder with that encoder's output.

    `x_discrete` is modified in place and also returned.
    """
    for name, encoder in label_encoder_dict.items():
        x_discrete[name] = encoder.transform(x_discrete[name])
    return x_discrete

In [29]:
# Swap each discrete column for its encoded values (x_discrete is modified in place).
# NOTE: re-running this cell would pass the already-encoded values back into the encoders.
x_discrete = get_discrete_encoder(label_encoder_dict,x_discrete)

In [30]:
# Number of discrete columns. Fall back to 0 only when x_discrete has no usable
# second dimension (no `.shape`, or a one-dimensional shape); any other error
# is allowed to surface instead of being silently swallowed.
try:
    n_discrete = x_discrete.shape[1]
except (AttributeError, IndexError):
    n_discrete = 0
n_discrete

5

In [31]:
def generate_onehot_encoder(n_discrete):
    """Create a OneHotEncoder over column indices 0..n_discrete-1 and fit it.

    Note: the encoder is fitted on the notebook-level `x_discrete`, which is
    read as a global rather than received as an argument.
    """
    categorical_columns = list(range(n_discrete))
    encoder = OneHotEncoder(categorical_features = categorical_columns)
    encoder.fit(x_discrete)
    return encoder

In [32]:
# Fit the one-hot encoder on the encoded discrete columns of the training data.
onehotencoder = generate_onehot_encoder(n_discrete)

In [33]:
def get_onehot_encoder(onehotencoder,x_continuous,x_discrete):
    """One-hot encode `x_discrete` and append the result to `x_continuous`.

    The encoder output is densified with `.toarray()`; the returned matrix has
    the continuous columns first, followed by the one-hot columns.
    """
    dense_onehot = onehotencoder.transform(x_discrete).toarray()
    return np.concatenate([x_continuous, dense_onehot], axis=1)

In [34]:
# Combined training matrix: imputed continuous columns followed by the one-hot columns.
x = get_onehot_encoder(onehotencoder,x_continuous,x_discrete)

In [35]:
def get_standardscaler(x):
    """Fit a StandardScaler on the feature matrix `x` and return the fitted scaler.

    Only the features are scaled; no scaler is built for the target.
    """
    feature_scaler = StandardScaler()
    feature_scaler.fit(x)
    return feature_scaler

In [36]:
# Fit the scaler on the full training feature matrix (one-hot columns included).
sc_X = get_standardscaler(x)

In [37]:
def get_normalized(sc_X,x):
    """Return `x` transformed by the already-fitted scaler `sc_X`."""
    return sc_X.transform(x)

In [38]:
# Apply the fitted scaler. Note that x_train is rebound here: it previously
# referred to the raw feature frame and now holds the scaled feature matrix.
x_train = get_normalized(sc_X,x)

In [39]:
# Prepend a column of ones so that w[0] acts as the intercept term.
# NOTE: not idempotent - each re-run of this cell adds another ones column.
x_train = np.concatenate([np.ones((len(x_train),1)),x_train],axis=1)

In [40]:
# Width of the design matrix, including the prepended ones column.
num_features = x_train.shape[1]
num_features

1733

In [41]:
# root mean square error
def rmse(preds,actual):
    """Return the root mean squared error between predictions and targets.

    Computes ``sqrt(mean((preds - actual) ** 2))``. The previous body returned
    ``sum((preds - actual) ** 2) / (2 * m)`` (half the mean squared error, with
    no square root), which did not match the function's name.

    Parameters
    ----------
    preds, actual : array-like of equal length
        Predicted and true values.

    Returns
    -------
    float
        The RMSE (nan for empty input).
    """
    errors = np.asarray(preds, dtype=float) - np.asarray(actual, dtype=float)
    return np.sqrt(np.mean(errors**2))

In [42]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied elementwise to scalars or arrays."""
    denominator = 1+np.exp(-x)
    return 1/denominator

In [43]:
def loss_fn(preds,actual,weight,eps=1e-15):
    """Mean binary cross-entropy between predicted probabilities and 0/1 labels.

    Predictions are clipped into ``[eps, 1 - eps]`` before the logarithms are
    taken, so probabilities that saturate at exactly 0 or 1 give a finite loss
    instead of nan/inf.

    Parameters
    ----------
    preds : array-like
        Predicted probabilities.
    actual : array-like
        True labels (0 or 1), same length as `preds`.
    weight : any
        Unused; kept so existing three-argument calls keep working.
    eps : float, optional
        Clipping margin for the probabilities.

    Returns
    -------
    float
        The average negative log-likelihood over the samples.
    """
    m = len(preds)
    clipped = np.clip(preds, eps, 1 - eps)
    return np.sum(-(actual*np.log(clipped)+(1-actual)*np.log(1-clipped)))/m

In [44]:
# def optimize(y,y_train,x_train,lr,rp,weight):
# #     weight[0] = weight[0]-lr*np.matmul((y-y_train),x_train[:,0])/len(x_train)
# #     weight[1:] = (weight[1:] - lr*np.matmul((y-y_train),x_train[:,1:]))/len(x_train)
#     weight = weight - lr*(np.matmul((y-y_train),x_train)/len(x_train))
#     return weight

In [45]:
# #cost function with regularization
# def loss_fn(preds,actual,weight):
#     return ((preds-actual)**2).sum()/(2*len(preds)) + (rp*np.square(weight).sum())

In [46]:
# gradient-descent step with L2 regularization on every weight except the intercept
def optimize(y,y_train,x_train,lr,rp,weight):
    """Apply one gradient-descent update to `weight` in place and return it.

    `y` holds the current predictions and `y_train` the targets. The first
    weight (intercept, paired with column 0 of `x_train`) is updated without
    regularization; the remaining weights are first shrunk by
    ``1 - lr * rp / m`` and then moved along the averaged gradient.
    """
    n_samples = len(x_train)
    residual = y-y_train
    intercept_step = lr*np.matmul(residual,x_train[:,0])/n_samples
    shrink = 1-(lr*rp/n_samples)
    feature_step = (lr*np.matmul(residual,x_train[:,1:]))/n_samples
    weight[0] = weight[0]-intercept_step
    weight[1:] = (weight[1:]*shrink) - feature_step
    return weight

In [47]:
# Training configuration: number of passes, learning rate (lr) and L2 penalty (rp).
epochs = 100
# Seed NumPy's global generator so the random initial weights, and therefore the
# logged losses and final metrics, are reproducible on Restart & Run All.
np.random.seed(42)
w = np.random.uniform(low=-0.1,high=0.1,size=(num_features))
# w = np.random.randn((num_features))*np.sqrt(2/(1+num_features))
lr = 1e-1
rp = 1e-5

In [48]:
# Full-batch gradient descent: every epoch scores the whole training set, prints
# the cross-entropy loss, then applies one update step to w.
# The printed loss is measured before that epoch's update and does not include
# the L2 penalty that optimize() applies (loss_fn ignores its weight argument).
loss=0
for epoch in range(epochs):
    pred_y = sigmoid(np.matmul(x_train,w))
    loss = loss_fn(pred_y,y_train,w)
    print('Epoch ',epoch,':-',loss)
    w = optimize(pred_y,y_train,x_train,lr,rp,w)

Epoch  0 :- 1.269688832449102
Epoch  1 :- 1.164047291241201
Epoch  2 :- 1.067502357642958
Epoch  3 :- 0.979563738534938
Epoch  4 :- 0.8996686886084024
Epoch  5 :- 0.8272052538582472
Epoch  6 :- 0.761542619930004
Epoch  7 :- 0.7020600925709689
Epoch  8 :- 0.6481687003276418
Epoch  9 :- 0.5993234813930769
Epoch  10 :- 0.5550277619352644
Epoch  11 :- 0.5148324416848654
Epoch  12 :- 0.4783332347865688
Epoch  13 :- 0.4451674829973344
Epoch  14 :- 0.415010797869826
Epoch  15 :- 0.387573303940135
Epoch  16 :- 0.36259551622450054
Epoch  17 :- 0.3398442378902263
Epoch  18 :- 0.31910891867320995
Epoch  19 :- 0.3001987121646681
Epoch  20 :- 0.28294022160149573
Epoch  21 :- 0.26717577701391565
Epoch  22 :- 0.2527620613369391
Epoch  23 :- 0.23956894516604996
Epoch  24 :- 0.2274784440268772
Epoch  25 :- 0.21638375293265202
Epoch  26 :- 0.2061883378527932
Epoch  27 :- 0.19680507708761447
Epoch  28 :- 0.1881554516775696
Epoch  29 :- 0.18016878571593545
Epoch  30 :- 0.17278153689322287
Epoch  31 :- 0.1

In [49]:
x_train.shape

(891, 1733)

In [50]:
type(w)

numpy.ndarray

In [51]:
# Score the training set again with the final weights.
pred_y = sigmoid(np.matmul(x_train,w))

In [52]:
# Class balance of the training labels: (number of 1s, number of 0s).
len(y_train[y_train==1]),len(y_train[y_train==0])

(342, 549)

In [53]:
y_train.shape

(891,)

In [54]:
# Confusion-matrix tally on the training set, treating a predicted probability
# of 0.5 or more as class 1. The same tally is repeated for the test split in a
# later cell.
tp=0
fn=0
tn=0
fp=0
for a,b in zip(pred_y,y_train):
    if a>=0.5 and b==1:
        tp+=1
    elif a>=0.5 and b==0:
        fp+=1
    elif a<0.5 and b==0:
        tn+=1
    elif a<0.5 and b==1:
        fn+=1
# Displayed in the order (tp, tn, fp, fn).
tp,tn,fp,fn

(342, 549, 0, 0)

In [55]:
# Per-class precision/recall and overall accuracy derived from the tallies above.
# NOTE: the counts are plain ints, so any zero denominator (for example, no
# positive predictions at all) raises ZeroDivisionError.
print('***Survived***')
print('Precision:',tp/(tp+fp))
print('Recall:',tp/(tp+fn))


# For the negative class the roles of the counts are mirrored.
print('***Not Survived***')
print('Precision:',tn/(tn+fn))
print('Recall:',tn/(tn+fp))

print('\nAccuracy:-',(tp+tn)/(tp+tn+fp+fn))

***Survived***
Precision: 1.0
Recall: 1.0
***Not Survived***
Precision: 1.0
Recall: 1.0

Accuracy:- 1.0


In [56]:
# Training accuracy once more, as a bare expression so the notebook displays the value.
(tp+tn)/(tp+tn+fp+fn)

1.0

In [57]:
def replace_discrete_features(x_test_discrete,discrete_features,unique_discrete_features):
    """Replace values that are not among the known values of a column with ''.

    Parameters
    ----------
    x_test_discrete : pandas.DataFrame
        Frame holding the discrete columns; modified in place and returned.
    discrete_features : iterable
        Column labels to process.
    unique_discrete_features : mapping
        For each column label, the collection of values considered known.

    Returns
    -------
    pandas.DataFrame
        The same `x_test_discrete` object, with unknown values set to ''.
    """
    for col in discrete_features:
        # Build the lookup once per column instead of scanning the whole array
        # of known values for every cell. Missing values are left out of the
        # set: they never compared equal to anything in the elementwise lookup
        # either, so they still map to ''.
        known_values = {value for value in unique_discrete_features[col] if pd.notna(value)}
        x_test_discrete[col] = x_test_discrete[col].apply(
            lambda value, known=known_values: value if value in known else '')
    return x_test_discrete

In [58]:
def get_normalized_test_data(x_test,
                             y_test,
                             continuous_features,
                             discrete_features,
                             imputer,
                             label_encoder_dict,
                             onehotencoder,
                             sc_X,
                             unique_discrete_features
                            ):
    """Run a raw feature frame through the already-fitted preprocessing steps.

    The continuous columns are imputed. The discrete columns have missing
    values filled with '', values absent from `unique_discrete_features`
    replaced by '', and are then label-encoded. Both parts are combined via the
    one-hot encoder, scaled with `sc_X`, and a leading column of ones is added.

    Returns a tuple ``(x_test, y_test)``; `y_test` is passed through unchanged.
    """
    # Continuous branch: select, then impute.
    numeric_part = get_data_from_imputer(imputer, x_test.filter(continuous_features))

    # Discrete branch: select, fill, neutralise unknown values, encode.
    categorical_part = x_test.filter(discrete_features).fillna('')
    categorical_part = replace_discrete_features(categorical_part,
                                                 discrete_features,
                                                 unique_discrete_features)
    categorical_part = get_discrete_encoder(label_encoder_dict, categorical_part)

    # Merge both branches, scale, and add the ones column.
    features = get_onehot_encoder(onehotencoder, numeric_part, categorical_part)
    features = get_normalized(sc_X, features)
    ones_column = np.ones((len(features),1))

    return np.concatenate([ones_column, features],axis=1), y_test
    

In [59]:
# Load the test feature file; its preview below shows no 'Survived' column.
x_test = pd.read_csv('test.csv')

In [60]:
# Labels used to score the test predictions.
# NOTE(review): the file name suggests this is a sample-submission file rather
# than ground-truth outcomes - confirm. If so, the test metrics below measure
# agreement with that file, not real accuracy.
y_test = pd.read_csv('gender_submission.csv')

In [61]:
x_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [62]:

# del x_test['PassengerId']

# del x_test['Ticket']

# del  x_test['Name']

In [63]:
# Keep only the 'Survived' column as an ndarray.
# NOTE: y_test is rebound from a DataFrame to an array, so this cell cannot be
# re-run without first reloading the CSV.
y_test = y_test['Survived'].to_numpy()

In [64]:
# Push the test frame through the transformers that were fitted on the training
# data. x_test is rebound from the raw DataFrame to the model-ready matrix, so
# re-running this cell requires reloading test.csv first.
x_test, y_test = get_normalized_test_data(x_test,
                             y_test,
                             continuous_features,
                             discrete_features,
                             imputer,
                             label_encoder_dict,
                             onehotencoder,
                             sc_X,
                             unique_discrete_features
                            )

In [65]:
# Predicted probabilities for the test rows (pred_y no longer refers to the training predictions).
pred_y = sigmoid(np.matmul(x_test,w))

In [66]:
# Confusion-matrix tally for the test predictions against y_test, using the same
# 0.5 threshold as for the training set. The counters are reset first, so the
# training tallies are overwritten.
tp=0
fn=0
tn=0
fp=0
for a,b in zip(pred_y,y_test):
    if a>=0.5 and b==1:
        tp+=1
    elif a>=0.5 and b==0:
        fp+=1
    elif a<0.5 and b==0:
        tn+=1
    elif a<0.5 and b==1:
        fn+=1
# Displayed in the order (tp, tn, fp, fn).
tp,tn,fp,fn

(131, 247, 19, 21)

In [67]:
# Per-class precision/recall and overall accuracy for the test tallies, measured
# against the labels loaded from gender_submission.csv.
# NOTE: a zero denominator raises ZeroDivisionError.
print('***Survived***')
print('Precision:',tp/(tp+fp))
print('Recall:',tp/(tp+fn))


# For the negative class the roles of the counts are mirrored.
print('***Not Survived***')
print('Precision:',tn/(tn+fn))
print('Recall:',tn/(tn+fp))

print('\nAccuracy:-',(tp+tn)/(tp+tn+fp+fn))

***Survived***
Precision: 0.8733333333333333
Recall: 0.8618421052631579
***Not Survived***
Precision: 0.9216417910447762
Recall: 0.9285714285714286

Accuracy:- 0.9043062200956937


In [68]:
# Two independent copies of the same file: temp1 keeps the file's original
# 'Survived' values for comparison, while temp has that column overwritten with
# the model's predictions in a later cell.
temp1 = pd.read_csv('gender_submission.csv')
temp = pd.read_csv('gender_submission.csv')

In [69]:
temp.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [70]:
# Same expression as the earlier test-scoring cell; recomputed before the predictions are written out.
pred_y = sigmoid(np.matmul(x_test,w))

In [71]:
# Threshold the probabilities at 0.5 into 0/1 labels and overwrite the 'Survived' column.
temp['Survived'] = [1 if i>=0.5 else 0 for i in pred_y]

In [72]:
temp.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [73]:
# Save the PassengerId / predicted Survived pairs, without the frame's index column.
temp.to_csv('predictions.csv',index=False)

In [74]:
# Rows where the model's prediction equals the value originally in gender_submission.csv.
temp[(temp1['Survived']==temp['Survived'])]

Unnamed: 0,PassengerId,Survived
0,892,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
10,902,0
