## Logistic Regression on the Titanic dataset

In [1]:
import numpy as np
from sklearn import linear_model as lm
from sklearn import metrics
import pandas as pd
from sklearn import model_selection as ms

### Data cleaning and manipulation

In [2]:
# Load the Titanic training data (features and the Survived label) from CSV.
df = pd.read_csv('titanic_x_y_train.csv')

In [3]:
# Remove the columns that are not used as model inputs in this notebook.
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])

In [4]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna(axis=0)

In [5]:
# Encode Sex numerically: male -> 1, female -> -1.
# The nested-dict form limits the replacement to the 'Sex' column, so the
# strings 'male'/'female' are never rewritten in any other column.
df = df.replace({'Sex': {'male': 1, 'female': -1}})

In [6]:
# Show the cleaned frame (pandas truncates the display of long frames).
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,2,-1,29.0,1,0,26.0000,1
2,2,1,39.0,0,0,26.0000,0
3,3,-1,29.0,0,4,21.0750,0
4,3,1,25.0,0,0,7.0500,0
5,3,1,34.5,0,0,6.4375,0
...,...,...,...,...,...,...,...
660,2,1,1.0,2,1,39.0000,1
662,1,1,47.0,0,0,34.0208,0
663,2,-1,17.0,0,0,10.5000,1
665,3,1,32.0,0,0,56.4958,1


In [7]:
# Split the label off from the features: `y` holds the Survived column and
# `df` keeps every other column.
y = df.loc[:, 'Survived']
df = df.drop(columns='Survived')

In [8]:
# Show the feature frame after the Survived column has been removed.
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,2,-1,29.0,1,0,26.0000
2,2,1,39.0,0,0,26.0000
3,3,-1,29.0,0,4,21.0750
4,3,1,25.0,0,0,7.0500
5,3,1,34.5,0,0,6.4375
...,...,...,...,...,...,...
660,2,1,1.0,2,1,39.0000
662,1,1,47.0,0,0,34.0208
663,2,-1,17.0,0,0,10.5000
665,3,1,32.0,0,0,56.4958


### Composite feature creation for multiple logistic regression

In [9]:
# Add second-order features: the product of every pair of original columns
# (including each column with itself). The new column is named by joining the
# two positional indices, e.g. column 0 times column 2 becomes '02'.
# `d` is fixed before the loop, so columns added along the way are never
# themselves multiplied.
d = df.shape[1]
for i in range(d):
    for j in range(i, d):
        product = df.iloc[:, i] * df.iloc[:, j]
        # A product with zero variance is constant and carries no information.
        if product.var() != 0:
            df[str(i) + str(j)] = product

In [10]:
# Show the feature frame with the pairwise-product columns appended.
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,00,01,02,03,...,22,23,24,25,33,34,35,44,45,55
0,2,-1,29.0,1,0,26.0000,4,-2,58.0,2,...,841.00,29.0,0.0,754.00000,1,0,26.0,0,0.0,676.000000
2,2,1,39.0,0,0,26.0000,4,2,78.0,0,...,1521.00,0.0,0.0,1014.00000,0,0,0.0,0,0.0,676.000000
3,3,-1,29.0,0,4,21.0750,9,-3,87.0,0,...,841.00,0.0,116.0,611.17500,0,0,0.0,16,84.3,444.155625
4,3,1,25.0,0,0,7.0500,9,3,75.0,0,...,625.00,0.0,0.0,176.25000,0,0,0.0,0,0.0,49.702500
5,3,1,34.5,0,0,6.4375,9,3,103.5,0,...,1190.25,0.0,0.0,222.09375,0,0,0.0,0,0.0,41.441406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,2,1,1.0,2,1,39.0000,4,2,2.0,4,...,1.00,2.0,1.0,39.00000,4,2,78.0,1,39.0,1521.000000
662,1,1,47.0,0,0,34.0208,1,1,47.0,0,...,2209.00,0.0,0.0,1598.97760,0,0,0.0,0,0.0,1157.414833
663,2,-1,17.0,0,0,10.5000,4,-2,34.0,0,...,289.00,0.0,0.0,178.50000,0,0,0.0,0,0.0,110.250000
665,3,1,32.0,0,0,56.4958,9,3,96.0,0,...,1024.00,0.0,0.0,1807.86560,0,0,0.0,0,0.0,3191.775418


In [11]:
# Convert the feature frame to a plain NumPy array for scikit-learn.
x = df.to_numpy()

### Train/test split

In [12]:
# Hold out part of the data for evaluation (default split proportions).
# A fixed random_state makes the shuffle, and therefore every metric reported
# below, reproducible across Restart & Run All.
x_train, x_test, y_train, y_test = ms.train_test_split(x, y, random_state=42)

### Standardization of data 

In [13]:
# Standardize every feature to zero mean and unit variance using statistics
# from the training split only. `norm` stores them so the test split can be
# scaled with the same values: row 0 = per-column mean, row 1 = per-column
# standard deviation (population form, ddof=0).
# NOTE: this cell overwrites x_train, so re-running it without re-running the
# split would recompute `norm` from already-scaled data.
norm = np.vstack([x_train.mean(axis=0), x_train.std(axis=0)])
# A feature that is constant within the training split has a standard
# deviation of 0; dividing by it would fill the column with NaN and make the
# model fit fail. Scale such columns by 1 so they simply become 0.
norm[1, norm[1] == 0] = 1.0
x_train = (x_train - norm[0]) / norm[1]
x_train

array([[-1.53680125, -1.32416942,  1.60333891, ..., -0.31257505,
        -0.30168284,  0.19248446],
       [-0.31344659, -1.32416942, -1.55171394, ...,  0.90737805,
         0.35883029, -0.16471995],
       [-0.31344659,  0.75519037,  0.48154234, ..., -0.31257505,
        -0.30168284, -0.19888986],
       ...,
       [-1.53680125,  0.75519037,  0.06086863, ..., -0.31257505,
        -0.30168284, -0.15939797],
       [ 0.90990807,  0.75519037, -0.57014194, ..., -0.31257505,
        -0.30168284, -0.20654021],
       [-1.53680125,  0.75519037,  1.25277748, ..., -0.31257505,
        -0.30168284, -0.03233156]])

In [14]:
# Scale the test split in place with the training-split statistics kept in
# `norm` (row 0 = per-column mean, row 1 = per-column standard deviation), so
# no information from the test data enters the scaling.
x_test[:, :] = (x_test - norm[0]) / norm[1]

### Logistic regression implementation

In [15]:
# Logistic regression fitted with the SAGA solver. max_iter is raised to 5000
# to give the solver room to converge. SAGA shuffles the data while
# optimizing, so random_state is fixed to make the fitted model reproducible.
algo = lm.LogisticRegression(solver='saga', max_iter=5000, random_state=42)

In [16]:
# Fit the classifier on the standardized training split.
algo.fit(X=x_train, y=y_train)

LogisticRegression(max_iter=5000, solver='saga')

In [17]:
# Predict survival labels for the held-out test split.
y_pred = algo.predict(X=x_test)

In [18]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[65, 16],
       [11, 42]], dtype=int64)

In [19]:
# Per-class precision, recall and F1 on the test split, printed as text.
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        81
           1       0.72      0.79      0.76        53

    accuracy                           0.80       134
   macro avg       0.79      0.80      0.79       134
weighted avg       0.80      0.80      0.80       134

