<a href="https://colab.research.google.com/github/ranggasiphhi/SGD/blob/master/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
This is an example simple logistic regression for predicting passenger whether survived or not while abroading the Titanic.  It consists of 10 attributes and describes below:


1.   Survival (0 = No, 1 = Yes)
2.   PClass/Ticket Class (1,2,3)
3.   Sex
4.   Age
5.   Sibsp (Number of siblings or spouses abroad the Titanic)
6.   Parch (Number of parents or children abroad the Titanic)
7.   Ticket Number
8.   Passenger Fare
9.   Cabin Number
10.  Port of Embarktion






In [378]:
import numpy as np
import pandas as pd
import io

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

df = pd.read_csv(io.BytesIO(uploaded['train.csv']))
df

Saving train.csv to train (23).csv
User uploaded file "train.csv" with length 61194 bytes


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Data Cleansing and Matrix Created
In this section we choose the attribute that depends on the survival of Titanic.  We choose PClass, Sex, Sibsp, and Parch to train our data.  We also remove the NaN data in order to keep data clean.
The output in this process is a matrix X within 4 attributes and powered by 12, and also matrix Y is contained the survival column

Check the data if any Nan

In [379]:
pclass = df['Pclass'].isnull().values.any()
print("Pclass is Nan = ", pclass)
sex = df['Sex'].isnull().values.any()
print("Sex is Nan = ", sex)
sibsp = df['SibSp'].isnull().values.any()
print("SibSp is Nan = ", sibsp)
parch = df['Parch'].isnull().values.any()
print("Parch is Nan = ", parch)

Pclass is Nan =  False
Sex is Nan =  False
SibSp is Nan =  False
Parch is Nan =  False


Create matrix X and Y

In [380]:
pclass = df['Pclass'].values
pclass = np.transpose([pclass])

sex = df['Sex'].values
sex = np.transpose([np.where(sex == 'male', 0, 1)])

sibsp = df['SibSp'].values
sibsp = np.transpose([sibsp])

parch = df['Parch'].values
parch = np.transpose([parch])

X = np.hstack((pclass, sex))
X = np.hstack((X, sibsp))
X = np.hstack((X, parch))

y = np.transpose([df['Survived'].values])

Create Matrix X with powered by 8 (e.g. x1^8, x1^7x2x3x4, x1^6x2^2x3x4,...)
Matrix X size will be 891 x 495

In [381]:
def mapFeature(X, degree):
  finalX = np.ones((X.shape[0],1),dtype='int')

  for i in range(degree+1):
    for j in range(degree+1):
      for k in range(degree+1):
        for l in range(degree+1):
          if(i+j+k+l != 0 and i+j+k+l <= degree):
            temp = np.transpose([np.power(X[:,0],i)*np.power(X[:,1],j)*np.power(X[:,2],k)*np.power(X[:,3],l)])
            finalX = np.hstack((finalX, temp))
  
  return finalX

X = mapFeature(X, 8)
X.shape

(891, 495)

# Machine Learning Algorithm
In this section will be build a machine learning algorithm using logistic regression and with minimum function that build-in Scikit

Create Sigmoid function

In [382]:
def sigmoid(z):
  g = np.zeros(z.size)
  g = 1 / (1 + np.exp(-z))
  return g

Create Cost and Gradient Function 

In [383]:
def costFunc(theta, X, y, lamda):
  theta = np.transpose(np.array([theta]))

  m = len(y)

  J = 0

  hx = sigmoid(X @ theta)
  J = 1/m * (-np.transpose(y) @ np.log(hx) - np.transpose(1-y) @ np.log(1-hx)) + lamda/(2 * m) * np.transpose(theta[1:]) @ theta[1:]
  return J[0,0]

In [384]:
def gradient(theta, X, y, lamda):
  theta = np.transpose(np.array([theta]))

  m = len(y)
  
  grad = np.zeros(theta.size)

  hx = sigmoid(X @ theta)
  grad = 1/m * np.transpose(X) @ (hx-y) + np.vstack(([0], lamda / m * theta[1:]))
  return np.transpose(grad).flatten()

Initialize theta, set lambda, and compute cost and gradient descend

In [385]:
import scipy.optimize as op

initial_theta = np.zeros((X.shape[1],))
lamda = 200

cost = costFunc(initial_theta, X, y, lamda)
print(cost)
grad = gradient(initial_theta, X, y, lamda)
print(grad[0:5])

0.693147180559945
[0.11616162 0.01234568 0.08641975 0.45342312 2.30190797]


In [386]:
Result = op.minimize(fun = costFunc, 
                                 x0 = initial_theta, 
                                 args = (X, y, lamda),
                                 method = 'SLSQP',
                                 jac = gradient);

  This is separate from the ipykernel package so we can avoid doing imports until
  if __name__ == '__main__':
  if __name__ == '__main__':


In [387]:
print(Result)
print(len(Result.x))

     fun: 0.4228629946503062
     jac: array([-4.69053940e-04,  1.54506765e-04,  1.26345671e-04, -5.95487830e-04,
       -5.50541196e-03, -3.25825512e-02, -1.70425569e-01, -8.44767148e-01,
       -4.06443110e+00,  8.61197651e-05,  2.22304062e-04,  3.06541631e-04,
       -6.12555444e-05, -3.85754139e-03, -2.76427671e-02, -1.57433559e-01,
       -8.23753901e-01,  3.44597250e-04,  7.15142867e-04,  1.18907522e-03,
        1.64147645e-03, -3.91986042e-04, -2.02842855e-02, -1.41187386e-01,
        1.31968508e-03,  2.54563873e-03,  4.51248990e-03,  8.03234646e-03,
        1.23786477e-02,  5.98043481e-03,  4.93708506e-03,  9.32850981e-03,
        1.70109840e-02,  3.21253220e-02,  6.01502923e-02,  1.83124133e-02,
        3.45446930e-02,  6.41396648e-02,  1.23405469e-01,  6.79418842e-02,
        1.28871211e-01,  2.42706130e-01,  2.53687604e-01,  4.85020848e-01,
        9.60246783e-01,  1.84282196e-04,  3.03509415e-05, -1.76637472e-04,
       -1.34444096e-03, -7.36642731e-03, -3.71083885e-02, -1.

# Training Accuration
In this section we will count the accuration of the training data

Create predict function

In [388]:
def predict(theta, X):
  theta = np.transpose(np.array([theta]))

  m = X.shape[0]
  p = np.zeros((m,1))

  p = sigmoid(X @ theta)
  p = np.where(p < 0.5, 0, 1)
  return p

Calculate the accuration

In [389]:
p = predict(Result.x, X)
print("Accuracy =", np.equal(y, p).mean() * 100)

Accuracy = 81.48148148148148


# Implementing to new data
In this section we implement to the another data and predict whether person is survive or not

Upload the test data

In [391]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

df2 = pd.read_csv(io.BytesIO(uploaded['test.csv']))
df2

Saving test.csv to test.csv
User uploaded file "test.csv" with length 28629 bytes


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


Check the data if any Nan

In [393]:
pclass = df2['Pclass'].isnull().values.any()
print("Pclass is Nan = ", pclass)
sex = df2['Sex'].isnull().values.any()
print("Sex is Nan = ", sex)
sibsp = df2['SibSp'].isnull().values.any()
print("SibSp is Nan = ", sibsp)
parch = df2['Parch'].isnull().values.any()
print("Parch is Nan = ", parch)

Pclass is Nan =  False
Sex is Nan =  False
SibSp is Nan =  False
Parch is Nan =  False


Create matrix X

In [395]:
pclass = df2['Pclass'].values
pclass = np.transpose([pclass])

sex = df2['Sex'].values
sex = np.transpose([np.where(sex == 'male', 0, 1)])

sibsp = df2['SibSp'].values
sibsp = np.transpose([sibsp])

parch = df2['Parch'].values
parch = np.transpose([parch])

X = np.hstack((pclass, sex))
X = np.hstack((X, sibsp))
X = np.hstack((X, parch))

Create Matrix X with powered by 8 (e.g. x1^8, x1^7x2x3x4, x1^6x2^2x3x4,...)
Matrix X size will be 418 x 495

In [396]:
def mapFeature(X, degree):
  finalX = np.ones((X.shape[0],1),dtype='int')

  for i in range(degree+1):
    for j in range(degree+1):
      for k in range(degree+1):
        for l in range(degree+1):
          if(i+j+k+l != 0 and i+j+k+l <= degree):
            temp = np.transpose([np.power(X[:,0],i)*np.power(X[:,1],j)*np.power(X[:,2],k)*np.power(X[:,3],l)])
            finalX = np.hstack((finalX, temp))
  
  return finalX

X = mapFeature(X, 8)
X.shape

(418, 495)

Calculate the prediction

In [397]:
p = predict(Result.x, X)

[[0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]


  This is separate from the ipykernel package so we can avoid doing imports until


# Export to CSV

In [404]:
d = {'PassengerId': df2['PassengerId'].values, 'Survived': np.transpose(p).flatten()}
df3 = pd.DataFrame(data=d)
df3

csv = df3.to_csv(index=False)

from google.colab import files

with open('result.csv', 'w') as f:
  f.write(csv)

files.download('result.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>