# Logistic Regression

## Cleaning and filtering of the dataset 

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [3]:
df = pd.read_csv("train.csv")
df

FileNotFoundError: [Errno 2] File b'train.csv' does not exist: b'train.csv'

We  first filter the columns with to much nan

In [169]:

def nanRate(x):
    k = 0
    for i in x:
        if i !=i :#a nan is not equal to itself
            k += 1
    rate = k/ len(x)
    return rate


clean = []
for i in df.columns:
    if nanRate(df[i]) >= 0.7:
        clean.append(i)
print(clean)
df.drop(clean,axis=1, inplace = True)
df.head()     


['Cabin']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,O,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,O,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Now we just clear the rows containing Nan values

In [170]:
rowsToBeDeleted = []
for i in df.columns : 
    k = 0
    for j in df[i]:
        if j != j:
            rowsToBeDeleted.append(k)
        k += 1
rowsToBeDeleted = list(set(rowsToBeDeleted)) 
"""we convert the list in a set to delete every multiple occurences,
and then we convert it back to a list"""
print(rowsToBeDeleted)
df.drop(rowsToBeDeleted ,axis=0, inplace = True)
df.reset_index(drop = True, inplace = True)

[5, 517, 522, 524, 13, 527, 17, 19, 531, 533, 26, 538, 28, 29, 31, 32, 547, 36, 552, 42, 45, 46, 47, 48, 557, 560, 563, 564, 55, 568, 573, 61, 64, 65, 578, 584, 588, 76, 77, 589, 593, 82, 596, 598, 87, 601, 602, 95, 611, 612, 101, 613, 107, 109, 629, 121, 633, 126, 639, 128, 643, 648, 650, 140, 653, 656, 154, 667, 669, 158, 159, 674, 166, 168, 680, 176, 180, 181, 692, 185, 186, 697, 708, 196, 198, 709, 711, 201, 718, 214, 727, 732, 223, 738, 739, 740, 229, 235, 240, 241, 760, 250, 766, 256, 768, 260, 773, 264, 776, 778, 268, 270, 783, 274, 277, 790, 792, 793, 284, 295, 298, 300, 301, 303, 304, 815, 306, 825, 826, 828, 829, 832, 324, 837, 839, 330, 334, 335, 846, 849, 347, 859, 351, 863, 354, 868, 358, 359, 364, 878, 367, 368, 375, 888, 384, 388, 409, 410, 411, 413, 415, 420, 425, 428, 431, 444, 451, 454, 457, 459, 464, 466, 468, 470, 475, 481, 485, 490, 495, 497, 502, 507, 508, 511]


In [171]:
df["Embarked"].unique

<bound method Series.unique of 0      S
1      C
2      S
3      S
4      S
      ..
702    Q
703    S
704    S
705    C
706    Q
Name: Embarked, Length: 707, dtype: object>

We reprocess the data in order to use them in our regression

In [172]:
def normalize(x):
    normal=(x-np.mean(x))/np.std(x)
    return normal

In [173]:
df["Survived"] = df["Survived"].replace('O', 0)
df["Survived"] = df["Survived"].replace('1',1)

df["Sex"] = df["Sex"].replace('female', 0)
df["Sex"] = df["Sex"].replace('femme', 0)
df["Sex"] = df["Sex"].replace('male', 1)

oh = pd.get_dummies(df['Embarked'], prefix = 'Embarked')
df = df.drop('Embarked', axis = 1)
df = df.join(oh)

oh = pd.get_dummies(df['Pclass'], prefix = 'Pclass')
df = df.drop('Pclass', axis = 1)
df = df.join(oh)


norm = normalize(df['Age'])
df.drop('Age', axis = 1, inplace = True)
df = df.join(norm)



In [174]:
X = np.array (df[[ 'Sex', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Age']])

Y = df['Survived']

In [175]:
np.shape(X)

(707, 8)

# Logistic Regression From scratch 

In [176]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [177]:
def cost(h, y):
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

In [188]:
def gradientdesc(X, Y, alpha, ite):
    XB = np.zeros((len(X),1))
    X = np.insert(X,0, XB, axis=1)
    
    XT = np.transpose(X)
    #print(len(X))

    #print("dim omega",len(X))
    #print(len(X))
    omega = np.zeros(len(X[0]))
    for i in range(ite):

        pred = sigmoid(np.dot(X, omega))
        grad = np.dot(XT,(pred - Y))*(1/(len(X)))
        omega = omega - alpha * grad #updated omega
    return omega

In [185]:
def predict (X,Y,alpha,ite):
    omega = gradientdesc(X,Y,alpha,ite)
    XB = np.zeros((len(X),1))
    X = np.insert(X,0, XB, axis=1)
    #print(np.shape(X))
    #print(np.shape(omega))
    y_predict = sigmoid(X.dot(omega))
    
    classification = [0 if i<0.5 else 1 for i in y_predict]
    return classification

In [191]:
metrics.accuracy_score(Y,predict(X,Y, 0.01, 10000))

0.7949080622347949

## With sklearn

In [181]:
model = LogisticRegression().fit(X,Y)
y_pred = model.predict(X)
metrics.accuracy_score(Y, y_pred)



0.7949080622347949