In [226]:
##importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
gender_submission = pd.read_csv('./gender_submission.csv')
test_labels = gender_submission['Survived']

test_data_passenderIds = test_data['PassengerId']

# Get rid of columns that are not useful
train_data.drop(columns= ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)
test_data.drop(columns= ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)

# Seperate X and y
# store feature matrix in "X"
X = train_data.iloc[:, 1:]   
# store response vector in "y"
y = train_data.iloc[:,0]    

X.info(),X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


(None,
    Pclass     Sex   Age  SibSp  Parch     Fare Embarked
 0       3    male  22.0      1      0   7.2500        S
 1       1  female  38.0      1      0  71.2833        C
 2       3  female  26.0      0      0   7.9250        S
 3       1  female  35.0      1      0  53.1000        S
 4       3    male  35.0      0      0   8.0500        S)

In [227]:
X.iloc[0]

Pclass         3
Sex         male
Age         22.0
SibSp          1
Parch          0
Fare        7.25
Embarked       S
Name: 0, dtype: object

In [228]:
## Preprocess the PClass column 
def preprocess_pclass_cols(df):
    #converting integer classes to Letters and prepare for One Hot Encoding
    df['Pclass'] = df['Pclass'].map({1: 'AC', 2: 'BC', 3 : 'CC'})
    return df

In [229]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']
    num_attrs = data.select_dtypes(include=numerics)

    ## pipeline for numerical attributes
    ## imputing -> Scale them

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scalar', StandardScaler()),
    ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ['Pclass',"Sex","Embarked"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    ## Complete pipeline to transform
    ## both Num and Cat attributes
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
    ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

# From raw data to processed data in 2 steps

In [230]:
## From raw data to processed data in 2 steps
preprocessed_df = preprocess_pclass_cols(X)
prepared_data = pipeline_transformer(X)
prepared_data,X.iloc[0]

(array([[-0.56573646,  0.43279337, -0.47367361, ...,  0.        ,
          1.        ,  0.        ],
        [ 0.66386103,  0.43279337, -0.47367361, ...,  0.        ,
          0.        ,  0.        ],
        [-0.25833709, -0.4745452 , -0.47367361, ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [-0.1046374 ,  0.43279337,  2.00893337, ...,  0.        ,
          1.        ,  0.        ],
        [-0.25833709, -0.4745452 , -0.47367361, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.20276197, -0.4745452 , -0.47367361, ...,  1.        ,
          0.        ,  0.        ]]),
 Pclass        CC
 Sex         male
 Age         22.0
 SibSp          1
 Parch          0
 Fare        7.25
 Embarked       S
 Name: 0, dtype: object)

In [231]:
prepared_data[0]

array([-0.56573646,  0.43279337, -0.47367361, -0.50244517,  0.        ,
        0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ])

# Training the Logistic Regression model on the Training set

In [232]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(prepared_data,y)

LogisticRegression(random_state=0)

In [233]:
sample_data = X.iloc[:]
sample_labels = y.iloc[:]
sample_data_prepared = pipeline_transformer(sample_data)
train_pred = classifier.predict(sample_data_prepared)
print("Prediction on Sample data : ", train_pred[:5])
print("Actual Labels : ",np.array(sample_labels[:5]))

Prediction on Sample data :  [0 1 1 1 0]
Actual Labels :  [0 1 1 1 0]


# Making the Confusion Matrix

In [234]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(sample_labels, train_pred)
print(cm)
accuracy_score(sample_labels, train_pred)

[[478  71]
 [102 240]]


0.8058361391694725

In [235]:
test_data.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [236]:
test_data_prepared = pipeline_transformer(test_data)
test_pred = classifier.predict(test_data_prepared)
print("Prediction on Sample data : ", test_pred[:5])
print("Actual Labels : ",np.array(test_labels[:5]))

Prediction on Sample data :  [1 0 0 1 0]
Actual Labels :  [0 1 0 0 1]


In [237]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(test_labels, test_pred)
print(cm)
accuracy_score(test_labels, test_pred)

[[ 58 208]
 [106  46]]


0.24880382775119617

In [238]:
file_name = "out.csv"

with open('output/'+file_name,'w') as f:
    f.write('{0},{1}\n'.format("PassengerId", "Survived"))
    for p,s in zip(test_data_passenderIds, y_pred):
        f.write('{0},{1}\n'.format(p, s))