# Titanic Survival Classification:


### Required Imports :

In [None]:
#required libraries:
import pandas as pd;    
import numpy as np;

### Utility Functions:

In [None]:
#importing training and test data into pandas dataframe
def import_data(path):
    """Read the CSV found at ``path`` into a pandas DataFrame.

    ``path`` is handed to ``pd.read_csv`` untouched, so it may be anything
    that function accepts.
    """
    frame = pd.read_csv(path)
    return frame



In [None]:
#utility functions
def print_info(df):
    """Run ``df.info()`` and hand back whatever it returns."""
    summary = df.info()
    return summary

def print_description(df):
    """Return the summary statistics produced by ``df.describe()``."""
    stats = df.describe()
    return stats

def print_head(df, count=5):
    """Return the first ``count`` rows of ``df`` (five by default)."""
    leading_rows = df.head(n=count)
    return leading_rows

### Preprocess both training and testing dataframes :

In [None]:
def numeric_mapper(df, column_name, missing_code=404):
    """Build a value -> integer code mapping for one column of ``df``.

    The distinct non-null values of ``df[column_name]`` are sorted and
    numbered 0, 1, 2, ... in that order. If the column contains any null,
    the mapping also gets an ``np.nan`` key pointing at ``missing_code``.

    Nulls are filtered out before sorting rather than being replaced by a
    placeholder string, so:
      * numeric columns that contain NaN can be mapped (sorting a list that
        mixes a placeholder string with numbers raises ``TypeError``),
      * a genuine value spelled ``'missing'`` is encoded like any other value,
      * the codes are always contiguous.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    column_name : str
        Name of the column whose values are encoded.
    missing_code : int, optional
        Code assigned to null entries (default 404).

    Returns
    -------
    dict
        Mapping suitable for ``DataFrame.replace({column: mapping})``.
    """
    distinct_values = df[column_name].unique()
    present = sorted(value for value in distinct_values if not pd.isnull(value))

    mapper = {value: code for code, value in enumerate(present)}
    if any(pd.isnull(value) for value in distinct_values):
        # np.nan is used as the key so that replace() targets null cells.
        mapper[np.nan] = missing_code
    return mapper
    
def data_preprocessor(df):
    """Rename, prune and numerically encode a raw passenger dataframe.

    Steps, in order:
      1. rename ``SibSp``/``Parch``/``Sex``/``Pclass`` to
         ``# of Siblings``/``# of Parents``/``Gender``/``Class``;
      2. drop the ``Name``, ``Ticket`` and ``PassengerId`` columns;
      3. cast ``Gender`` to an unordered and ``Class`` to an ordered
         categorical;
      4. replace the values of ``Gender``, ``Embarked`` and ``Cabin`` with the
         integer codes from ``numeric_mapper`` and null ``Age`` values with
         the sentinel 404.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.
    """
    gender_dtype = pd.api.types.CategoricalDtype(df['Sex'].unique(), ordered=False)
    # An ordered categorical ranks its categories in the order they are given.
    # unique() yields first-appearance order, so sort the labels to make the
    # ordering ascending by label instead of depending on row order.
    class_dtype = pd.api.types.CategoricalDtype(np.sort(df['Pclass'].unique()), ordered=True)

    # The mappers are computed from the original (pre-rename) column names
    # because ``df`` is only rebound once the whole chain has been evaluated.
    df = (df.rename({'SibSp': '# of Siblings',
                     'Parch': '# of Parents',
                     'Sex': 'Gender',
                     'Pclass': 'Class'},
                    axis=1)
            .drop(['Name', 'Ticket', 'PassengerId'], axis=1)
            .astype({'Gender': gender_dtype,
                     'Class': class_dtype})
            .replace({'Gender': numeric_mapper(df, 'Sex'),
                      'Embarked': numeric_mapper(df, 'Embarked'),
                      'Cabin': numeric_mapper(df, 'Cabin'),
                      'Age': {np.nan: 404}})
          )
    return df

### Dealing With Null Values In Attributes :

#### CASE 1 : Removing attributes with null values:

In [None]:
# In the training set only 2 attributes contain a major chunk of null data: Age and Cabin
def drop_null_attributes(df):
    """Return ``df`` without any column that holds at least one null value."""
    return df.dropna(axis=1)

#### CASE 2: Remove rows with null values:

In [None]:
def drop_null_rows(df):
    """Return ``df`` without any row that holds at least one null value."""
    return df.dropna(axis=0)

#### CASE 3: Replacing null attribute values with mean, median or mode

In [None]:
# replace nulls with averages 
def replace_null_with_mean(col):
    """Fill the nulls of one column with a typical value of that column.

    Numeric (non-categorical) columns are filled with their mean. Every other
    column, categoricals included, is filled with its most frequent value.

    ``Series.mode()`` returns a Series (there can be several modes), so its
    first entry is used as the scalar fill value; passing the whole Series as
    a replacement value is rejected by ``Series.replace``.

    Parameters
    ----------
    col : pandas.Series
        Column to impute.

    Returns
    -------
    pandas.Series
        A new Series with nulls filled. If the column has no non-null value
        to derive a mode from, it is returned unchanged.
    """
    is_categorical = col.dtype.name == "category"
    if not is_categorical and pd.api.types.is_numeric_dtype(col):
        return col.fillna(col.mean())

    modes = col.mode()
    if modes.empty:
        # Nothing but nulls: there is no value to impute with.
        return col
    return col.fillna(modes.iloc[0])

def replace_nulls_phase_1(df):
    """Run ``replace_null_with_mean`` over every column of ``df``.

    Returns the frame assembled from the per-column results.
    """
    imputed = df.apply(replace_null_with_mean, axis="index")
    return imputed

#### CASE 4: Group attributes with similar values and replace null values with mean or mode values of that specific group

In [None]:
# Function to return attributes with greatest correlation with the provided attribute
def top_correlations(df, attribute, count=2):
    """Return the ``count`` attributes most strongly correlated with ``attribute``.

    Strength is the absolute value of the correlation coefficient taken from
    ``df.corr()``; the returned values keep their original sign.

    The attribute's own entry is removed by label. Relying on it being sorted
    first is unsafe: its self-correlation is NaN for a constant column (NaN
    sorts last), and it can tie with a perfectly correlated attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compute correlations on.
    attribute : str
        Column whose correlations are ranked.
    count : int, optional
        Number of attributes to return (default 2).

    Returns
    -------
    pandas.Series
        Signed correlations with ``attribute``, strongest first, indexed by
        attribute name. Shorter than ``count`` if fewer attributes exist.
    """
    print(attribute)
    correlation_attribute = df.corr()[attribute]
    others = correlation_attribute.drop(labels=attribute)
    strongest = others.abs().sort_values(ascending=False).index[:count]
    return others.loc[strongest]
    

In [None]:
# get attributes with at least 1 cell having a null value
def get_attributes_with_nulls(df):
    """Return the names of the columns of ``df`` that contain a null value.

    The check runs on the frame passed in, not on a notebook global.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, having at least one null cell.
    """
    has_null = df.isna().any()
    return has_null[has_null].index.tolist()

### Classification Models :

#### Logistic Regression 

In [None]:
def create_logistic_regression_model(df, test_size=0.75, random_state=0):
    """Fit a logistic regression on ``df`` and return its held-out score.

    ``Survived`` is the target; every other column is used as a feature. The
    rows are split with ``train_test_split``, a ``liblinear`` logistic
    regression is fitted on the training part, and the result of
    ``clf.score`` on the test part is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Survived`` column and model-ready feature columns.
    test_size : float, optional
        Passed to ``train_test_split`` (default 0.75, the value this function
        has always used).
        NOTE(review): 0.75 holds out three quarters of the rows for testing;
        confirm this is intended rather than a 75% training split.
    random_state : int, optional
        Seed passed to ``train_test_split`` (default 0).

    Returns
    -------
    float
        Score of the fitted classifier on the held-out rows.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    features = df.drop(columns='Survived')
    target = df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    clf = LogisticRegression(solver='liblinear')  # liblinear: good for small datasets
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)

### Test Cases :

In [None]:
# Load the raw training and test CSVs (relative paths) through import_data.
titanic_train = import_data("./../../../Datasets/Titanic/train.csv");
titanic_test =  import_data("./../../../Datasets/Titanic/test.csv")

In [None]:
# Preprocessing phase 1 for the training dataset: run data_preprocessor on it
# and show the first rows of the result.
titanic_train = data_preprocessor(titanic_train)
print_head(titanic_train)

In [None]:
# Apply the same preprocessing to the test dataset and show its first rows.
titanic_test = data_preprocessor(titanic_test)
print_head(titanic_test)

#### Test Case 1:

In [None]:
# Test case 1: turn the 404 sentinel back into NaN, drop every column that
# still holds a null, then score a logistic regression on what is left.
# NOTE(review): replace({404: np.nan}) hits every cell equal to 404, not only
# the cells the preprocessor marked as missing - confirm no real value is 404.
titanic_train_1 = drop_null_attributes(titanic_train.replace({404: np.nan}))
create_logistic_regression_model(titanic_train_1)

#### Test Case 2:

In [None]:
# Test case 2: turn the 404 sentinel back into NaN, drop every row that still
# holds a null, then score a logistic regression on the remaining rows.
titanic_train_2 = drop_null_rows(titanic_train.replace({404: np.nan}))
create_logistic_regression_model(titanic_train_2)

#### Test Case 3:

In [None]:
# Test case 3: turn the 404 sentinel back into NaN, impute the nulls column
# by column with replace_nulls_phase_1, then score a logistic regression.
titanic_train_3 = replace_nulls_phase_1(titanic_train.replace({404: np.nan}))
create_logistic_regression_model(titanic_train_3)

#### Test Case 4:

In [None]:
# Test case 4 setup: restore NaN in place of the 404 sentinel, then cast the
# Class column to plain integers, and preview the result.
titanic_train_4 = titanic_train.replace({404: np.nan})
titanic_train_4 = titanic_train_4.astype({'Class': int})
titanic_train_4.head()

In [None]:
# Pairwise correlation matrix of the columns of titanic_train_4.
correlation = titanic_train_4.corr()

In [None]:
# Bare expression: shows the correlation matrix as the cell's output.
correlation

In [None]:
# Signed correlations of every attribute with Age, largest first.
correlation["Age"].sort_values(ascending = False)

In [None]:
# Magnitude of each Age correlation (negative values are negated), so that
# attributes can be ranked by strength regardless of sign.
correlation["Age_mod"] = [ x if x > 0 else -1*x for x in correlation["Age"]]

In [None]:
# Attributes ranked by the magnitude of their correlation with Age.
correlation["Age_mod"].sort_values(ascending = False)

In [None]:
# Attributes most correlated with Cabin (top_correlations' default count).
data = top_correlations(titanic_train_4, 'Cabin')

In [None]:
# Ask get_attributes_with_nulls for the null-containing columns of titanic_train_4.
get_attributes_with_nulls(titanic_train_4)