In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [77]:
# Import the data
df = pd.read_csv("C:/Users/z011348/Desktop/ML/input/titanic/train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [78]:
# Make a copy of original dataframe - for future reference
df_tmp = df.copy()

In [79]:
# There are some column data is not needed for predictions. So we will delete them from Dataframe
df_tmp.drop(['Name','Ticket', 'PassengerId', 'Fare'],axis=1,inplace=True)

In [80]:
df_tmp.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
5,0,3,male,,0,0,,Q
6,0,1,male,54.0,0,0,E46,S
7,0,3,male,2.0,3,1,,S
8,1,3,female,27.0,0,2,,S
9,1,2,female,14.0,1,0,,C


In [81]:
df_tmp.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Cabin       687
Embarked      2
dtype: int64

In [82]:
# Import the data and drop missing labels
df_tmp.dropna(subset=["Survived"], inplace=True)

In [83]:
df_tmp

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,,S
887,1,1,female,19.0,0,0,B42,S
888,0,3,female,,1,2,,S
889,1,1,male,26.0,0,0,C148,C


In [84]:
# Importing the SimpleImputer class 
from sklearn.impute import SimpleImputer

In [85]:
# Imputer object using the mean strategy and  
# missing_values type for imputation 
imputer = SimpleImputer(missing_values = np.nan,  
                        strategy ='mean')

In [86]:
df_tmp.Age = imputer.fit_transform(df_tmp['Age'].values.reshape(-1,1))[:,0]

In [87]:
imputer = SimpleImputer(missing_values = np.nan,  
                        strategy ='most_frequent')

In [88]:
df_tmp.Cabin = imputer.fit_transform(df_tmp['Cabin'].values.reshape(-1,1))[:,0]

In [89]:
df_tmp.Embarked = imputer.fit_transform(df_tmp['Embarked'].values.reshape(-1,1))[:,0]

In [90]:
df_tmp

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.000000,1,0,B96 B98,S
1,1,1,female,38.000000,1,0,C85,C
2,1,3,female,26.000000,0,0,B96 B98,S
3,1,1,female,35.000000,1,0,C123,S
4,0,3,male,35.000000,0,0,B96 B98,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,B96 B98,S
887,1,1,female,19.000000,0,0,B42,S
888,0,3,female,29.699118,1,2,B96 B98,S
889,1,1,male,26.000000,0,0,C148,C


In [91]:
df_tmp.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Cabin       0
Embarked    0
dtype: int64

In [92]:
df_object = df_tmp.select_dtypes('object')
df_object

Unnamed: 0,Sex,Cabin,Embarked
0,male,B96 B98,S
1,female,C85,C
2,female,B96 B98,S
3,female,C123,S
4,male,B96 B98,S
...,...,...,...
886,male,B96 B98,S
887,female,B42,S
888,female,B96 B98,S
889,male,C148,C


In [93]:
#########################################################
############  OneHot ####################################
#########################################################

In [94]:
# for Data ready
from sklearn.preprocessing import OneHotEncoder

In [95]:
# creating instance of one-hot-encoder
ohe = OneHotEncoder(handle_unknown='ignore')

In [96]:
ohe.fit(df_object)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [97]:
codes = ohe.transform(df_object).toarray()

In [98]:
feature_names = ohe.get_feature_names(['Sex', 'Cabin', 'Embarked'])

In [99]:
df_x = pd.concat([df_tmp.select_dtypes(exclude='object'), 
               pd.DataFrame(codes,columns=feature_names).astype(int)], axis=1)

In [100]:
df_x

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Cabin_A10,Cabin_A14,Cabin_A16,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,38.000000,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,3,26.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,35.000000,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,3,35.000000,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
887,1,1,19.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
888,0,3,29.699118,1,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
889,1,1,26.000000,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [101]:
X = df_x.drop("Survived", axis=1)
y = df_x['Survived']

np.random.seed(42)
# split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 156), (179, 156), (712,), (179,))

In [123]:
model = LogisticRegression()

In [124]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [125]:
model.score(X_test, y_test)

0.8156424581005587