In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [2]:
# Import the data
df = pd.read_csv("C:/Users/z011348/Desktop/ML/input/titanic/train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Make a copy of original dataframe - for future reference
df_tmp = df.copy()

In [4]:
# There are some column data is not needed for predictions. So we will delete them from Dataframe
df_tmp.drop(['Name','Ticket', 'PassengerId', 'Fare'],axis=1,inplace=True)

In [5]:
df_tmp.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
5,0,3,male,,0,0,,Q
6,0,1,male,54.0,0,0,E46,S
7,0,3,male,2.0,3,1,,S
8,1,3,female,27.0,0,2,,S
9,1,2,female,14.0,1,0,,C


In [6]:
df_tmp.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Cabin       687
Embarked      2
dtype: int64

In [7]:
# Import the data and drop missing labels
df_tmp.dropna(subset=["Survived"], inplace=True)

In [8]:
df_tmp

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,,S
887,1,1,female,19.0,0,0,B42,S
888,0,3,female,,1,2,,S
889,1,1,male,26.0,0,0,C148,C


In [9]:
# Importing the SimpleImputer class 
from sklearn.impute import SimpleImputer

In [10]:
# Imputer object using the mean strategy and  
# missing_values type for imputation 
imputer = SimpleImputer(missing_values = np.nan,  
                        strategy ='mean')

In [11]:
df_tmp.Age = imputer.fit_transform(df_tmp['Age'].values.reshape(-1,1))[:,0]

In [12]:
imputer = SimpleImputer(missing_values = np.nan,  
                        strategy ='most_frequent')

In [13]:
df_tmp.Cabin = imputer.fit_transform(df_tmp['Cabin'].values.reshape(-1,1))[:,0]

In [14]:
df_tmp.Embarked = imputer.fit_transform(df_tmp['Embarked'].values.reshape(-1,1))[:,0]

In [15]:
df_tmp

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.000000,1,0,B96 B98,S
1,1,1,female,38.000000,1,0,C85,C
2,1,3,female,26.000000,0,0,B96 B98,S
3,1,1,female,35.000000,1,0,C123,S
4,0,3,male,35.000000,0,0,B96 B98,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,B96 B98,S
887,1,1,female,19.000000,0,0,B42,S
888,0,3,female,29.699118,1,2,B96 B98,S
889,1,1,male,26.000000,0,0,C148,C


In [16]:
df_tmp.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Cabin       0
Embarked    0
dtype: int64

In [45]:
lr = LogisticRegression()

In [49]:
# Split into X and y (on train set)
X = df_tmp.drop("Survived", axis=1)
y = df_tmp["Survived"]

In [50]:
X.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Cabin       0
Embarked    0
dtype: int64

In [51]:
#df_object = df_tmp.select_dtypes('object')
#df_object

In [52]:
#########################################################
############  OneHot ####################################
#########################################################

In [53]:
# for Data ready
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [54]:
ohe = OneHotEncoder(sparse=False)

In [55]:
c_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Cabin', 'Embarked']),
                                 remainder='passthrough')

In [56]:
c_trans.fit(X)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('onehotencoder',
                                 OneHotEncoder(categories='auto', drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='ignore',
                                               sparse=True),
                                 ['Sex', 'Cabin', 'Embarked'])],
                  verbose=False)

In [57]:
c_trans.transform(X)

<891x156 sparse matrix of type '<class 'numpy.float64'>'
	with 4951 stored elements in Compressed Sparse Row format>

In [58]:
from sklearn.pipeline import make_pipeline 

In [59]:
pipe = make_pipeline(c_trans, lr)

In [62]:
from sklearn.model_selection import cross_val_score

In [70]:
cross_val_score(pipe, X, y, cv=4, scoring='accuracy').mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.803604613582192