In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [2]:
train = pd.read_csv("train_titanic.csv")
test = pd.read_csv("test_titanic.csv")

In [3]:
train.columns.difference(test.columns)

Index(['Survived'], dtype='object')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df = pd.concat([train,test], ignore_index= True)

In [6]:
df.tail(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1308,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [7]:
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [8]:
df["Age"].fillna(df["Age"].mode()[0], inplace = True)

In [11]:
df["Sex"].value_counts(dropna =False)

male      843
female    466
Name: Sex, dtype: int64

In [13]:
df["Embarked"].value_counts(dropna =False)

S      914
C      270
Q      123
NaN      2
Name: Embarked, dtype: int64

In [14]:
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace = True)

In [16]:
df.replace({"Sex":{"male":0,"female":1},"Embarked":{"S":1,"C":2,"Q":3}}, inplace = True)

In [17]:
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          0
dtype: int64

In [18]:
pre_train = df[df["Survived"].notna()]
pre_test = df[df["Survived"].isna()]

In [19]:
pre_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,2
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1


In [44]:
pre_train.tail(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0.0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0,,1
887,888,1.0,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0,B42,1
888,889,0.0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,24.0,1,2,W./C. 6607,23.45,,1
889,890,1.0,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0,C148,2
890,891,0.0,3,"Dooley, Mr. Patrick",0,32.0,0,0,370376,7.75,,3


In [47]:
x = pre_train.drop(["Survived","PassengerId","Name","Ticket","Cabin"], axis = 1)
y = pre_train["Survived"]

In [48]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [49]:
model = LogisticRegression()

In [50]:
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [51]:
model.coef_, model.intercept_

(array([[-0.95142991,  2.59489218, -0.02865689, -0.29674397, -0.1041223 ,
          0.00297134,  0.19872445]]),
 array([1.30892995]))

In [52]:
y_pred = model.predict(x_train)
print('Train accuracy')
print('accuracy score',accuracy_score(y_train,y_pred))
print('f1 score',f1_score(y_train,y_pred))
print('confusion matrix\n',confusion_matrix(y_train,y_pred))

Train accuracy
accuracy score 0.800561797752809
f1 score 0.720472440944882
confusion matrix
 [[387  57]
 [ 85 183]]


In [55]:
y_test_pred = model.predict(x_test)
print('Test accuracy')
print('accuracy score',accuracy_score(y_test,y_test_pred))
print('f1 score',f1_score(y_test,y_test_pred))
print('confusion matrix\n',confusion_matrix(y_test,y_test_pred))

Test accuracy
accuracy score 0.7988826815642458
f1 score 0.75
confusion matrix
 [[89 16]
 [20 54]]
