In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
data = pd.read_csv('titanic.csv')

In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   float64
 1   survived   1309 non-null   float64
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   float64
 6   parch      1309 non-null   float64
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(7), object(7)
memory usage: 143.4+ KB


In [5]:
data_detail = {'unique':data.nunique(),
              'type':data.dtypes,
              'null':data.isna().sum(),
              'null %':data.isna().sum()/len(data)}

In [6]:
d_detail = pd.DataFrame(data_detail)

In [7]:
d_detail

Unnamed: 0,unique,type,null,null %
pclass,3,float64,1,0.000763
survived,2,float64,1,0.000763
name,1307,object,1,0.000763
sex,2,object,1,0.000763
age,98,float64,264,0.201527
sibsp,7,float64,1,0.000763
parch,8,float64,1,0.000763
ticket,929,object,1,0.000763
fare,281,float64,2,0.001527
cabin,186,object,1015,0.774809


In [8]:
data.shape

(1310, 14)

In [9]:
data = data.drop('cabin',axis=1)

In [10]:
data.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
name,1309,1307,"Connolly, Miss. Kate",2
sex,1309,2,male,843
ticket,1309,929,CA. 2343,11
embarked,1307,3,S,914
boat,486,27,13,39
home.dest,745,369,"New York, NY",64


In [11]:
data = data.fillna('')

In [12]:
data.isnull().sum()

pclass       0
survived     0
name         0
sex          0
age          0
sibsp        0
parch        0
ticket       0
fare         0
embarked     0
boat         0
body         0
home.dest    0
dtype: int64

In [13]:
data.shape

(1310, 13)

In [14]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,S,,,"Montreal, PQ / Chesterville, ON"


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
LE = LabelEncoder()

In [33]:
data['boat'] = LE.fit_transform(data['boat'])
data['embarked'] = LE.fit_transform(data['embarked'])
data['sex'] = LE.fit_transform(data['sex'])
data['home.dest'] = LE.fit_transform(data['home.dest'])

In [34]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",1,29.0,0.0,0.0,24160,211.3375,3,12,,309
1,1.0,1.0,"Allison, Master. Hudson Trevor",2,0.9167,1.0,2.0,113781,151.55,3,3,,231
2,1.0,0.0,"Allison, Miss. Helen Loraine",1,2.0,1.0,2.0,113781,151.55,3,0,,231
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",2,30.0,1.0,2.0,113781,151.55,3,0,135.0,231
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,25.0,1.0,2.0,113781,151.55,3,0,,231


In [45]:
X = data.drop(['name','survived','ticket','embarked','boat','body','home.dest','sex','pclass'],axis=1)
Y = data['survived']
print(X)
print(Y)

     pclass     age sibsp parch      fare  embarked  boat
0       1.0    29.0   0.0   0.0  211.3375         3    12
1       1.0  0.9167   1.0   2.0    151.55         3     3
2       1.0     2.0   1.0   2.0    151.55         3     0
3       1.0    30.0   1.0   2.0    151.55         3     0
4       1.0    25.0   1.0   2.0    151.55         3     0
...     ...     ...   ...   ...       ...       ...   ...
1305    3.0           1.0   0.0   14.4542         1     0
1306    3.0    26.5   0.0   0.0     7.225         1     0
1307    3.0    27.0   0.0   0.0     7.225         1     0
1308    3.0    29.0   0.0   0.0     7.875         3     0
1309                                              0     0

[1310 rows x 7 columns]
0       1.0
1       1.0
2       0.0
3       0.0
4       0.0
       ... 
1305    0.0
1306    0.0
1307    0.0
1308    0.0
1309       
Name: survived, Length: 1310, dtype: object


In [36]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=2,test_size=0.1)

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
LR = LogisticRegression()

In [39]:
LR.fit(x_train,y_train)

ValueError: could not convert string to float: ''