# Titanic dataset

## import libraries

In [198]:
import pandas as pd
import numpy as np
import seaborn as sns

## load dataset

In [199]:
# Load seaborn's example "titanic" dataset into a DataFrame.
df = sns.load_dataset("titanic")

In [200]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(891, 15)

In [201]:
# Preview the first five rows.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [202]:
# List the column labels.
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

## creat datadict

* #### Check Data Types

In [203]:
# One-column table mapping each column name to its pandas dtype.
datatypes = df.dtypes.to_frame(name="data types")

In [204]:
# Display the dtype table built above.
datatypes

Unnamed: 0,data types
survived,int64
pclass,int64
sex,object
age,float64
sibsp,int64
parch,int64
fare,float64
embarked,object
class,category
who,object


In [205]:
# Number of unique values in every object-dtype column.
object_columns = [name for name, dtype in df.dtypes.items() if dtype == object]
{name: df[name].nunique() for name in object_columns}

{'sex': 2, 'embarked': 3, 'who': 3, 'embark_town': 3, 'alive': 2}

In [206]:
# Convert the low-cardinality text columns to the pandas `category` dtype.
df = df.astype(
    dict.fromkeys(
        ["alive", "sex", "embarked", "who", "embark_town"],
        "category",
    )
)

In [207]:
# Check the category labels of `alive`; each label's position in this index
# is the integer code that `.cat.codes` assigns to it.
df["alive"].cat.categories

Index(['no', 'yes'], dtype='object')

In [208]:
# Category labels of `sex`; each label's position is its integer code.
df["sex"].cat.categories

Index(['female', 'male'], dtype='object')

In [209]:
# Replace the labels with their integer category codes (no -> 0, yes -> 1).
# Note: re-running this cell fails, because the column is no longer categorical.
df["alive"] = df["alive"].cat.codes

In [210]:
# Replace the labels with their integer category codes (female -> 0, male -> 1).
# Note: re-running this cell fails, because the column is no longer categorical.
df["sex"] = df["sex"].cat.codes

In [211]:
# Start the data dictionary: one row per column of `df`, holding the dtype
# name as a string so it can be filtered with string operations below.
dtype_names = df.dtypes.astype(str)
datadict = dtype_names.to_frame(name="dtypes")
datadict

Unnamed: 0,dtypes
survived,int64
pclass,int64
sex,int8
age,float64
sibsp,int64
parch,int64
fare,float64
embarked,category
class,category
who,category


In [212]:
# Rows of the data dictionary whose dtype is exactly "category".
datadict.loc[datadict["dtypes"].eq("category")]

Unnamed: 0,dtypes
embarked,category
class,category
who,category
deck,category
embark_town,category


In [213]:
# Rows of the data dictionary whose dtype name contains "int" (int8, int64, ...).
is_integer = datadict["dtypes"].str.contains("int")
datadict.loc[is_integer]

Unnamed: 0,dtypes
survived,int64
pclass,int64
sex,int8
sibsp,int64
parch,int64
alive,int8


* #### column definitions

In [214]:
# Human-readable description of every column, keyed by column name.
# Assigning a Series aligns on datadict's index (the column names), so the
# descriptions cannot silently shift if the column order changes, and the
# cell does not raise when the number of rows differs from the number of entries.
# NOTE(review): "who" is described as "sex" but has three distinct values, and
# "adult_male" is described as "is adult" -- confirm whether these should be refined.
column_definitions = {
    "survived": "Survival",
    "pclass": "A proxy for socio-economic status (SES)",
    "sex": "Sex",
    "age": "Age in years",
    "sibsp": "# of siblings(brother, sister) and spouses ( husband, wife )",
    "parch": '# of parents / children aboard the Titanic',
    "fare": 'Passenger fare',
    "embarked": 'Port of Embarkation',
    "class": "class",
    "who": "sex",
    "adult_male": 'is adult',
    "deck": 'deck',
    "embark_town": 'Port of Embarkation',
    "alive": 'Survival',
    "alone": 'is alone',
}
datadict["definations"] = pd.Series(column_definitions)
datadict

Unnamed: 0,dtypes,definations
survived,int64,Survival
pclass,int64,A proxy for socio-economic status (SES)
sex,int8,Sex
age,float64,Age in years
sibsp,int64,"# of siblings(brother, sister) and spouses ( h..."
parch,int64,# of parents / children aboard the Titanic
fare,float64,Passenger fare
embarked,category,Port of Embarkation
class,category,class
who,category,sex


* #### check missing values

In [215]:
# Count missing entries per column; the assignment aligns on column name.
missing_counts = df.isna().sum()
datadict["MissingVal"] = missing_counts
datadict

Unnamed: 0,dtypes,definations,MissingVal
survived,int64,Survival,0
pclass,int64,A proxy for socio-economic status (SES),0
sex,int8,Sex,0
age,float64,Age in years,177
sibsp,int64,"# of siblings(brother, sister) and spouses ( h...",0
parch,int64,# of parents / children aboard the Titanic,0
fare,float64,Passenger fare,0
embarked,category,Port of Embarkation,2
class,category,class,0
who,category,sex,0


* #### check number of unique values

In [216]:
# Distinct values per column (nunique skips NaN by default), aligned to
# datadict's rows by column name.
datadict['NUnique']=df.nunique()
datadict

Unnamed: 0,dtypes,definations,MissingVal,NUnique
survived,int64,Survival,0,2
pclass,int64,A proxy for socio-economic status (SES),0,3
sex,int8,Sex,0,2
age,float64,Age in years,177,88
sibsp,int64,"# of siblings(brother, sister) and spouses ( h...",0,7
parch,int64,# of parents / children aboard the Titanic,0,7
fare,float64,Passenger fare,0,248
embarked,category,Port of Embarkation,2,3
class,category,class,0,3
who,category,sex,0,3


* #### check count of each column

In [217]:
# Non-null observations per column, aligned to datadict's rows by column name.
datadict['Count']=df.count()
datadict

Unnamed: 0,dtypes,definations,MissingVal,NUnique,Count
survived,int64,Survival,0,2,891
pclass,int64,A proxy for socio-economic status (SES),0,3,891
sex,int8,Sex,0,2,891
age,float64,Age in years,177,88,714
sibsp,int64,"# of siblings(brother, sister) and spouses ( h...",0,7,891
parch,int64,# of parents / children aboard the Titanic,0,7,891
fare,float64,Passenger fare,0,248,891
embarked,category,Port of Embarkation,2,3,889
class,category,class,0,3,891
who,category,sex,0,3,891


## Quick look at descriptive statistics

In [218]:
# Summary statistics for the numeric columns, transposed so each column is one row.
df.describe(include='number').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
sex,891.0,0.647587,0.47799,0.0,0.0,1.0,1.0,1.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
alive,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0


> data is relatively imbalanced (see the `survived` mean: only about 38% of passengers survived)

In [219]:
# Count, number of levels, most frequent level and its frequency for the
# categorical columns, transposed so each column is one row.
df.describe(include='category').T

Unnamed: 0,count,unique,top,freq
embarked,889,3,S,644
class,891,3,Third,491
who,891,3,man,537
deck,203,7,C,59
embark_town,889,3,Southampton,644


In [235]:
# Check whether `embarked` is just the first letter of `embark_town`.
# Rows where both values are missing compare as unequal, so a False result
# does not by itself mean the two columns disagree (see the next cell).
(df.embark_town.str.get(0) == df.embarked).all()

False

In [236]:
# Show the rows where the first letter of `embark_town` does not equal `embarked`.
df.loc[df.embark_town.str.get(0) != df.embarked]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
61,1,1,0,38.0,0,0,80.0,,First,woman,False,B,,1,True
829,1,1,0,62.0,0,0,80.0,,First,woman,False,B,,1,True


In [238]:
# `embark_town` carries the same information as `embarked` (the only
# mismatching rows above are the two where both are missing), so drop it.
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise KeyError.
df = df.drop(columns="embark_town", errors="ignore")

## discover the data to get insights

In [220]:
# Work on a copy so the exploration below cannot corrupt the original frame.
titanic = df.copy()

In [221]:
# Preview the working copy.
# NOTE(review): the saved output still lists `embark_town`, and the cell that
# drops it has a higher execution count than this one -- the notebook appears
# to have been run out of order; re-run top to bottom to refresh the outputs.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,S,Third,man,True,,Southampton,0,False
1,1,1,0,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,1,False
2,1,3,0,26.0,0,0,7.925,S,Third,woman,False,,Southampton,1,True
3,1,1,0,35.0,1,0,53.1,S,First,woman,False,C,Southampton,1,False
4,0,3,1,35.0,0,0,8.05,S,Third,man,True,,Southampton,0,True


## looking for correlations

In [222]:
# Pairwise correlation matrix over the numeric (and boolean) columns only.
titanic.corr(numeric_only= True)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alive,alone
survived,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
pclass,-0.338481,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495,0.094035,-0.338481,0.135207
sex,-0.543351,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333,0.908578,-0.543351,0.303646
age,-0.077221,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067,0.280328,-0.077221,0.19827
sibsp,-0.035322,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.035322,-0.584471
parch,0.081629,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225,-0.349943,0.081629,-0.583398
fare,0.257307,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0,-0.182024,0.257307,-0.271832
adult_male,-0.55708,0.094035,0.908578,0.280328,-0.253586,-0.349943,-0.182024,1.0,-0.55708,0.404744
alive,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
alone,-0.203367,0.135207,0.303646,0.19827,-0.584471,-0.583398,-0.271832,0.404744,-0.203367,1.0


In [223]:
# Keep the correlation matrix for the cells below and render it with a
# green colour gradient.
corrdf = titanic.corr(numeric_only= True)
corrdf.style.background_gradient(cmap="Greens")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alive,alone
survived,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
pclass,-0.338481,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495,0.094035,-0.338481,0.135207
sex,-0.543351,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333,0.908578,-0.543351,0.303646
age,-0.077221,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067,0.280328,-0.077221,0.19827
sibsp,-0.035322,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.035322,-0.584471
parch,0.081629,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225,-0.349943,0.081629,-0.583398
fare,0.257307,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0,-0.182024,0.257307,-0.271832
adult_male,-0.55708,0.094035,0.908578,0.280328,-0.253586,-0.349943,-0.182024,1.0,-0.55708,0.404744
alive,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
alone,-0.203367,0.135207,0.303646,0.19827,-0.584471,-0.583398,-0.271832,0.404744,-0.203367,1.0


In [224]:
# Highlight the smallest (most negative) correlation in each column.
corrdf.style.highlight_min(axis=0)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alive,alone
survived,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
pclass,-0.338481,1.0,0.1319,-0.369226,0.083081,0.018443,-0.5495,0.094035,-0.338481,0.135207
sex,-0.543351,0.1319,1.0,0.093254,-0.114631,-0.245489,-0.182333,0.908578,-0.543351,0.303646
age,-0.077221,-0.369226,0.093254,1.0,-0.308247,-0.189119,0.096067,0.280328,-0.077221,0.19827
sibsp,-0.035322,0.083081,-0.114631,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.035322,-0.584471
parch,0.081629,0.018443,-0.245489,-0.189119,0.414838,1.0,0.216225,-0.349943,0.081629,-0.583398
fare,0.257307,-0.5495,-0.182333,0.096067,0.159651,0.216225,1.0,-0.182024,0.257307,-0.271832
adult_male,-0.55708,0.094035,0.908578,0.280328,-0.253586,-0.349943,-0.182024,1.0,-0.55708,0.404744
alive,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
alone,-0.203367,0.135207,0.303646,0.19827,-0.584471,-0.583398,-0.271832,0.404744,-0.203367,1.0


In [225]:
# Blank out the diagonal (each column's correlation with itself, always 1.0)
# so it cannot dominate the max-highlighting and the sorted rankings below.
# DataFrame.mask returns a new frame with NaN wherever the condition is True.
# Writing through `corrdf.values[...]` is not guaranteed to modify the frame:
# `.values` may be a copy, and it is read-only under pandas Copy-on-Write.
mask = np.eye(len(corrdf), dtype=bool)
corrdf = corrdf.mask(mask)


In [226]:
# Highlight the largest remaining correlation in each column (the diagonal is NaN now).
corrdf.style.highlight_max(axis=0)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alive,alone
survived,,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,1.0,-0.203367
pclass,-0.338481,,0.1319,-0.369226,0.083081,0.018443,-0.5495,0.094035,-0.338481,0.135207
sex,-0.543351,0.1319,,0.093254,-0.114631,-0.245489,-0.182333,0.908578,-0.543351,0.303646
age,-0.077221,-0.369226,0.093254,,-0.308247,-0.189119,0.096067,0.280328,-0.077221,0.19827
sibsp,-0.035322,0.083081,-0.114631,-0.308247,,0.414838,0.159651,-0.253586,-0.035322,-0.584471
parch,0.081629,0.018443,-0.245489,-0.189119,0.414838,,0.216225,-0.349943,0.081629,-0.583398
fare,0.257307,-0.5495,-0.182333,0.096067,0.159651,0.216225,,-0.182024,0.257307,-0.271832
adult_male,-0.55708,0.094035,0.908578,0.280328,-0.253586,-0.349943,-0.182024,,-0.55708,0.404744
alive,1.0,-0.338481,-0.543351,-0.077221,-0.035322,0.081629,0.257307,-0.55708,,-0.203367
alone,-0.203367,0.135207,0.303646,0.19827,-0.584471,-0.583398,-0.271832,0.404744,-0.203367,


In [227]:
# Rank the other columns by absolute correlation with `survived`; the NaN
# entry is the masked self-correlation.
corrdf["survived"].abs().sort_values(ascending=False)

alive         1.000000
adult_male    0.557080
sex           0.543351
pclass        0.338481
fare          0.257307
alone         0.203367
parch         0.081629
age           0.077221
sibsp         0.035322
survived           NaN
Name: survived, dtype: float64

`survived` and `alive` are identical (correlation of 1.0), so we don't need the `alive` column in our dataset.

In [228]:
# `alive` is perfectly correlated with `survived` (|r| = 1.0 above), so it is redundant.
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise KeyError.
titanic = titanic.drop(columns="alive", errors="ignore")

In [229]:
# Look up the data-dictionary entry for `adult_male`, the column most
# strongly correlated with `survived` apart from `alive`.
datadict.loc["adult_male"]

dtypes             bool
definations    is adult
MissingVal            0
NUnique               2
Count               891
Name: adult_male, dtype: object

In [230]:
# Rank the other columns by absolute correlation with `fare`; the NaN entry
# is the masked self-correlation.
corrdf["fare"].abs().sort_values(ascending=False)

pclass        0.549500
alone         0.271832
survived      0.257307
alive         0.257307
parch         0.216225
sex           0.182333
adult_male    0.182024
sibsp         0.159651
age           0.096067
fare               NaN
Name: fare, dtype: float64