In [1]:
# importing packages
import string
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
# Package behaviour
# -----------------
# (The stray bare `InteractiveShell` expression was removed: it did nothing on a
#  fresh kernel and echoed the class repr when this cell was re-run after the
#  "all" setting below had taken effect.)

## Seaborn: use the "darkgrid" style for every plot
sns.set(style = "darkgrid")
## IPython: display the value of every top-level expression in a cell,
## not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# reading in data
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

## sort = True sorts the columns in alphabetical order (default: False).
## reset_index() installs a new sequential index; the old index would be kept as a column,
## so drop = True is used to discard it.
df_all = pd.concat([df_train, df_test], sort = True).reset_index(drop = True)

## attach a human-readable label to each frame as a plain Python attribute
df_train.name = "Training Set"
df_test.name = "Test Set"
df_all.name = "All Set"

print("Shape of Training Set: {}".format(df_train.shape))
print("Shape of Test Set: {}".format(df_test.shape))
# df_train.head(10)

# DataFrame.sample() returns a random sample of the given size from the DataFrame.
# random_state can be passed to fix the internal random process and reproduce the sample;
# it is not set here, so the rows shown change on every run.

df_train.sample(10)

## general info about the data (types and null info)
df_train.info()

## statistical basics
df_train.describe()

## memory usage (measured in bytes)
## (virtually unnecessary because the total memory usage is also reported by df.info())
df_train.memory_usage()

Shape of Training Set: (891, 12)
Shape of Test Set: (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
613,614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q
738,739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S
476,477,0,2,"Renouf, Mr. Peter Henry",male,34.0,1,0,31027,21.0,,S
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0,,S
387,388,1,2,"Buss, Miss. Kate",female,36.0,0,0,27849,13.0,,S
354,355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C
72,73,0,2,"Hood, Mr. Ambrose Jr",male,21.0,0,0,S.O.C. 14879,73.5,,S
621,622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42.0,1,0,11753,52.5542,D19,S
197,198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42.0,0,1,4579,8.4042,,S
138,139,0,3,"Osen, Mr. Olaf Elon",male,16.0,0,0,7534,9.2167,,S


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Index           128
PassengerId    7128
Survived       7128
Pclass         7128
Name           7128
Sex            7128
Age            7128
SibSp          7128
Parch          7128
Ticket         7128
Fare           7128
Cabin          7128
Embarked       7128
dtype: int64

## About pandas.loc

+ Stands for "label-based locating"
+ **Contrary to usual Python slices, both the start and the stop are included**
+ Allowed inputs are:

    + A single label, e.g. 5 or 'a', (note that 5 is interpreted as a label of the index, and never as an integer position along the index).

    + A list or array of labels, e.g. ['a', 'b', 'c'].

    + A slice object with labels, e.g. 'a':'f'.

In [3]:
# Helper Functions

def get_all_df(df1, df2):
    """Stack ``df1`` on top of ``df2`` and return the combined DataFrame.

    The concatenation is requested with ``sort=True`` (alphabetical column
    order) and the result receives a fresh sequential index; the original
    row labels of both inputs are discarded.
    """
    frames = [df1, df2]
    stacked = pd.concat(frames, sort = True)
    return stacked.reset_index(drop = True)

def get_original_df(df, df1, df2):
    """Split a concatenated DataFrame back into its two original parts.

    Parameters
    ----------
    df : DataFrame
        Frame whose first ``len(df1)`` rows belong to the first part and
        whose remaining rows belong to the second part.
    df1 : DataFrame
        Only its row count is used, to locate the split point.
    df2 : DataFrame
        Unused; kept so existing callers keep working.

    Returns
    -------
    list
        ``[first_part, second_part]``, both slices of ``df`` that keep
        ``df``'s row labels.
    """
    # Split by position rather than by label: label-based .loc slicing only
    # gives the right rows when df carries a fresh 0..n-1 index, and raises on
    # duplicated labels (e.g. a concat result whose index was never reset).
    n_first = df1.shape[0]
    return [
        df.iloc[:n_first],
        df.iloc[n_first:]
    ]

In [4]:
# Round trip: merge the two sets, then split the merged frame apart again
# and print the .info() summary of every stage.
test1 = get_all_df(df1 = df_train, df2 = df_test)
test1.info()

test2, test3 = get_original_df(df = test1, df1 = df_train, df2 = df_test)
test2.info()
test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1046 non-null   float64
 1   Cabin        295 non-null    object 
 2   Embarked     1307 non-null   object 
 3   Fare         1308 non-null   float64
 4   Name         1309 non-null   object 
 5   Parch        1309 non-null   int64  
 6   PassengerId  1309 non-null   int64  
 7   Pclass       1309 non-null   int64  
 8   Sex          1309 non-null   object 
 9   SibSp        1309 non-null   int64  
 10  Survived     891 non-null    float64
 11  Ticket       1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          714 non-null    float64
 1   Cabin

## Explore df.columns

`df.columns` is a special attribute of a DataFrame. It contains the names of all attributes (the so-called column names) in the DataFrame. The names are stored in an object of type `pd.Index`, which behaves much like a `pd.Series`.

Methods like `tolist()` (named after its numpy counterpart) can be used to convert a `pd.Index` to a plain Python list. It is not necessary to convert a `pd.Index` to a list in order to iterate over it, because `pd.Index` is itself iterable. An explicit conversion is still recommended whenever the code that follows expects a real list rather than an `Index`.

In [5]:
# explore df.columns
# bare expression: shown as cell output because ast_node_interactivity is "all"
df_train.columns
print(type(df_train.columns))
# tolist() turns the Index into a plain Python list
df_train.columns.tolist()
print(type(df_train.columns.tolist()))

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

<class 'pandas.core.indexes.base.Index'>


['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

<class 'list'>


## Detecting corrupted value and summation

### Corrupted values

isnull() method can be used to detect null values in dataframe or series. This is particularly useful.

**isna() is equivalent to isnull(). In fact, isnull() is an alias for isna().**

When applied to a DataFrame, the return value is a DataFrame; when applied to a Series, it is a Series.

**In pandas, all corrupted (missing) values are represented using NaN (derived from np.nan); the empty string `''` and `np.inf` are not considered to be among them. Set `pandas.options.mode.use_inf_as_na = True` if you wish to alter this behavior for `np.inf`.**

### Summation

sum() can be applied to calculate a summation in pandas.

A trick to get the total number of NaNs in a Series is to use

`series.isnull().sum()`

because, in arithmetic, Python automatically treats the Boolean True as 1 and False as 0.

In [6]:
# Null mask of the first column: True wherever the value is missing.
df_train[df_train.columns[0]].isnull()
# (whole-frame variant: df_train.isnull())

# Booleans count as numbers when summed: 0 + False + 1.1 + True == 2.1
pd.Series([0, False, 1.1, True]).sum()
# ... and in plain Python arithmetic as well
True + True

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: PassengerId, Length: 891, dtype: bool

2.1

2

In [7]:
# Number of missing values per column (each True in the mask counts as 1).
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Absolute pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# string columns (Name, Sex, Ticket, ...) silently, so a bare DataFrame.corr()
# raises on this frame.
df_train.select_dtypes(include = "number").corr().abs()
# Number of columns in each set (the recorded output shows 12 for train, 11 for test).
df_train.columns.size
df_test.columns.size

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,0.005007,0.035144,0.036847,0.057527,0.001652,0.012658
Survived,0.005007,1.0,0.338481,0.077221,0.035322,0.081629,0.257307
Pclass,0.035144,0.338481,1.0,0.369226,0.083081,0.018443,0.5495
Age,0.036847,0.077221,0.369226,1.0,0.308247,0.189119,0.096067
SibSp,0.057527,0.035322,0.083081,0.308247,1.0,0.414838,0.159651
Parch,0.001652,0.081629,0.018443,0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,0.5495,0.096067,0.159651,0.216225,1.0


12

11

## Handling Missing Values

### About some functions

#### unstack()

In [9]:
# Absolute correlation matrix of the combined set, computed once.
# The numeric columns are selected explicitly because pandas >= 2.0 makes a
# bare DataFrame.corr() raise on the string columns of this frame.
df_all_corr_matrix = df_all.select_dtypes(include = "number").corr().abs()
df_all_corr_matrix

# unstack() turns the square matrix into a long Series indexed by
# (feature, feature) pairs, as shown in the second output.
df_all_corr = df_all_corr_matrix.unstack()
# NOTE: the rename below only applies once the Series has been turned back into a
# DataFrame (e.g. via reset_index()), which creates the level_0 / level_1 / 0 columns.
# df_all_corr.rename(columns={"level_0": "Featured 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
df_all_corr

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
Age,1.0,0.17874,0.150917,0.028814,0.408106,0.243699,0.077221
Fare,0.17874,1.0,0.221539,0.031428,0.558629,0.160238,0.257307
Parch,0.150917,0.221539,1.0,0.008942,0.018322,0.373587,0.081629
PassengerId,0.028814,0.031428,0.008942,1.0,0.038354,0.055224,0.005007
Pclass,0.408106,0.558629,0.018322,0.038354,1.0,0.060832,0.338481
SibSp,0.243699,0.160238,0.373587,0.055224,0.060832,1.0,0.035322
Survived,0.077221,0.257307,0.081629,0.005007,0.338481,0.035322,1.0


Age          Age            1.000000
             Fare           0.178740
             Parch          0.150917
             PassengerId    0.028814
             Pclass         0.408106
             SibSp          0.243699
             Survived       0.077221
Fare         Age            0.178740
             Fare           1.000000
             Parch          0.221539
             PassengerId    0.031428
             Pclass         0.558629
             SibSp          0.160238
             Survived       0.257307
Parch        Age            0.150917
             Fare           0.221539
             Parch          1.000000
             PassengerId    0.008942
             Pclass         0.018322
             SibSp          0.373587
             Survived       0.081629
PassengerId  Age            0.028814
             Fare           0.031428
             Parch          0.008942
             PassengerId    1.000000
             Pclass         0.038354
             SibSp          0.055224
 