In [1]:
# importing packages
import string
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
# setting package behaviors
## set Seaborn style
sns.set(style="darkgrid")
## make IPython display the value of every top-level expression in a cell,
## not only the last one (later cells rely on this to show several results)
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# reading in data
DATA_DIR = "/kaggle/input/titanic"  # Kaggle input directory for the Titanic competition
SAMPLE_SEED = 42  # fixed seed so the random preview below is the same on every run

df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
# stack train and test into one frame; sort=True orders the (non-aligned) columns
# and ignore_index=True gives the combined frame a fresh 0..n-1 index
df_all = pd.concat([df_train, df_test], sort=True, ignore_index=True)

# human-readable labels, stored as plain Python attributes on each frame
df_train.name = "Training Set"
df_test.name = "Test Set"
df_all.name = "All Set"

print(f"Shape of Training Set: {df_train.shape}")
print(f"Shape of Test Set: {df_test.shape}")

# DataFrame.sample() returns a random sample of the given size from the frame
# (df_train.head(10) would show the first rows instead).
# random_state pins the internal random process so the sample is reproducible.
df_train.sample(10, random_state=SAMPLE_SEED)

## general info about the data (types and null info)
df_train.info()

## statistical basics
df_train.describe()

## memory usage in bytes is available per column via df_train.memory_usage(),
## but the total is already reported at the bottom of df_train.info() above

Shape of Training Set: (891, 12)
Shape of Test Set: (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
472,473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33.0,1,2,C.A. 34651,27.75,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S
548,549,0,3,"Goldsmith, Mr. Frank John",male,33.0,1,1,363291,20.525,,S
427,428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louis...",female,19.0,0,0,250655,26.0,,S
815,816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,S
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C
257,258,1,1,"Cherry, Miss. Gladys",female,30.0,0,0,110152,86.5,B77,S
777,778,1,3,"Emanuel, Miss. Virginia Ethel",female,5.0,0,0,364516,12.475,,S
284,285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26.0,A19,S


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Explore df.columns

`df.columns` is a special attribute of a DataFrame. It contains the names of all attributes (the so-called column names) in the DataFrame. The names are stored in a `pd.Index` object, which behaves similarly to a `pd.Series`.

Methods like `tolist()` (mirroring NumPy's `ndarray.tolist()`) can be used to convert a `pd.Index` to a plain Python `list`. It is not necessary to convert a `pd.Index` to a list in order to iterate over it, because `pd.Index` is itself iterable. Converting first is still recommended when the code needs a real `list`, for example to modify it or to pass it to an API that expects one.

In [3]:
# explore df.columns: first the raw Index object and its type ...
train_columns = df_train.columns
train_columns
print(type(train_columns))

# ... then the same labels converted to a plain Python list
train_column_names = train_columns.tolist()
train_column_names
print(type(train_column_names))

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

<class 'pandas.core.indexes.base.Index'>


['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

<class 'list'>


## Detecting corrupted values and summation

### Corrupted values

The `isnull()` method can be used to detect null values in a DataFrame or a Series. This is particularly useful for finding out how much data is missing before cleaning it.

**`isna()` is equivalent to `isnull()`. In fact, `isnull()` is simply an alias for `isna()`.**

When applied to a DataFrame, the return value is a DataFrame of booleans; when applied to a Series, it is a Series of booleans.

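**In Pandas, corrupted (missing) values are represented by NaN (derived from `np.nan`); `None` is treated as missing as well. The empty string `''` and `np.inf` are not considered missing. Set `pandas.options.mode.use_inf_as_na = True` if you wish to alter this behavior for `np.inf` (note that this option is deprecated in recent pandas versions; converting `inf` to `NaN` explicitly is preferred).**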

### Summation

`sum()` can be used to calculate sums in Pandas.

A trick to get the total number of NaNs in a Series is to use

`series.isnull().sum()`

This works because, in arithmetic, the Python interpreter automatically treats `True` as 1 and `False` as 0, so summing a Boolean Series counts its `True` entries.

In [4]:
# Boolean null mask for one column only (here: the first column of the frame)
first_column = df_train.columns[0]
df_train[first_column].isnull()

# The frame-wide mask (df_train.isnull()) is a DataFrame of booleans;
# summing it column by column yields the number of nulls per column.
null_mask = df_train.isnull()
null_mask.sum()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: PassengerId, Length: 891, dtype: bool

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Booleans take part in sums as numbers: True counts as 1 and False as 0,
# so this mixed Series adds up to 0 + 0 + 1.1 + 1.
mixed_values = [0, False, 1.1, True]
pd.Series(mixed_values).sum()

# The same holds for plain Python booleans.
True + True

2.1

2