# Understanding Your Data

In [1]:
import pandas as pd

In [2]:
# Load the Titanic dataset from a CSV file in the current working directory.
df = pd.read_csv('titanic.csv')

## 1. How big is your data ?

In [3]:
# Shape of the frame as a (rows, columns) tuple.
df.shape

(891, 12)

## 2. What does the data look like ?

In [4]:
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# df.head() shows only the first 5 rows, and rows in the middle or at the end of the data can look
# different from the top ones, so df.sample() often gives a more representative preview than df.head().
# It shows 5 random rows. random_state pins the draw so that Restart & Run All reproduces the same rows;
# drop it to get a fresh sample on every run.
df.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
479,480,1,3,"Hirvonen, Miss. Hildur E",female,2.0,0,1,3101298,12.2875,,S
458,459,1,2,"Toomey, Miss. Ellen",female,50.0,0,0,F.C.C. 13531,10.5,,S
605,606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36.0,1,0,349910,15.55,,S
148,149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,S
431,432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S


## 3. What is the data type of columns ?


In [7]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
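# df.info() also shows how much memory your dataset occupies. You can also see that Age is stored as float64 rather than as an integer.
# That is because the column has missing values (only 714 of 891 are non-null), which a plain int64 column cannot hold, and because some ages
# are fractional (the minimum age is 0.42), so it cannot be converted to an integer type without first handling both.
# Choosing smaller dtypes is not a big deal for a small dataset, but it matters for a big dataset.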

## 4. Does your data contain missing values ?

In [9]:
# df.info() already hints at missing values through its non-null counts;
# for the exact number of missing values in each column, use:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
# this gives number of missing values in each column.

## 5. How does your data look mathematically ?

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
# The describe method gives a high-level mathematical summary of the numerical features in your data.
# By default it gives insight into the numerical columns only. Many anomalies within numerical columns (for example, implausible minimum or maximum values) can be spotted using this method.

## 6. Are there any duplicated values in your data ?

In [14]:
# duplicated() flags every row that repeats an earlier row; summing the flags counts them.
df.duplicated().sum()

0

In [15]:
# gives the number of duplicate rows in your dataset.

## 7. How are the columns correlated ?

In [19]:
# Correlation describes how strongly two variables are related to each other.
# Series.corr computes the Pearson coefficient by default: a value between -1 and 1 that measures
# linear association. Rows where either value is missing are left out of the calculation.
df['Age'].corr(df['Survived'])

-0.07722109457217764

In [20]:
# Not all the columns in the data are useful for building an ML model or for prediction. Correlation helps identify input columns that
# have little linear relationship with the output column (a low correlation does not rule out a non-linear effect).

In [23]:
# Correlation tells us two kinds of things.
# The first is how the output column depends on the input columns.
# The second is how the input features are correlated with each other. When two input features are highly correlated,
# we generally remove one of them to avoid multicollinearity.
# It also gives an idea of which features can play important roles.