### Titanic Dataset Analysis
This notebook demonstrates how to summarize and analyze the Titanic dataset using Pandas.

### Importing Libraries and Dataset
We start by importing pandas and loading the Titanic dataset from a public CSV file.

In [None]:
#@title Load the dataframe
# Data source: https://github.com/datasciencedojo/datasets/blob/master/titanic.csv
import pandas as pd

# Alternative: read a local copy of the csv (e.g. one uploaded to Colab)
#df = pd.read_csv('titanic.csv')

# Fetch the CSV straight from the raw GitHub URL.
titanic_csv_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv'
df = pd.read_csv(titanic_csv_url)

# Rich (HTML) preview of the loaded frame.
display(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# 1.Columns: Categorical Data: Aggregation

In [None]:
#@title categorical column: get the unique values
# General pattern: df['column_name'].unique()
# Missing entries are reported as nan alongside the real categories.
unique_values = df.loc[:, 'Embarked'].unique()
print(unique_values)

['S' 'C' 'Q' nan]


In [None]:
#@title categorical column: get the count of unique values
# General pattern: df['column_name'].nunique()
# dropna=True is the default: missing values are not counted as a category.
df['Embarked'].nunique(dropna=True)

3

In [None]:
#@title categorical column: value_counts
# Frequency of each category, most common first.
df.loc[:, 'Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,644
C,168
Q,77


In [None]:
#@title categorical column: Percentage Distribution
# normalize=True yields each category's share (0-1); scale it to a percentage.
df['Embarked'].value_counts(normalize=True).mul(100)

Unnamed: 0_level_0,proportion
Embarked,Unnamed: 1_level_1
S,72.440945
C,18.897638
Q,8.661417


In [None]:
#@title categorical column: Find the mode
# mode() returns a Series (there can be ties); the entry labelled 0 is the
# most frequent category.
df.loc[:, 'Embarked'].mode()[0]


Unnamed: 0,Embarked
0,S


In [None]:
#@title categorical column: Find the count of missing values
# isna() marks each missing cell as True; summing the flags counts them.
missing_age, missing_embarked, missing_cabin = (
    df[column].isna().sum() for column in ('Age', 'Embarked', 'Cabin')
)

print(f"missing_age = {missing_age}, missing_embarked = {missing_embarked}, missing_cabin = {missing_cabin}")

missing_age = 177, missing_embarked = 2, missing_cabin = 687


# 2.Columns: Quantitative Data: Aggregation

In [None]:
#@title Basic descriptive statistics
# Central tendency of the ticket price.
mean_f, median_f = df['Fare'].mean(), df['Fare'].median()
mode_f = df['Fare'].mode()[0]  # mode() returns a Series; take the entry labelled 0
# Two ways of counting: size covers every row, count() skips missing values.
count_f1 = df['Fare'].size
count_f2 = df['Fare'].count()
print(f"Mean: {mean_f}, Median: {median_f}, Mode: {mode_f}, Count1: {count_f1}, Count2: {count_f2}")


Mean: 32.204207968574636, Median: 14.4542, Mode: 8.05, Count1: 891, Count2: 891


2.308641975308642

In [None]:
#@title Measures of Dispersion
max_fare = df['Fare'].max()
min_fare = df['Fare'].min()
std_fare = df['Fare'].std()  # sample standard deviation (pandas default ddof=1)
var_fare = df['Fare'].var()  # sample variance (pandas default ddof=1)
range_fare = max_fare - min_fare  # reuse the extremes computed above

print(max_fare, min_fare, std_fare, var_fare, range_fare)

# Why dispersion matters: an average alone can hide dangerous extremes.
# You want to wade across a river whose average depth is 2 feet.
# Is it safe to cross on foot? Both rivers below average exactly 2 feet:
#   6, 1, 1, 1, 1  (river A: one 6-foot hole  -> large spread)
#   2, 2, 2, 1, 3  (river B: never over 3 feet -> small spread)

512.3292 0.0 49.693428597180905 2469.436845743117 512.3292


In [None]:
#@title Percentiles /  Interquartile range (IQR)
# 25th, 50th and 75th percentiles, computed once and reused for the IQR.
fare_quartiles = df['Fare'].quantile([0.25, 0.5, 0.75])

# IQR = Q3 - Q1, i.e. the width of the middle 50% of the fares.
# Print it so the value promised by the cell title is actually shown.
iqr = fare_quartiles.loc[0.75] - fare_quartiles.loc[0.25]
print(f"IQR: {iqr}")

fare_quartiles

Unnamed: 0,Fare
0.25,7.9104
0.5,14.4542
0.75,31.0


In [None]:
#@title All Summaries in one go
# count, mean, std, min, quartiles and max in a single call
# (the percentiles listed are describe()'s defaults, spelled out).
df['Fare'].describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Fare
count,891.0
mean,32.204208
std,49.693429
min,0.0
25%,7.9104
50%,14.4542
75%,31.0
max,512.3292


In [None]:
#@title Count of non-null values
# Flag every present (non-missing) fare and add up the flags.
df['Fare'].notna().sum()

891

In [None]:
#@title Skewness (asymmetry of data distribution)
# Skewness quantifies how lopsided a distribution is around its centre.
#
#   > 0 (right-skewed): the right tail is longer or fatter; a few large values
#                       pull the mean to the right.
#   < 0 (left-skewed):  the left tail is longer or fatter; a few small values
#                       pull the mean to the left.
#   = 0:                the distribution is perfectly symmetric.

df.loc[:, 'Fare'].skew()

4.787316519674893

In [None]:
#@title Kurtosis (distribution of the tail)
# Kurtosis describes how heavy or light the tails of a distribution are
# compared to a normal distribution.
#
# NOTE: pandas' kurt() uses Fisher's definition (*excess* kurtosis), so a
# normal distribution scores 0 here -- not 3 as under Pearson's definition.
#
# Positive (leptokurtic): heavy tails, more outliers, sharper peak.
# Negative (platykurtic): light tails, fewer outliers, flatter peak.
# Around 0 (mesokurtic): tails comparable to a normal distribution.
df['Fare'].kurt()

33.39814088089868

In [None]:
#@title Count of missing values
# isna() flags missing fares; the sum of the flags is their count.
df['Fare'].isna().sum()

0

In [None]:
#@title Count of zero values
# Element-wise comparison with 0, then count the True flags.
df['Fare'].eq(0).sum()

15

In [None]:
#@title Count of non-zero values
# Element-wise "not equal to 0", then count the True flags.
df['Fare'].ne(0).sum()

876

In [None]:
#@title Sum of a column
# Total of all fares.
df.loc[:, 'Fare'].sum()


28693.9493

In [None]:
#@title Cumulative Sum
# Running total of the fares, row by row.
df.loc[:, 'Fare'].cumsum()

Unnamed: 0,Fare
0,7.2500
1,78.5333
2,86.4583
3,139.5583
4,147.6083
...,...
886,28602.7493
887,28632.7493
888,28656.1993
889,28686.1993


In [None]:
#@title Product of a column
# Multiplies all fares together; a single 0 fare makes the whole product 0.
df['Fare'].product()

0.0

In [None]:
#@title Cumulative Product of a column
# Running product of the fares, row by row.
df.loc[:, 'Fare'].cumprod()

Unnamed: 0,Fare
0,7.250000e+00
1,5.168039e+02
2,4.095671e+03
3,2.174801e+05
4,1.750715e+06
...,...
886,0.000000e+00
887,0.000000e+00
888,0.000000e+00
889,0.000000e+00


In [None]:
#@title Rank a column
# To keep the ranks as a new column instead:
#df['Fare_rank'] = df['Fare'].rank()
#display(df)
# Ascending ranking: the smaller the fare, the lower its rank.
# Ties share the average of their ranks (the defaults, spelled out).
df['Fare'].rank(method='average', ascending=True)

Unnamed: 0,Fare
0,77.0
1,789.0
2,232.5
3,748.0
4,264.0
...,...
886,407.5
887,654.5
888,546.5
889,654.5


In [None]:
#@title Custom aggregations
# Several named reductions at once; the result is indexed by function name.
df['Fare'].aggregate(['mean', 'median', 'max', 'min'])

Unnamed: 0,Fare
mean,32.204208
median,14.4542
max,512.3292
min,0.0


In [None]:
#@title Comprehensive aggregations
def _fare_range(x):
    """Spread between the largest and the smallest value."""
    return x.max() - x.min()


def _fare_iqr(x):
    """Interquartile range: 75th percentile minus 25th percentile."""
    return x.quantile(0.75) - x.quantile(0.25)


def _fare_outlier_count(x):
    """Number of values lying outside the 1.5 * IQR fences."""
    q1, q3 = x.quantile(0.25), x.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    return ((x < q1 - fence) | (x > q3 + fence)).sum()


# Each dict key becomes the label of the corresponding result row.
df['Fare'].agg({
    'Mean': 'mean',
    'Median': 'median',
    'Max': 'max',
    'Min': 'min',
    'Range': _fare_range,
    'IQR': _fare_iqr,
    'Skewness': 'skew',
    'Outliers': _fare_outlier_count,
})

Unnamed: 0,Fare
Mean,32.204208
Median,14.4542
Max,512.3292
Min,0.0
Range,512.3292
IQR,23.0896
Skewness,4.787317
Outliers,116.0


# 3.Row-wise Aggregations (axis = 1)

In [None]:
#@title mean of rows
# A plain df.mean(axis=1) raises a TypeError on the Titanic frame because it
# mixes numeric and text columns (Name, Sex, Ticket, ...).
# numeric_only=True restricts the row-wise mean to the numeric columns.
# Caveat: averaging unrelated columns (PassengerId, Age, Fare, ...) has no real
# meaning for this dataset -- the cell only demonstrates the axis=1 mechanics.
df.mean(axis=1, numeric_only=True)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

# 4.Summarization at Dataframe Level

In [None]:
#@title Number of rows and columns
# (row count, column count) -- the same pair that df.shape reports.
(len(df.index), len(df.columns))

(891, 12)

In [None]:
#@title Number of cells
# Total cells = rows x columns (what df.size reports).
len(df.index) * len(df.columns)

10692

In [None]:
#@title Missing Value Counts
# Per column: flag missing cells, then add up the flags.
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:
#@title Missing Value Percentages
# The mean of the True/False missing flags is the missing fraction per column;
# scale it to a percentage.
df.isna().mean().mul(100)

Unnamed: 0,0
PassengerId,0.0
Survived,0.0
Pclass,0.0
Name,0.0
Sex,0.0
Age,19.86532
SibSp,0.0
Parch,0.0
Ticket,0.0
Fare,0.0


In [None]:
#@title Count of Non-Missing Values per Column
# Per column: flag the cells that hold a value, then add up the flags.
df.notna().sum()

Unnamed: 0,0
PassengerId,891
Survived,891
Pclass,891
Name,891
Sex,891
Age,714
SibSp,891
Parch,891
Ticket,891
Fare,891


In [None]:
#@title Mean of numerical columns
# Column-wise (axis=0) average; text columns are skipped.
df.mean(axis=0, numeric_only=True)

Unnamed: 0,0
PassengerId,446.0
Survived,0.383838
Pclass,2.308642
Age,29.699118
SibSp,0.523008
Parch,0.381594
Fare,32.204208


In [None]:
#@title Median of numerical columns
# Column-wise (axis=0) median; text columns are skipped.
df.median(axis=0, numeric_only=True)

Unnamed: 0,0
PassengerId,446.0
Survived,0.0
Pclass,3.0
Age,28.0
SibSp,0.0
Parch,0.0
Fare,14.4542


In [None]:
#@title Min of numerical columns
# Column-wise (axis=0) minimum; text columns are skipped.
df.min(axis=0, numeric_only=True)


Unnamed: 0,0
PassengerId,1.0
Survived,0.0
Pclass,1.0
Age,0.42
SibSp,0.0
Parch,0.0
Fare,0.0


In [None]:
#@title Max of numerical columns
# Column-wise (axis=0) maximum; text columns are skipped.
df.max(axis=0, numeric_only=True)

Unnamed: 0,0
PassengerId,891.0
Survived,1.0
Pclass,3.0
Age,80.0
SibSp,8.0
Parch,6.0
Fare,512.3292


In [None]:
#@title Sum of numerical columns
# Column-wise (axis=0) total; text columns are skipped.
df.sum(axis=0, numeric_only=True)

Unnamed: 0,0
PassengerId,397386.0
Survived,342.0
Pclass,2057.0
Age,21205.17
SibSp,466.0
Parch,340.0
Fare,28693.9493


In [None]:
#@title Standard deviation of numerical columns
# Column-wise (axis=0) standard deviation; text columns are skipped.
df.std(axis=0, numeric_only=True)

Unnamed: 0,0
PassengerId,257.353842
Survived,0.486592
Pclass,0.836071
Age,14.526497
SibSp,1.102743
Parch,0.806057
Fare,49.693429


In [None]:
#@title Variance of Numeric Columns
# Column-wise (axis=0) variance; text columns are skipped.
df.var(axis=0, numeric_only=True)

Unnamed: 0,0
PassengerId,66231.0
Survived,0.236772
Pclass,0.699015
Age,211.019125
SibSp,1.216043
Parch,0.649728
Fare,2469.436846


In [None]:
#@title Range (Max-Min) for Each Column:
# Largest minus smallest value of every numeric column (aligned by column name).
df.max(numeric_only=True).sub(df.min(numeric_only=True))

Unnamed: 0,0
PassengerId,890.0
Survived,1.0
Pclass,2.0
Age,79.58
SibSp,8.0
Parch,6.0
Fare,512.3292


In [None]:
#@title Correlation matrix
# Pairwise Pearson correlation (the default method) between numeric columns.
df.corr(method='pearson', numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [None]:
#@title Count of Unique Values for Each Column:
# Distinct values per column; missing values are not counted (the default).
df.nunique(dropna=True)

Unnamed: 0,0
PassengerId,891
Survived,2
Pclass,3
Name,891
Sex,2
Age,88
SibSp,7
Parch,7
Ticket,681
Fare,248


In [None]:
#@title Most Frequent Value (Mode) for Each Column:
# mode() returns one row per tied mode; the first row holds one mode for
# every column.
df.mode().iloc[0, :]

Unnamed: 0,0
PassengerId,1
Survived,0.0
Pclass,3.0
Name,"Abbing, Mr. Anthony"
Sex,male
Age,24.0
SibSp,0.0
Parch,0.0
Ticket,1601
Fare,8.05


In [None]:
#@title Cumulative Sum across all columns
# cumsum() has no numeric_only argument, and per the original note a plain
# df.cumsum() does not work on this mixed-type frame.
# Select the numeric columns first, then accumulate down each of them.
df.select_dtypes(include='number').cumsum()

TypeError: cumsum() got an unexpected keyword argument 'numeric_only'

In [None]:
#@title Cumulative Product across all columns
# Per the original note, a plain df.cumprod() does not work on this mixed-type
# frame, so restrict the running product to the numeric columns.
# Cast to float64 first: running products grow very quickly, and in integer
# columns they would silently wrap around instead of becoming inf.
df.select_dtypes(include='number').astype('float64').cumprod()

In [None]:
#@title Generate summary statistics (only numerical)
# count, mean, std, min, quartiles and max for every numeric column
# (the percentiles listed are describe()'s defaults, spelled out).
df.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292
