## Dataframe Manipulation in Pandas

In [118]:
import numpy as np 
import pandas as pd 

# Read the Titanic passenger table from the working directory, then preview
# its first two rows as the cell output.
csv_path = 'titanic.csv'
df = pd.read_csv(csv_path)
df.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Top five rows of the dataframe for 'Survived', 'Sex', 'Age' columns

In [5]:
# Keep every row but only the three columns of interest; head() defaults to 5 rows.
df.loc[:, ['Survived', 'Sex', 'Age']].head()

Unnamed: 0,Survived,Sex,Age
0,0,male,22.0
1,1,female,38.0
2,1,female,26.0
3,1,female,35.0
4,0,male,35.0


## Top five rows of the dataframe sliced to female passengers only

In [7]:
# Boolean mask: True on the rows whose 'Sex' value is 'female'.
female_mask = df['Sex'] == 'female'
df.loc[female_mask].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## How many null values do we have in the 'Age' column?

In [8]:
# isna() marks missing entries as True; summing the booleans counts them.
age_missing = df['Age'].isna()
age_missing.sum()

177

## Slice the dataframe to the rows where 'Age' is not null

In [13]:
# Drop only the rows whose 'Age' is missing, then show the first five survivors of the filter.
df.dropna(subset=['Age']).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## What are the unique values in the 'Embarked' column?

In [14]:
# Distinct values in order of first appearance; a missing value shows up as nan.
pd.unique(df['Embarked'])

array(['S', 'C', 'Q', nan], dtype=object)

## How many times does each unique 'Embarked' value ('S', 'C', 'Q') occur in the dataframe?

In [119]:
# Frequency of each value, most common first. dropna=True is the default and is
# spelled out here because it is why nan does not appear in the result.
df['Embarked'].value_counts(dropna=True)

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [120]:
# For each 'Embarked' value, count the non-null entries of every other column.
by_embarked = df.groupby('Embarked')
by_embarked.count()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C,168,168,168,168,168,130,168,168,168,168,69
Q,77,77,77,77,77,28,77,77,77,77,4
S,644,644,644,644,644,554,644,644,644,644,129


In [55]:
# Iterating a groupby yields (group key, values of that group) pairs.
# Print the key, the first ten 'Sex' values of the group and the group size.
for embarked, sexes in df.groupby('Embarked')['Sex']:
    preview = sexes.iloc[:10].tolist()
    print((embarked, preview, '...', len(sexes)))

('C', ['female', 'female', 'female', 'male', 'male', 'female', 'male', 'male', 'female', 'male'], '...', 168)
('Q', ['male', 'male', 'female', 'female', 'female', 'female', 'male', 'female', 'female', 'female'], '...', 77)
('S', ['male', 'female', 'female', 'male', 'male', 'male', 'female', 'female', 'female', 'male'], '...', 644)


## Get the first 10 values of 'Embarked'

In [47]:
# Take the first ten rows positionally, then expose them as a NumPy array.
df['Embarked'].iloc[:10].values

array(['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'C'], dtype=object)

## Get the first 5 values of 'Embarked' and 'Sex'

In [49]:
# First five rows of the two-column subset, as a 2-D NumPy array (one row per passenger).
df[['Embarked', 'Sex']].iloc[:5].values

array([['S', 'male'],
       ['C', 'female'],
       ['S', 'female'],
       ['S', 'female'],
       ['S', 'male']], dtype=object)

## Obtain the min, mean and max values of 'Age'

In [50]:
# Smallest recorded age; missing values are skipped by default.
df.loc[:, 'Age'].min()

0.42

In [51]:
# Average recorded age; missing values are skipped by default.
df.loc[:, 'Age'].mean()

29.69911764705882

In [52]:
# Largest recorded age; missing values are skipped by default.
df.loc[:, 'Age'].max()

80.0

## Obtain the min, mean and max values of 'Age' for only male passengers

In [56]:
# Select rows (Sex == 'male') and the 'Age' column in a single .loc lookup, then take the minimum.
df.loc[df['Sex'] == 'male', 'Age'].min()

0.42

In [57]:
# Select rows (Sex == 'male') and the 'Age' column in a single .loc lookup, then take the maximum.
df.loc[df['Sex'] == 'male', 'Age'].max()

80.0

In [58]:
# Select rows (Sex == 'male') and the 'Age' column in a single .loc lookup, then take the mean.
df.loc[df['Sex'] == 'male', 'Age'].mean()

30.72664459161148

In [62]:
# Simple example to see how groupby works: restrict to Embarked == 'C', group by
# 'Sex', and print each group's key, its 'Sex' values and how many there are.
embarked_c = df[df['Embarked'] == 'C']
for sex, members in embarked_c.groupby('Sex')['Sex']:
    print((sex, members.tolist(), len(members)))

('female', ['female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female'], 73)
('male', ['male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'ma

## What are the 'Fare' values, grouped by 'Sex', for passengers whose 'Embarked' value is 'C'?

In [63]:
# For the Embarked == 'C' rows, print each 'Sex' group's fares and the group size.
embarked_c = df[df['Embarked'] == 'C']
for sex, fares in embarked_c.groupby('Sex')['Fare']:
    print((sex, fares.tolist(), len(fares)))

('female', [71.2833, 30.0708, 7.225, 146.5208, 11.2417, 41.5792, 76.7292, 14.4542, 14.4583, 22.3583, 15.2458, 28.7125, 27.7208, 146.5208, 113.275, 76.2917, 14.4542, 15.2458, 79.2, 512.3292, 91.0792, 247.5208, 110.8833, 108.9, 56.9292, 83.1583, 262.375, 134.5, 135.6333, 57.9792, 134.5, 14.4542, 75.25, 7.2292, 69.3, 82.1708, 227.525, 15.7417, 12.0, 113.275, 19.2583, 19.2583, 13.7917, 78.2667, 59.4, 57.9792, 22.3583, 106.425, 49.5, 39.6, 14.4583, 110.8833, 78.2667, 41.5792, 69.3, 19.2583, 13.4167, 227.525, 14.4542, 49.5042, 227.525, 262.375, 7.2292, 14.4542, 83.1583, 31.0, 89.1042, 15.2458, 19.2583, 13.8583, 24.0, 7.225, 83.1583], 73)
('male', [7.225, 27.7208, 82.1708, 7.2292, 7.8958, 21.6792, 61.9792, 7.2292, 7.2292, 27.7208, 15.2458, 14.4542, 34.6542, 63.3583, 247.5208, 30.0708, 11.2417, 7.8958, 15.0458, 79.2, 61.3792, 30.6958, 15.05, 7.225, 18.7875, 31.0, 7.225, 29.7, 8.6625, 12.875, 27.7208, 7.2292, 24.0, 7.2292, 7.225, 27.7208, 55.4417, 135.6333, 211.5, 4.0125, 7.8958, 27.75, 89.1042

## What are the 'Age' values, grouped by 'Sex', for passengers whose 'Embarked' value is 'C'?

In [69]:
# NOTE(review): the commented-out line below looks like a Spark-style equivalent
# kept for reference; it is not executed here.
#df.filter(df['Embarked'] == 'C').select(['Sex', 'Age']).groupby('Sex').agg(F.collect_list('Age')).show()

# For the Embarked == 'C' rows, print each 'Sex' group's ages (nan included) and the group size.
embarked_c = df[df['Embarked'] == 'C']
for sex, ages in embarked_c.groupby('Sex')['Age']:
    print((sex, ages.tolist(), len(ages)))

('female', [38.0, 14.0, nan, nan, 14.0, 3.0, 49.0, 14.5, 17.0, nan, nan, 50.0, 44.0, 58.0, 31.0, 32.0, nan, 29.0, nan, 35.0, 19.0, 50.0, nan, 17.0, 30.0, 24.0, 18.0, 40.0, 36.0, 16.0, 41.0, 45.0, 60.0, nan, 24.0, nan, 42.0, 1.0, 17.0, 23.0, 5.0, 0.75, 23.0, 54.0, 54.0, 44.0, nan, 30.0, 22.0, 48.0, nan, 39.0, 52.0, 22.0, 24.0, 0.75, 4.0, 18.0, 18.0, 24.0, 38.0, 21.0, 13.0, 15.0, 39.0, 30.0, nan, 9.0, 24.0, 27.0, 28.0, 15.0, 56.0], 73)
('male', [nan, 40.0, 28.0, nan, nan, nan, 65.0, 28.5, 22.0, nan, nan, 26.0, 71.0, 23.0, 24.0, 32.5, 12.0, 33.0, 23.0, 24.0, 51.0, 56.0, nan, 45.5, 26.0, 40.0, 30.0, 37.0, 33.0, 36.0, nan, 23.5, 30.0, 15.0, nan, 29.0, 25.0, 22.0, 27.0, 20.0, nan, 30.0, 49.0, 29.0, 25.0, 58.0, 71.0, nan, 18.0, nan, nan, nan, 17.0, 50.0, nan, 17.0, 22.0, nan, nan, 36.0, nan, 60.0, nan, 49.0, 35.0, 27.0, 20.0, 32.0, 48.0, 56.0, 58.0, 40.0, 36.0, 27.0, 25.0, 25.0, 49.0, nan, 11.0, 35.0, 20.0, nan, nan, 46.0, nan, 30.0, 0.42, 31.0, 1.0, nan, nan, 34.5, 35.0, nan, 26.0], 95)


## What are the 'Age' values, grouped by 'Sex', for passengers whose 'Embarked' value is 'C'? (NaN ages removed)

In [102]:
from collections import defaultdict

# Maps each 'Sex' value to the list of non-null ages of the passengers whose
# 'Embarked' value is 'C'.
d = defaultdict(list)

for sex, ages in df[df['Embarked'] == 'C'].groupby('Sex')['Age']:
    # dropna() removes the NaN entries in one vectorised step, replacing the
    # per-element `np.isnan(age) == False` test.
    d[sex].extend(ages.dropna().tolist())

print(d['female'])

[38.0, 14.0, 14.0, 3.0, 49.0, 14.5, 17.0, 50.0, 44.0, 58.0, 31.0, 32.0, 29.0, 35.0, 19.0, 50.0, 17.0, 30.0, 24.0, 18.0, 40.0, 36.0, 16.0, 41.0, 45.0, 60.0, 24.0, 42.0, 1.0, 17.0, 23.0, 5.0, 0.75, 23.0, 54.0, 54.0, 44.0, 30.0, 22.0, 48.0, 39.0, 52.0, 22.0, 24.0, 0.75, 4.0, 18.0, 18.0, 24.0, 38.0, 21.0, 13.0, 15.0, 39.0, 30.0, 9.0, 24.0, 27.0, 28.0, 15.0, 56.0]


In [121]:
# Number of ages stored under the 'female' key of d.
female_ages = d['female']
print(len(female_ages))

61


In [122]:
# Print the whole defaultdict: every key together with its list of collected ages.
print(d)

defaultdict(<class 'list'>, {'female': [38.0, 14.0, 14.0, 3.0, 49.0, 14.5, 17.0, 50.0, 44.0, 58.0, 31.0, 32.0, 29.0, 35.0, 19.0, 50.0, 17.0, 30.0, 24.0, 18.0, 40.0, 36.0, 16.0, 41.0, 45.0, 60.0, 24.0, 42.0, 1.0, 17.0, 23.0, 5.0, 0.75, 23.0, 54.0, 54.0, 44.0, 30.0, 22.0, 48.0, 39.0, 52.0, 22.0, 24.0, 0.75, 4.0, 18.0, 18.0, 24.0, 38.0, 21.0, 13.0, 15.0, 39.0, 30.0, 9.0, 24.0, 27.0, 28.0, 15.0, 56.0], 'male': [40.0, 28.0, 65.0, 28.5, 22.0, 26.0, 71.0, 23.0, 24.0, 32.5, 12.0, 33.0, 23.0, 24.0, 51.0, 56.0, 45.5, 26.0, 40.0, 30.0, 37.0, 33.0, 36.0, 23.5, 30.0, 15.0, 29.0, 25.0, 22.0, 27.0, 20.0, 30.0, 49.0, 29.0, 25.0, 58.0, 71.0, 18.0, 17.0, 50.0, 17.0, 22.0, 36.0, 60.0, 49.0, 35.0, 27.0, 20.0, 32.0, 48.0, 56.0, 58.0, 40.0, 36.0, 27.0, 25.0, 25.0, 49.0, 11.0, 35.0, 20.0, 46.0, 30.0, 0.42, 31.0, 1.0, 34.5, 35.0, 26.0]})


## Or, doing the same directly with `groupby(...).agg(...)`

In [114]:
# For the Embarked == 'C' rows, collapse each 'Sex' group's ages into one list
# with the NaN entries removed. dropna() replaces the hand-written
# `np.isnan(i) == False` filter.
df[df['Embarked'] == 'C'].groupby('Sex')['Age'].agg(lambda ages: ages.dropna().tolist())

Sex
female    [38.0, 14.0, 14.0, 3.0, 49.0, 14.5, 17.0, 50.0...
male      [40.0, 28.0, 65.0, 28.5, 22.0, 26.0, 71.0, 23....
Name: Age, dtype: object

In [115]:
# For the Embarked == 'C' rows, count the non-null ages in each 'Sex' group.
# dropna() replaces the throwaway list built with `np.isnan(i) == False`.
# NOTE(review): groupby('Sex')['Age'].count() gives the same non-null tally as
# integers; the lambda form is kept so the displayed result stays as-is.
df[df['Embarked'] == 'C'].groupby('Sex')['Age'].agg(lambda ages: len(ages.dropna()))

Sex
female    61.0
male      69.0
Name: Age, dtype: float64