In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Series - one-dimensional array-like object that can hold any data type
population = pd.Series({
    'Germany' : 81.3,
    'Belgium' : 11.3,
    'France' : 64.3,
    'United Kingdom': 64.9,
    'Netherlands' : 16.9
})

In [3]:
population.head()

Germany           81.3
Belgium           11.3
France            64.3
United Kingdom    64.9
Netherlands       16.9
dtype: float64

In [4]:
population.shape

(5,)

In [5]:
# Data Frame - It is a two-dimensional tabular data structure that consists of rows and columns.
data = {
    'country' : ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
    'population' : [11.3, 64.3, 81.3, 16.9, 64.9],
    'area' : [30510, 671308, 357050, 41526, 244820],
    'capital' : ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London'],
}

In [6]:
df = pd.DataFrame(data)

In [7]:
df.head()

Unnamed: 0,country,population,area,capital
0,Belgium,11.3,30510,Brussels
1,France,64.3,671308,Paris
2,Germany,81.3,357050,Berlin
3,Netherlands,16.9,41526,Amsterdam
4,United Kingdom,64.9,244820,London


In [8]:
df.shape

(5, 4)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     5 non-null      object 
 1   population  5 non-null      float64
 2   area        5 non-null      int64  
 3   capital     5 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 288.0+ bytes


In [10]:
df.describe()

Unnamed: 0,population,area
count,5.0,5.0
mean,47.74,269042.8
std,31.519645,264012.827994
min,11.3,30510.0
25%,16.9,41526.0
50%,64.3,244820.0
75%,64.9,357050.0
max,81.3,671308.0


In [11]:
# Setting the index to the country names
df = df.set_index('country')

In [12]:
df.head()

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,11.3,30510,Brussels
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
Netherlands,16.9,41526,Amsterdam
United Kingdom,64.9,244820,London


One of pandas' basic features is the labeling of rows and columns, but this makes indexing also a bit more complex compared to numpy. We now have to distuinguish between:

* selection by label <br>
* selection by position. <br>
data[] provides some convenience shortcuts <br>
For a DataFrame, basic indexing selects the columns. <br>

Selecting a single column:

In [13]:
df['area']

country
Belgium            30510
France            671308
Germany           357050
Netherlands        41526
United Kingdom    244820
Name: area, dtype: int64

In [14]:
df[['area', 'capital']]

Unnamed: 0_level_0,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,30510,Brussels
France,671308,Paris
Germany,357050,Berlin
Netherlands,41526,Amsterdam
United Kingdom,244820,London


In [15]:
# But, slicing accesses the rows:
df['Belgium':'France']

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,11.3,30510,Brussels
France,64.3,671308,Paris


So as a summary, [] provides the following convenience shortcuts: <br>

* Series: selecting a label: s[label] <br>
* DataFrame: selecting a single or multiple columns: df['col'] or df[['col1', 'col2']] <br>
* DataFrame: slicing the rows: df['row_label1':'row_label2'] or df[mask]

### Systematic indexing with loc and iloc

When using [] like above, you can only select from one axis at once (rows or columns, not both). For more advanced indexing, you have some extra attributes:
<br>
* loc: selection by label <br>
* iloc: selection by position <br>
These methods index the different dimensions of the frame: <br>

* df.loc[row_indexer, column_indexer] <br>
* df.iloc[row_indexer, column_indexer] <br>
Selecting a single element:

In [16]:
df.loc['Germany', 'area']

357050

In [19]:
# But the row or column indexer can also be a list, slice, boolean array, ..
df.loc['France':'Netherlands', ['area']]

Unnamed: 0_level_0,area
country,Unnamed: 1_level_1
France,671308
Germany,357050
Netherlands,41526


In [20]:
df.loc['France':'Nerherlands', ['area', 'capital']]

Unnamed: 0_level_0,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1
France,671308,Paris
Germany,357050,Berlin


In [22]:
# Selecting by position with iloc works similar as indexing numpy arrays:
df.iloc[0:2, 1] # 0:2 -> Row Index, 1 -> column index

country
Belgium     30510
France     671308
Name: area, dtype: int64

In [23]:
df.iloc[0:2, 1:2]

Unnamed: 0_level_0,area
country,Unnamed: 1_level_1
Belgium,30510
France,671308


In [24]:
df2 = df.copy() # For Copying one data frame to other

In [25]:
df2.head()

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,11.3,30510,Brussels
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
Netherlands,16.9,41526,Amsterdam
United Kingdom,64.9,244820,London


In [26]:
# The different indexing methods can also be used to assign data:
df2.loc['Belgium':'Germany', 'population'] = 10

In [27]:
df2.head() # Population values from Belgium to Germany are updated to 10

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,10.0,30510,Brussels
France,10.0,671308,Paris
Germany,10.0,357050,Berlin
Netherlands,16.9,41526,Amsterdam
United Kingdom,64.9,244820,London


In [29]:
df2.loc['Belgium', 'area'] == 30510

True

### Boolean indexing (filtering)

Often, you want to select rows based on a certain condition. This can be done with 'boolean indexing' (like a where clause in SQL). <br>

The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed.

In [30]:
df['area'] > 100000

country
Belgium           False
France             True
Germany            True
Netherlands       False
United Kingdom     True
Name: area, dtype: bool

In [31]:
df[df['area'] > 100000] # Here it is not assigning to DF yet, just giving the result

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
United Kingdom,64.9,244820,London


In [32]:
df.head()

Unnamed: 0_level_0,population,area,capital
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,11.3,30510,Brussels
France,64.3,671308,Paris
Germany,81.3,357050,Berlin
Netherlands,16.9,41526,Amsterdam
United Kingdom,64.9,244820,London


### EXERCISE: Add a column `density` with the population density (note: population column is expressed in millions)

In [33]:
df['density'] = df['population']*1000000 / df['area']

In [34]:
df.head()

Unnamed: 0_level_0,population,area,capital,density
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Belgium,11.3,30510,Brussels,370.37037
France,64.3,671308,Paris,95.783158
Germany,81.3,357050,Berlin,227.699202
Netherlands,16.9,41526,Amsterdam,406.973944
United Kingdom,64.9,244820,London,265.092721


### EXERCISE: Select the capital and the population column of those countries where the density is larger than 300

In [35]:
df[df['density'] > 300]

Unnamed: 0_level_0,population,area,capital,density
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Belgium,11.3,30510,Brussels,370.37037
Netherlands,16.9,41526,Amsterdam,406.973944


In [37]:
df.loc[df['density'] > 300, ['capital', 'population']] # First condition - row index or row columns condition

Unnamed: 0_level_0,capital,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Belgium,Brussels,11.3
Netherlands,Amsterdam,16.9


### EXERCISE: Add a column 'density_ratio' with the ratio of the density to the mean density

In [38]:
df['density_ratio'] = df['density'] / df['density'].mean()

In [39]:
df.head()

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Belgium,11.3,30510,Brussels,370.37037,1.355755
France,64.3,671308,Paris,95.783158,0.350618
Germany,81.3,357050,Berlin,227.699202,0.833502
Netherlands,16.9,41526,Amsterdam,406.973944,1.489744
United Kingdom,64.9,244820,London,265.092721,0.970382


In [40]:
df['population'].mean()

47.739999999999995

### EXERCISE: Change the capital of the UK to Cambridge

In [43]:
df.loc['United Kingdom', 'capital'] = 'Cambridge'

In [44]:
df.head()

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Belgium,11.3,30510,Brussels,370.37037,1.355755
France,64.3,671308,Paris,95.783158,0.350618
Germany,81.3,357050,Berlin,227.699202,0.833502
Netherlands,16.9,41526,Amsterdam,406.973944,1.489744
United Kingdom,64.9,244820,Cambridge,265.092721,0.970382


### EXERCISE: Select all countries whose population density is between 100 and 300 people/km²

In [47]:
df[(df['density'] > 100) & (df['density'] < 300)]

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Germany,81.3,357050,Berlin,227.699202,0.833502
United Kingdom,64.9,244820,Cambridge,265.092721,0.970382


In [46]:
df[(df['density'] > 100) & (df['density'] < 300)]


Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Germany,81.3,357050,Berlin,227.699202,0.833502
United Kingdom,64.9,244820,Cambridge,265.092721,0.970382


### Some other useful methods: isin and string methods


In [48]:
# The isin method of Series is very useful to select rows that may contain certain values:
s = df['capital']

In [49]:
s

country
Belgium            Brussels
France                Paris
Germany              Berlin
Netherlands       Amsterdam
United Kingdom    Cambridge
Name: capital, dtype: object

In [50]:
s.isin(['Berlin', 'London'])

country
Belgium           False
France            False
Germany            True
Netherlands       False
United Kingdom    False
Name: capital, dtype: bool

In [51]:
# This can then be used to filter the dataframe with boolean indexing:
df[df['capital'].isin(['Berlin', 'London'])]

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Germany,81.3,357050,Berlin,227.699202,0.833502


In [52]:
# Let's say we want to select all data for which the capital starts with a 'B'. In Python, when having a string, we could use the startswith method:
'Berlin'.startswith('B')

True

In [53]:
# In pandas, these are available on a Series through the str namespace:
df['capital'].str.startswith('B')

country
Belgium            True
France            False
Germany            True
Netherlands       False
United Kingdom    False
Name: capital, dtype: bool

In [54]:
df[df['capital'].str.startswith('B')]

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Belgium,11.3,30510,Brussels,370.37037,1.355755
Germany,81.3,357050,Berlin,227.699202,0.833502


### EXERCISE: Select all countries that have capital names with more than 7 characters

In [56]:
len(df['capital'])

5

In [57]:
df['capital'].str.len()

country
Belgium           8
France            5
Germany           6
Netherlands       9
United Kingdom    9
Name: capital, dtype: int64

In [58]:
df[df['capital'].str.len() > 7]

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Belgium,11.3,30510,Brussels,370.37037,1.355755
Netherlands,16.9,41526,Amsterdam,406.973944,1.489744
United Kingdom,64.9,244820,Cambridge,265.092721,0.970382


### EXERCISE: Select all countries that have capital names that contain the character sequence 'am'

In [59]:
df[df['capital'].str.contains('am')]

Unnamed: 0_level_0,population,area,capital,density,density_ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Netherlands,16.9,41526,Amsterdam,406.973944,1.489744
United Kingdom,64.9,244820,Cambridge,265.092721,0.970382
