# Visualisation 01
# Pandas & Seaborn

In [None]:
# Here we import all packages we need.

# This is a package for handling and manipulating data.
import pandas as pd

# This is the standard package for plotting.
from matplotlib import pyplot as plt 

# Seaborn is built on top of matplotlib.
# It is more comfortable to use than pure matplotlib.
import seaborn as sns

#### Loading a dataset
We will be loading the penguins dataset from the seaborn library.  

The penguins dataset contains information on 344 penguins living on the islands of the Palmer Archipelago, Antarctica. The data was collected by Dr. Kristen Gorman and the Palmer Station, Antarctica.

In [None]:
# Fetch seaborn's example dataset named "penguins" and keep it in 'penguins'.
penguins = sns.load_dataset("penguins")

We will get a **data frame object**.  
This data frame is easy to manipulate with the built-in methods of pandas, as the following cells show.

In [None]:
# Ending a cell with the variable name displays the data frame
# (long tables are abbreviated, as the '...' row in the output shows).
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


We can see that there are 344 rows and 7 columns.

In [None]:
# The '.head()' method lets us view the first 5 rows of the dataset.
penguins.head() 

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


We will use the **'.describe()'** method on our data frame (the variable `penguins`) to compute **the basic statistics over all numeric columns**.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
penguins.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


#### Dealing with missing data
One of the most important steps is to deal with missing data.  
The easiest way is to simply drop every affected row, provided there are only a few.

In [None]:
# Discard every row (axis="index") in which at least one value is 'NaN' (how="any").
penguins = penguins.dropna(axis="index", how="any")
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


Note that we have dropped 11 rows that contained NaN values.  
So now we have 333 rows and 7 columns.

#### Summary of Statistics

In [None]:
# The minimum value of every column.
# For the text columns this is the alphabetically first entry.
penguins.min()

species              Adelie
island               Biscoe
bill_length_mm         32.1
bill_depth_mm          13.1
flipper_length_mm     172.0
body_mass_g          2700.0
sex                  Female
dtype: object

In [None]:
# The maximum value of every column.
# For the text columns this is the alphabetically last entry.
penguins.max()

species                 Gentoo
island               Torgersen
bill_length_mm            59.6
bill_depth_mm             21.5
flipper_length_mm        231.0
body_mass_g             6300.0
sex                       Male
dtype: object

In [None]:
# The median of every numeric column.
# 'numeric_only=True' skips the text columns (species, island, sex); without it,
# pandas 2.0+ raises a TypeError because the median of strings is undefined.
penguins.median(numeric_only=True)

  


bill_length_mm         44.5
bill_depth_mm          17.3
flipper_length_mm     197.0
body_mass_g          4050.0
dtype: float64

In [None]:
# The 0.1 quantile, i.e. the 10th percentile, of every numeric column.
# 'numeric_only=True' restricts the computation to the numeric columns; pandas 2.0+
# no longer drops the text columns automatically and raises a TypeError instead.
penguins.quantile(0.1, numeric_only=True)

bill_length_mm         36.60
bill_depth_mm          14.32
flipper_length_mm     185.00
body_mass_g          3300.00
Name: 0.1, dtype: float64

In [None]:
# Number of entries in each column that are not "NaN"
penguins.count()

species              333
island               333
bill_length_mm       333
bill_depth_mm        333
flipper_length_mm    333
body_mass_g          333
sex                  333
dtype: int64

We can see that, since we removed the rows containing NaN, every column now has the same number of entries (333).

In [None]:
# The '.describe()' method is a convenient tool that computes multiple statistics at once.
penguins.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,333.0,333.0,333.0,333.0
mean,43.992793,17.164865,200.966967,4207.057057
std,5.468668,1.969235,14.015765,805.215802
min,32.1,13.1,172.0,2700.0
25%,39.5,15.6,190.0,3550.0
50%,44.5,17.3,197.0,4050.0
75%,48.6,18.7,213.0,4775.0
max,59.6,21.5,231.0,6300.0


In [None]:
# This gives us the number of rows: 333 rows/penguins remain out of the original 344.
# We dropped 11 rows because they contained NaN values.
len(penguins)

333

In [None]:
# This gives us the number of columns:
# 7 columns/variables have been measured for each penguin.
len(penguins.columns)

7

#### Accessing Columns

In [None]:
# List the names of all columns (returned as a pandas Index).
penguins.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

In [None]:
# Find the unique entries in a specific column
penguins['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

Accessing multiple columns

In [None]:
# Pick several columns at once by indexing the data frame with a list of column names.
columns_list = ['island', 'species', 'flipper_length_mm']
islands_and_species = penguins[columns_list]

# The same selection also fits on a single line:
# islands_and_species = penguins[['island', 'species', 'flipper_length_mm']]

# Note that the columns appear in the order given in the list, not in their original order.
islands_and_species

Unnamed: 0,island,species,flipper_length_mm
0,Torgersen,Adelie,181.0
1,Torgersen,Adelie,186.0
2,Torgersen,Adelie,195.0
4,Torgersen,Adelie,193.0
5,Torgersen,Adelie,190.0
...,...,...,...
338,Biscoe,Gentoo,214.0
340,Biscoe,Gentoo,215.0
341,Biscoe,Gentoo,222.0
342,Biscoe,Gentoo,212.0


#### Accessing rows

In [None]:
# Let us say we are interested in the penguin with the index label 2.
# '.loc' looks the row up by its index label and returns this single row as a Series.
row=penguins.loc[2]
row

species                 Adelie
island               Torgersen
bill_length_mm            40.3
bill_depth_mm             18.0
flipper_length_mm        195.0
body_mass_g             3250.0
sex                     Female
Name: 2, dtype: object

In [None]:
# We can also access a range of rows with '.loc'.
# This returns a data frame with all rows whose index labels lie between 2 and 5.
# Note: unlike standard Python slicing, the end label 5 is included.
# Index 3 is absent from the result because that row contained NaN and was dropped earlier.
row=penguins.loc[2:5]
row.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


#### Filtering rows based on condition

We want to select only those rows where 'island' has the value 'Biscoe'. We can use the '.query()' method.   
It takes a string as input.

In [None]:
# The query string is a boolean expression over column names;
# the text value to compare against is quoted inside the string.
biscoe=penguins.query('island == "Biscoe"')
biscoe

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
20,Adelie,Biscoe,37.8,18.3,174.0,3400.0,Female
21,Adelie,Biscoe,37.7,18.7,180.0,3600.0,Male
22,Adelie,Biscoe,35.9,19.2,189.0,3800.0,Female
23,Adelie,Biscoe,38.2,18.1,185.0,3950.0,Male
24,Adelie,Biscoe,38.8,17.2,180.0,3800.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [None]:
# Conditions can be combined with 'and' / 'or' and grouped with parentheses:
# keep penguins heavier than 4000 g, plus Chinstrap penguins with flippers shorter than 200 mm.
penguins.query('(body_mass_g > 4000) or ((species == "Chinstrap") and (flipper_length_mm < 200)) ')

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
14,Adelie,Torgersen,34.6,21.1,198.0,4400.0,Male
17,Adelie,Torgersen,42.5,20.7,197.0,4500.0,Male
19,Adelie,Torgersen,46.0,21.5,194.0,4200.0,Male
35,Adelie,Dream,39.2,21.1,196.0,4150.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


#### Sorting

In [None]:
# Order the rows by 'flipper_length_mm', shortest flippers first (ascending order).
penguins_sort = penguins.sort_values(by='flipper_length_mm', ascending=True)
penguins_sort.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
28,Adelie,Biscoe,37.9,18.6,172.0,3150.0,Female
20,Adelie,Biscoe,37.8,18.3,174.0,3400.0,Female
122,Adelie,Torgersen,40.2,17.0,176.0,3450.0,Female
158,Chinstrap,Dream,46.1,18.2,178.0,3250.0,Female
98,Adelie,Dream,33.1,16.1,178.0,2900.0,Female


#### Renaming Columns

In [None]:
# Map each old column name to its new name; columns that are not listed keep their names.
column_renames = {'bill_length_mm': 'bill length (mm)'}
penguins = penguins.rename(columns=column_renames)
penguins

Unnamed: 0,species,island,bill length (mm),bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


#### Adding new columns using arithmetic

Suppose we want an additional column that contains the bill depth in centimeters instead of millimeters. We can achieve this with the following syntax.

In [None]:
# Assigning to a column name that does not exist yet makes pandas create that column.
# Dividing by 10 converts millimeters to centimeters.
penguins['bill depth (cm)'] = penguins['bill_depth_mm'] / 10.0
penguins.head()

Unnamed: 0,species,island,bill length (mm),bill_depth_mm,flipper_length_mm,body_mass_g,sex,bill depth (cm)
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1.87
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,1.74
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,1.8
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,1.93
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,2.06
