# Data Exploration with Pandas

## Load the `python` modules

In [None]:
import pandas as pd
import numpy as np
import operator
import seaborn as sns
import matplotlib.pyplot as plt

#from sklearn import datasets
from sklearn.datasets import load_iris

%matplotlib inline

### Load the dataset

In [None]:
# load the Iris dataset that ships with scikit-learn
iris = load_iris()

#### Explore the information within the dataset:

In [None]:
iris

A sample of the dataset's information:

```python
'data': array([[5.1, 3.5, 1.4, 0.2],
              ...
              [5.9, 3. , 5.1, 1.8]]),
              
'target': array([0, 0, 0, ..., 1, 1, 1, ..., 2, 2, 2]),

'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),

'feature_names': ['sepal length (cm)',
                  'sepal width (cm)',
                  'petal length (cm)',
                  'petal width (cm)'],
  
'filename': 'iris.csv',
```

#### Extract some of this information:

In [None]:
# the names of the columns (features)
iris.feature_names

In [None]:
# the names of the target classes, kept in a variable for reuse in later cells.
# (A bare `iris.target_names` expression in the middle of a cell is neither
# displayed nor stored, so only the assignment and the print are needed.)
target_names = iris.target_names
print(target_names)

In [None]:
# the target classes coding is accessed as:
iris.target

## Convert the dataset into a pandas dataframe:

# Table 3.1: Common DataFrame Operations

| DESCRIPTION |CODE EXAMPLES |
|:-------------|:-------------|
|Extract a range of rows using row numbers |`df[2:4]`|
| | `df.iloc[2:4]` |
|Extract a single row using row number | `df.iloc[2]`|
|Extract a range of rows and range of columns | `df.iloc[2:4, 1:4]`|
|Extract a range of rows and specific columns using positional values | `df.iloc[2:4, [1,3]]`|
|Extract specific row(s) and column(s)| `df.iloc[[2,4], [1,3]]` |
|Extract a range of rows using labels | `df['20190601':'20190603']`|
|Extract a single row based on its label | `df.loc['20190601']`|
|Extract specific row(s) using their labels | `df.loc[[date1,date2]]`|
|Extract specific row(s) and column(s) using their labels | `df.loc[[date1,date2], ['A','C']]`|
| | `df.loc[[date1,date2], 'A':'C']` |
|Extract a range of rows and columns using their labels | `df.loc[date1:date2, 'A':'C']` |

In [None]:
# convert the dataset into a pandas dataframe (just a matter of taste!)
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [None]:
iris_df.index

In [None]:
# for each column: True if it contains at least one missing value
iris_df.isnull().any()

In [None]:
# count the missing values in each row (axis=1);
# .loc[:10] keeps index labels 0 through 10 -- label slices include the end point
iris_df.isnull().sum(axis=1).loc[:10]

In [None]:
# number of missing values in each column
iris_df.isnull().sum()

In [None]:
# percentage of missing values in each column (mean of the True/False flags, times 100)
iris_df.isnull().mean()*100

In [None]:
iris_df.values

In [None]:
iris_df.shape

In [None]:
iris_df.dtypes

#### Check if there are duplicate entries:

In [None]:
# boolean Series: True for every row that repeats an earlier row
iris_df.duplicated()

In [None]:
# build the boolean mask of repeated rows, then use it to display only those rows
duplicates = iris_df.duplicated()
iris_df[duplicates]

#### Removing duplicate rows

**`CAUTION`: remove duplicate rows only if it is appropriate for your case study; repeated measurements can be legitimate data.**
```python
iris_df.drop_duplicates(inplace=True)
```

#### Display the FIRST five entries of the dataframe:

In [None]:
iris_df.head()

#### Display the LAST five entries of the dataframe:

In [None]:
iris_df.tail()

#### Change the names of the columns (features):

In [None]:
# replace the original column labels with short names that are valid Python
# identifiers, so columns can also be reached as attributes (iris_df.sepal_length)
iris_features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
iris_df.columns = iris_features

### Add a column of the target classes to the dataframe:

In [None]:
# append the integer class codes (0, 1, 2) as a new column
iris_df["target"] = iris.target
iris_df.head()

### Display selected columns only:

In [None]:
# attribute-style access: works because the column name is a valid Python identifier
iris_df.sepal_length

In [None]:
iris_df['sepal_length']

In [None]:
# indexing with a list of names returns a DataFrame holding just those columns
iris_df[['sepal_length', 'sepal_width']]

In [None]:
# grouped selected columns:
iris_df[iris_features]

### Display selected entries only:

In [None]:
iris_df.index

#### Extract rows based on row number:

In [None]:
iris_df.iloc[0]

In [None]:
iris_df.iloc[1]

In [None]:
# rows at positions 0, 1 and 2 (the end position is excluded)
iris_df.iloc[0:3]

In [None]:
# every third row among positions 0 to 11, i.e. positions 0, 3, 6 and 9
iris_df.iloc[0:12:3]

In [None]:
# negative positions count from the end: this is the last row
iris_df.iloc[-1]

### Slicing Based on Row and Column Numbers

In [None]:
# rows at positions 2-3 and columns at positions 1-3 (end positions excluded)
iris_df.iloc[2:4, 1:4]

In [None]:
# explicit position lists: rows 2 and 4, columns 1 and 3
iris_df.iloc[[2,4], [1,3]]

In [None]:
# .loc selects by label, not position; with the default integer index the label 0 happens to be the first row
iris_df.loc[0]

In [None]:
iris_df.loc[0, ['sepal_length', 'petal_width']]

In [None]:
# label-based slice: unlike .iloc, the end label is included, so this returns rows 0, 1, 2 and 3
iris_df.loc[0:3, ['sepal_length', 'petal_width']]

In [None]:
# both slices are label-based and inclusive: rows 0-3, every column from 'sepal_length' through 'petal_width'
iris_df.loc[0:3, 'sepal_length':'petal_width']

#### Selecting a Single Cell in a DataFrame:

In [None]:
# .at returns one scalar, addressed by row label and column label
iris_df.at[2, 'sepal_length']

In [None]:
iris_df.at[3, 'petal_length']

In [None]:
# same cell lookup through the column Series; the 0 is treated as an index label
iris_df.sepal_length[0]

### Selecting Based on Cell Value

In [None]:
# keep rows satisfying both conditions; & is the element-wise AND and each
# comparison needs its own parentheses because & binds tighter than >
iris_df[(iris_df.sepal_length > 6.8) & (iris_df.petal_length > 6.0)]

In [None]:
iris_df[iris_df.sepal_length > 7.4]

### Add a column for the target classes that contains the names of the species:

#### First way:

In [None]:
# create the 'species' column as a copy of the numeric class codes;
# the codes are turned into species names in the next cell
iris_df['species'] = iris_df['target']
iris_df.head()

In [None]:
# change the 0, 1, 2 coding to the species name.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on iris_df['species']: the in-place form acts on
# a temporary Series, which pandas reports as chained assignment and which
# leaves iris_df unchanged once copy-on-write is active.
iris_df['species'] = iris_df['species'].replace([0, 1, 2], target_names)
iris_df.head()

#### An alternative way:

In [None]:
# start again from the numeric class codes; the names are written in the next cell
iris_df['species_alt'] = iris_df.target
iris_df.head()

In [None]:
# Overwrite each numeric code with its species name through boolean-mask assignment.
# The column is cast to object first: it starts out holding integers, and
# writing strings into an integer column is deprecated in recent pandas releases.
iris_df['species_alt'] = iris_df['species_alt'].astype(object)
iris_df.loc[iris_df.species_alt == 0, 'species_alt'] = 'setosa'  # spelling as in iris.target_names
iris_df.loc[iris_df.species_alt == 1, 'species_alt'] = 'versicolor'
iris_df.loc[iris_df.species_alt == 2, 'species_alt'] = 'virginica'
iris_df.head()

#### A more Pythonic way:

In [None]:
# list comprehension: look up the species name for every class code in iris.target
iris_df['species_alt2'] = [iris.target_names[x] for x in iris.target]
iris_df

### Remove columns or rows:

In [None]:
# drop() returns a new DataFrame without the column (axis=1 means columns);
# the result is only displayed here, so iris_df itself still has the column
iris_df.drop('species_alt2', axis=1)

In [None]:
iris_df.head()

In [None]:
# same idea, picking the last column by position; again the result is not stored
iris_df.drop(iris_df.columns[-1], axis=1)

In [None]:
iris_df.head()

In [None]:
# assigning the result back to iris_df makes the removal of both helper columns permanent
iris_df = iris_df.drop(['species_alt','species_alt2'], axis=1)
iris_df.head()

In [None]:
# one-hot encode the non-numeric columns (here the text column 'species');
# drop_first=False keeps one indicator column per category.
# The encoded frame is only displayed, iris_df is not changed.
pd.get_dummies(iris_df, drop_first=False)

### Descriptive Statistics on the DataFrame

In [None]:
# Check the data type of each field:
iris_df.info()

In [None]:
# column means; numeric_only=True skips the text column 'species', which
# would otherwise make mean() raise a TypeError on pandas >= 2.0
iris_df.mean(numeric_only=True)

In [None]:
# compute the mean for each column (axis=0); numeric_only=True skips the
# text column 'species', which cannot be averaged:
iris_df.mean(axis=0, numeric_only=True)

In [None]:
# compute the mean for each row (axis=1) over the numeric columns only;
# without numeric_only=True the text column 'species' raises a TypeError.
# NOTE: the numeric 'target' code still takes part in this average; use
# iris_df[iris_features].mean(axis=1) to average the four measurements alone.
iris_df.mean(axis=1, numeric_only=True)

In [None]:
iris_df['sepal_length'].mean()

#### Check if the instance is a dataframe or a series:

In [None]:
isinstance(iris_df, pd.DataFrame)

In [None]:
isinstance(iris_df, pd.Series)

#### The function `describe()`

In [None]:
iris_df.describe()

In [None]:
iris_df.sepal_length.describe()

In [None]:
# summary statistics restricted to the first two columns of the describe() table
iris_df.describe().iloc[:, :2]

#### The function `sort_values()`

In [None]:
# sort by sepal_length in increasing order and show the five smallest
iris_df.sort_values(by='sepal_length', ascending=True).head()

In [None]:
# split the rows by species and average every remaining column within each group
iris_df.groupby('species').mean()

### Transpose the dataframe:

In [None]:
iris_df.T

In [None]:
# transpose() is the method form of .T: the former columns become the rows
gg = iris_df.transpose()
gg.head()

In [None]:
gg.index

In [None]:
gg.columns

In [None]:
# in the transposed frame: rows at positions 0-2 and columns at positions 1-4
gg.iloc[0:3, 1:5]

# Plots with Seaborn

### Box plots

A box plot gives an idea of the distribution and skewness of a variable, based on statistical parameters, and indicates the presence of outliers (denoted by circles or dots).

https://seaborn.pydata.org/generated/seaborn.boxplot.html

In [None]:
# pass the values through x= explicitly: a bare positional argument is bound to
# a different parameter depending on the seaborn release (x in older versions,
# data in newer ones), so the keyword keeps the call unambiguous
sns.boxplot(x=iris_df['sepal_length'])

In [None]:
# wide-form input: one box per measurement column
sns.boxplot(data=iris_df[iris_features])

### Kernel density estimate

The kernel density estimate is a plot for visualizing the probability distribution of a continuous variable.

https://seaborn.pydata.org/generated/seaborn.kdeplot.html

In [None]:
sns.kdeplot(iris_df['sepal_length'])

In [None]:
# wide-form input: one density curve per measurement column
sns.kdeplot(data=iris_df[iris_features])

### Violin plot

A violin plot merges the box plot with the kernel density plot, with the shape of the violin representing the frequency distribution.

https://seaborn.pydata.org/generated/seaborn.violinplot.html

In [None]:
sns.violinplot(x='species', y='petal_width', data=iris_df)

In [None]:
# same plot as above, with hue='species' additionally colouring each violin by species
sns.violinplot(x='species', y='petal_width', data=iris_df, hue='species')

In [None]:
sns.violinplot(data=iris_df[iris_features])

### Count plots

Count plots are used to plot categorical variables, with the length of the bars representing the number of observations for each unique value of the variable.

https://seaborn.pydata.org/generated/seaborn.countplot.html

In [None]:
# one bar per species, its height being the number of rows with that species
sns.countplot(x=iris_df['species'])

### Plot pair-wise the features of the dataframe

A pair plot is one that shows bivariate relationships between all possible pairs of variables in the dataset.

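Notice that you do not have to supply any column names as arguments, since by default all the numeric variables in the dataset are considered automatically for plotting. In the cell below, `vars` is nevertheless used to restrict the plot to the four measurement columns, so that the numeric `target` code is left out.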

https://seaborn.pydata.org/generated/seaborn.pairplot.html

In [None]:
# vars limits the grid to the first four columns of iris_df; hue colours the points by species
sns.pairplot(iris_df, vars = iris_df.columns[0:4], hue='species', palette='husl')

### Joint plot

The joint plot displays the relationship between two variables as well as the individual distribution of the variables.

https://seaborn.pydata.org/generated/seaborn.jointplot.html

In [None]:
sns.jointplot(x='sepal_length', y='petal_width', hue='species', data=iris_df)

In [None]:
# kind='scatter' is jointplot's default, so this reproduces the previous plot with the kind spelled out
sns.jointplot(x='sepal_length', y='petal_width', hue='species', data=iris_df, kind='scatter')

In [None]:
sns.jointplot(x='sepal_length', y='petal_width', hue='species', data=iris_df, kind='hist')

In [None]:
sns.jointplot(x='sepal_length', y='petal_width', hue='species', data=iris_df, kind='kde')

### Correlation matrix in heatmap format

https://seaborn.pydata.org/generated/seaborn.heatmap.html

In [None]:
# pairwise correlations of the four measurements, drawn as an annotated
# heatmap on a dedicated 12x9-inch figure
f, ax = plt.subplots(figsize=(12, 9))
feature_corr = iris_df[iris_features].corr()
sns.heatmap(
    feature_corr,
    annot=True,      # print the coefficient inside each cell
    cmap='YlGnBu',
    square=True,     # keep the cells square
    ax=ax,           # draw on the axes created above
)

### Strip plot

A strip plot is similar to a scatter plot. The difference lies in the type of variables used in a strip plot. While a scatter plot has both variables as continuous, a strip plot plots one categorical variable against one continuous variable.

https://seaborn.pydata.org/generated/seaborn.stripplot.html

In [None]:
sns.stripplot(x='species', y='sepal_length', data=iris_df)

In [None]:
sns.stripplot(data=iris_df[iris_features])

### Swarm plot

A swarm plot is similar to a strip plot, the difference being that the points in a swarm plot are not overlapping like those in a strip plot. With the points more spread out, we get a better idea of the distribution of the continuous variable.

https://seaborn.pydata.org/generated/seaborn.swarmplot.html

In [None]:
sns.swarmplot(x='species', y='sepal_length', data=iris_df)

### Distribution plot

https://seaborn.pydata.org/generated/seaborn.displot.html

In [None]:
# histogram (displot's default kind) with a kernel density curve overlaid (kde=True)
sns.displot(data=iris_df, x='petal_length', kde=True)

In [None]:
# giving both x and y produces a bivariate (two-dimensional) histogram
sns.displot(data=iris_df, x='petal_length', y='sepal_length')

In [None]:
sns.displot(data=iris_df, x='petal_length', y='sepal_length', kind="kde")

In [None]:
# bivariate density plot; rug=True adds tick marks along the axes for the individual observations.
# The object returned by displot is kept in g.
g = sns.displot(data=iris_df, x='petal_length', y='sepal_length', kind='kde', rug=True)

In [None]:
sns.displot(data=iris_df, x='petal_length', hue='species', kind='kde')

In [None]:
# histogram split by species; multiple='stack' stacks the species bars on top of each other instead of overlaying them
sns.displot(data=iris_df, x='petal_length', hue='species', multiple='stack')