# Exploratory Data Analysis

In [None]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

## Import Data

pandas is used to import the data because it is simpler than reading and parsing the file manually with Python's built-in `open()` function.

The file that we will be analysing is the Iris dataset originally from https://archive.ics.uci.edu/ml/datasets/iris.

The features are:
- `sepal_length`: sepal length in cm
- `sepal_width`: sepal width in cm
- `petal_length`: petal length in cm
- `petal_width`: petal width in cm
- `variety`: class of the Iris
    - Setosa
    - Versicolor
    - Virginica
    
Documentation for importing data using pandas: https://pandas.pydata.org/pandas-docs/stable/reference/io.html

In [None]:
# Load the Iris CSV into a DataFrame. The path is absolute and hard-coded.
# NOTE(review): it looks like a Kaggle input directory — adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/iris-flower-dataset/IRIS.csv')
# Show the first rows as a quick check of what was loaded.
df.head()

In [None]:
# Rename the last column to 'variety'; every other column keeps its name.
column_names = list(df.columns[:-1])
column_names.append('variety')
df.columns = column_names

# Keep only the text after the final '-' of each label and capitalise it
# (a label such as 'Iris-setosa' would become 'Setosa').
df['variety'] = df['variety'].apply(
    lambda species: species.rsplit('-', 1)[-1].capitalize()
)

## Describing the Data

The `describe` method computes summary statistics for each feature, ignoring null (NaN) values. Pass `'all'` to the optional `include` parameter to retrieve summaries for both numerical and categorical features. Other optional parameters include `percentiles`, which takes a list of percentiles (as fractions between 0 and 1) whose values should be reported.

In [None]:
# Summary statistics with describe's defaults (numeric columns only for a mixed-type frame).
df.describe()

In [None]:
# Summarise every column (include='all') and report the 10th, 25th, 50th,
# 75th and 90th percentiles instead of the default quartiles.
df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9], include='all')

## Peek into a Sample of the Data

Use `head` and `tail` to retrieve the first and last `n` rows (5 by default). Use `sample` to view `n` randomly selected rows (1 by default).

In [None]:
# First 5 rows (head's default n).
df.head()

In [None]:
# First 2 rows only.
df.head(2)

In [None]:
# Last 5 rows (tail's default n).
df.tail()

In [None]:
# One randomly chosen row (sample's default n); no random_state is set,
# so the row can differ between runs.
df.sample()

In [None]:
# Five randomly chosen rows.
df.sample(5)

## Filter (Query) Data

The `loc` (**loc**ate) indexer selects rows by index label or by a boolean array (mask); an optional second argument selects columns by name.

In [None]:
# Rows whose petal_width exceeds their sepal_width (boolean mask passed to .loc).
df.loc[df['petal_width'] > df['sepal_width']]

In [None]:
# Ten randomly chosen rows of the 'Versicolor' variety.
df.loc[df['variety'] == 'Versicolor'].sample(10)

## Identify Missing Values

The `isna` method returns a DataFrame of boolean values (`True` where a value is missing), and `sum` adds up the `True` values in each column (interpreting them as ones).

Since the data has no missing values, some are simulated — in this case, every `petal_length` greater than 5 cm is replaced with `np.nan` (**n**ot **a** **n**umber, from the NumPy package).

Methods to handle missing data:
- Deletion
- Imputation: Replacing with mean, mode, median, etc.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Work on a copy so the original `df` keeps all of its values.
missing_df = df.copy()

# Simulate missing data: replace every petal_length above 5 cm with NaN.
# `mask` sets entries to NaN where the condition is True and leaves the rest
# untouched. (Assigning back a filtered subset would instead blank out the
# rows that were filtered away, i.e. the values <= 5.)
over_five_cm = missing_df['petal_length'] > 5
missing_df['petal_length'] = missing_df['petal_length'].mask(over_five_cm)

# `count` in the summary now reflects only the remaining non-missing values.
missing_df['petal_length'].describe()

In [None]:
# Number of missing values per column in missing_df.
missing_df.isna().sum()

In [None]:
# (rows, columns) of missing_df before any deletion, as a baseline.
missing_df.shape

### Deletion

`dropna` removes missing values. By default, the `axis` parameter is `0` which stands for row-wise deletion.

In [None]:
# Row-wise deletion: drop every row containing at least one missing value and
# report the resulting (rows, columns). The result is not assigned, so
# missing_df itself is unchanged.
missing_df.dropna(axis=0).shape

In [None]:
# Column-wise deletion: drop every column containing at least one missing value
# and report the resulting (rows, columns). The result is not assigned, so
# missing_df itself is unchanged.
missing_df.dropna(axis=1).shape

### Imputation

`fillna` is useful to populate missing values.

In [None]:
# Mean of petal_length; pandas skips NaN values by default when computing it.
mean = missing_df['petal_length'].mean()
mean

In [None]:
# Mean imputation: fill each missing petal_length with the mean computed above,
# then re-count missing values per column to check that none remain in it.
missing_df['petal_length'] = missing_df['petal_length'].fillna(mean)
missing_df.isna().sum()

## Identify Outliers

One way to remove (filter out) outliers is to drop values that lie more than 3 standard deviations above or below the mean.

Ways to handle outliers:
- Deletion
- Transformation: Natural log
- Imputation (similar to missing values)

In [None]:
def three_sd_range(series):
    """Return the interval spanning three standard deviations around the mean.

    Parameters
    ----------
    series
        Any object exposing ``mean()`` and ``std()``, e.g. a pandas Series
        (whose ``std()`` is the sample standard deviation by default).

    Returns
    -------
    tuple
        ``(low, high)`` where ``low = mean - 3 * sd`` and
        ``high = mean + 3 * sd``.
    """
    centre, spread = series.mean(), series.std()
    half_width = 3 * spread
    return centre - half_width, centre + half_width

In [None]:
# Compute the mean ± 3 SD bounds for sepal_length and display them.
lower, upper = three_sd_range(df['sepal_length'])
lower, upper

In [None]:
# Summary of sepal_length before any outlier filtering, for comparison.
df['sepal_length'].describe()

In [None]:
# Keep only the sepal_length values inside [lower, upper] and summarise them.
# `between` is inclusive on both ends by default, matching `>= lower` and
# `<= upper`, and avoids building a temporary DataFrame just to AND two masks.
within_bounds = df['sepal_length'].between(lower, upper)
df['sepal_length'].loc[within_bounds].describe()

# Feature Engineering

## Encode Labels (Factorise)

In [None]:
# Encode each variety as an integer code. `uniques` holds the distinct
# varieties, ordered so that uniques[code] gives back the original label.
labels, uniques = pd.factorize(df['variety'])
# Store the integer codes alongside the original labels.
df['class'] = labels
df.head()

In [None]:
# Lookup table from integer code to variety name.
dict(enumerate(uniques))

## Bin Values

In [None]:
# Bucket sepal_length into three right-inclusive bins:
#   (0, 5] -> 'Small', (5, 7] -> 'Medium', (7, 100] -> 'Large'.
df['sepal_size'] = pd.cut(df['sepal_length'],
                          bins=[0, 5, 7, 100],
                          labels=['Small', 'Medium', 'Large'])

# Sanity-check the bins by showing the smallest and largest sepal_length in
# each one. `sepal_size` is categorical, so `observed=False` is spelled out:
# it keeps every category in the result and avoids the FutureWarning that
# newer pandas versions emit when the argument is omitted.
df.groupby(by=['sepal_size'], observed=False)['sepal_length'].agg(['min', 'max'])

## Scaling

In [None]:
# Summary of sepal_length before scaling, for comparison with the scaled column.
df['sepal_length'].describe()

In [None]:
# StandardScaler standardises a feature to zero mean and unit variance.
scaler = StandardScaler()

# The scaler expects a 2-D (n_samples, n_features) array, so select the
# column as a one-column frame before converting it to NumPy.
sepal_length_2d = df[['sepal_length']].to_numpy()
df['sepal_length_scaled'] = scaler.fit_transform(sepal_length_2d)

# Summarise the scaled column.
df['sepal_length_scaled'].describe()

Reference: https://www.datacamp.com/community/tutorials/exploratory-data-analysis-python

# Data Visualisation

## Matplotlib

In [None]:
# Bar chart of the number of rows per variety, drawn directly with matplotlib:
# the variety names are the x positions, the counts are the bar heights.
variety_counts = df['variety'].value_counts()
plt.bar(variety_counts.index, variety_counts.tolist())
plt.show()

In [None]:
# Box plot of sepal_length, drawn directly with matplotlib.
plt.boxplot(df['sepal_length'])
plt.show()

In [None]:
# Scatter plot of sepal_length (x) against sepal_width (y), drawn with matplotlib.
plt.scatter(df['sepal_length'], df['sepal_width'])
plt.show()

## pandas

In [None]:
# Number of rows per variety.
df['variety'].value_counts()

In [None]:
# Same bar chart of counts per variety, using the pandas plotting wrapper.
df['variety'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# Box plot of sepal_length, using the pandas plotting wrapper.
df['sepal_length'].plot(kind='box')
plt.show()

In [None]:
# Scatter plot of sepal_length (x) against sepal_width (y) via the pandas
# plotting accessor.
df.plot.scatter(x='sepal_length', y='sepal_width')
plt.show()

## Seaborn

In [None]:
# sns.palplot(sns.color_palette())

In [None]:
# Count of rows per variety, drawn in the first colour of the current palette.
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name clashes with `data=df`.
sns.countplot(x='variety', data=df, color=sns.color_palette()[0])
plt.show()

In [None]:
# Count of rows per variety, split into one bar per sepal_size within each variety.
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name clashes with `data=df`.
sns.countplot(x='variety', hue='sepal_size', data=df)
plt.show()

In [None]:
# Vertical box plot of sepal_length. Passing the column as `y=` puts the
# values on the vertical axis, so no separate `orient` argument is needed.
# A positional column name is avoided because recent seaborn releases accept
# only `data` positionally.
sns.boxplot(y='sepal_length', data=df)
plt.show()

In [None]:
# One sepal_length box per variety, side by side.
sns.boxplot(x='variety', y='sepal_length', data=df)
plt.show()

In [None]:
# Scatter plot of sepal_length (x) against sepal_width (y), drawn with seaborn.
sns.scatterplot(x='sepal_length', y='sepal_width', data=df)
plt.show()

In [None]:
# Same scatter plot, with each point coloured by its variety.
sns.scatterplot(x='sepal_length', y='sepal_width', hue='variety', data=df)
plt.show()