### Installing packages

In [None]:
# !pip install pandas
# !pip install seaborn
# !pip install matplotlib

### Importing Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Loading Data

In [None]:
# Read the local CSV and discard its 'Id' column (we don't need the ID)
data = pd.read_csv('Iris.csv').drop(columns='Id')
data

Datasets included in seaborn: 

https://github.com/mwaskom/seaborn-data


In [None]:
# Load the 'iris' example dataset through seaborn.
# This rebinds `data`, replacing the frame read from Iris.csv above.
data = sns.load_dataset('iris')
data

### Distinguish attributes

In [None]:
# Overview of the frame: columns, non-null counts and dtypes
data.info()

In [None]:
# dtype of each column
data.dtypes

In [None]:
data.describe()  # summary statistics; by default only the numeric columns are included

In [None]:
data['species'].unique()  # nominal (categorical) attribute: list its distinct values

In [None]:
# Number of rows per species
data['species'].value_counts()

In [None]:
# One-hot encode the non-numeric columns.
# The dtype is pinned to uint8 because pandas 2.0 changed the default dummy
# dtype to bool, and boolean columns are left out of the default
# onehot_data.describe() summary.
onehot_data = pd.get_dummies(data, dtype='uint8')
onehot_data

In [None]:
# Summary statistics of the one-hot encoded frame
onehot_data.describe()

### Univariate Analysis

In [None]:
# One histogram per column, then enlarge the figure that was just drawn
data.hist(edgecolor='black')
plt.gcf().set_size_inches(12, 6)

In [None]:
# sepal_length values plotted as points for each species
sns.scatterplot(x='species', y='sepal_length', data=data)

In [None]:
sns.set_style('whitegrid')  # switch seaborn to the 'whitegrid' style (gridlines on)

In [None]:
# Box plot of sepal_length for each species
sns.boxplot(x='species', y='sepal_length', data=data)

In [None]:
# Swarm plot of sepal_length for each species
sns.swarmplot(x='species', y='sepal_length', data=data)

In [None]:
# Violin plot of sepal_length for each species
sns.violinplot(x='species', y='sepal_length', data=data)

In [None]:
# Histograms restricted to the rows whose species is 'setosa'
data[data['species']=='setosa'].hist()

In [None]:
# Histogram of sepal_width with 20 bins, colored by species
sns.histplot(data=data, x='sepal_width', hue='species', bins=20)

In [None]:
# Kernel density estimate of sepal_width, one curve per species
sns.kdeplot(data=data, x='sepal_width', hue='species')

In [None]:
# Skewness of sepal_width within each species
data.groupby('species')['sepal_width'].skew()

In [None]:
# Skewness of every remaining column within each species
data.groupby('species').skew()

### Bivariate Analysis

In [None]:
# Grid of pairwise plots across the columns of `data`
sns.pairplot(data=data)

In [None]:
# petal_width against sepal_length; the trailing ';' suppresses the cell's text output
sns.scatterplot(data=data, x="sepal_length", y="petal_width");

In [None]:
# Same scatter with alpha=0.3, so overlapping points show up darker
sns.scatterplot(data=data, x="sepal_length", y="petal_width", alpha=0.3)

In [None]:
# Joint plot of the two variables, drawn as histograms with 10 bins
sns.jointplot(data=data, x="sepal_length", y="petal_width", kind="hist", bins=10)

In [None]:
# Joint plot using hexagonal bins (gridsize=20)
sns.jointplot(data=data, x="sepal_length", y="petal_width", kind="hex", gridsize=20)

In [None]:
# Joint plot using kernel density estimates
sns.jointplot(data=data, x="sepal_length", y="petal_width", kind="kde")

In [None]:
# Pairwise plots again, this time colored by species
sns.pairplot(data=data, hue='species')

In [None]:
# KDE joint plot with a separate density per species
sns.jointplot(data=data, x="sepal_length", y="petal_width", kind="kde", hue='species')

In [None]:
# Pairwise correlation of the numeric columns.
# The string-valued 'species' column is excluded explicitly, because
# DataFrame.corr() raises on non-numeric columns in pandas 2.0 and later.
data.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix.
# Only numeric columns are correlated; 'species' holds strings and would
# make DataFrame.corr() raise in pandas 2.0 and later.
plt.figure(figsize=(7,4))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Same heatmap with the color scale fixed to the full correlation range [-1, 1].
# Only numeric columns are correlated; 'species' holds strings and would
# make DataFrame.corr() raise in pandas 2.0 and later.
plt.figure(figsize=(7,4))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, vmin=-1, vmax=1)

In [None]:
# Heatmap on a fixed [-1, 1] scale with a diverging colormap, so that
# negative and positive correlations get different hues.
# Only numeric columns are correlated; 'species' holds strings and would
# make DataFrame.corr() raise in pandas 2.0 and later.
plt.figure(figsize=(7,4))
sns.heatmap(data.select_dtypes(include='number').corr(),
            annot=True, vmin=-1, vmax=1, cmap='RdBu')

### Outliers

In [None]:
# Box plot per species, with the individual observations drawn on the same axes
ax = sns.boxplot(data=data, x='species', y='sepal_length')
ax = sns.stripplot(data=data, x='species', y='sepal_length',
                   jitter=True, edgecolor='gray', ax=ax)

### Feature Engineering / Modeling

In [None]:
from scipy.stats import linregress

# Simple linear regression of petal_width on sepal_length
x = data['sepal_length']
y = data['petal_width']

res = linregress(x, y)

# Fitted values on the regression line
yhat = res.intercept + res.slope * x

plt.plot(x, y, 'ko', label='observations')
plt.plot(x, yhat, 'r', label='fitted line')
plt.xlabel('sepal_length')
plt.ylabel('petal_width')
plt.legend();  # without a legend the line labels are never displayed

In [None]:
# Residuals: observed values minus the fitted values
resid = y - yhat

In [None]:
# Distribution of the residuals
resid.hist()

In [None]:
# Summary statistics of the residuals
resid.describe()

### Model performance metrics

In [None]:
# Error metrics computed from the residuals of the fitted line.
# NOTE(review): assumes `resid` is a pandas Series (it is built from DataFrame columns).
sq_resid = resid ** 2
abs_resid = resid.abs()

bias = resid.mean()        # mean error
sse = sq_resid.sum()       # sum of squared errors
mse = sq_resid.mean()      # mean squared error
rmse = np.sqrt(mse)        # root mean squared error
mae = abs_resid.mean()     # mean absolute error
ame = abs_resid.max()      # largest absolute error

print(f'BIAS = {bias:.2e}')
print(f'SSE = {sse:.2f}')
print(f'MSE = {mse:.3f}')
print(f'RMSE = {rmse:.3f}')
print(f'MAE = {mae:.3f}')
print(f'AME = {ame:.3f}')

## Presenting Results

In [None]:
# Coefficient of determination: the square of the correlation coefficient from the fit
print(f"R-squared: {res.rvalue**2:.6f}")

In [None]:
# Rough 95% intervals: estimate +/- two standard errors
print(f"slope (95%): {res.slope:.6f} +/- {2*res.stderr:.6f}")
print(f"intercept (95%): {res.intercept:.6f} +/- {2*res.intercept_stderr:.6f}")

Report only the significant digits:

In [None]:
# Same slope interval, printed with two decimals only
print(f"slope (95%): {res.slope:.2f} +/- {2*res.stderr:.2f}")

In [None]:
# Two-sided inverse Student's t-distribution

from scipy.stats import t


def tinv(p, df):
    """Return the two-sided critical value of Student's t-distribution.

    p  -- total probability in both tails (e.g. 0.05 for a 95% interval)
    df -- degrees of freedom
    """
    # t.ppf(p/2, df) is the lower-tail quantile, which is negative for
    # p < 1; abs() turns it into the symmetric positive critical value.
    return abs(t.ppf(p/2, df))

In [None]:
# Critical t value for a 95% interval with len(x)-2 degrees of freedom,
# used in place of the factor 2 from the rough interval
ts = tinv(0.05, len(x)-2)
print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")

In [None]:
# Intercept interval using the same critical t value `ts`
print(f"intercept (95%): {res.intercept:.6f} +/- {ts*res.intercept_stderr:.6f}")

In [None]:
# Scatter plot with a fitted regression line and a 95% confidence band (ci=95)
sns.lmplot(data=data, x="sepal_length", y="petal_width", x_ci='sd', ci=95)

### Other dataset

Time series data - flights

In [None]:
# Load the 'flights' example dataset through seaborn
flights = sns.load_dataset("flights")

flights

In [None]:
# Columns, non-null counts and dtypes of the flights frame
flights.info()

In [None]:
# Passengers per year as a line with an 'sd' band, plus the individual
# observations colored by month.
# NOTE(review): newer seaborn releases deprecate `ci=` in favor of
# `errorbar=` — confirm against the installed version.
sns.lineplot(data=flights, x='year', y='passengers', ci='sd', err_style='band')
sns.scatterplot(data=flights, x='year', y='passengers', hue='month')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)  # Move legend outside figure

In [None]:
# Build one timestamp per row by parsing its "<year> <month>" text,
# and store the result in a new 'datetime' column of flights
year_month = zip(flights['year'], flights['month'])
flights['datetime'] = [pd.to_datetime(f'{yr} {mon}') for yr, mon in year_month]

# Passenger counts over time, with the x axis clipped to 1949-01 .. 1960-12
first_month = pd.to_datetime('1949-1')
last_month = pd.to_datetime('1960-12')
plt.plot(flights['datetime'], flights['passengers'])
plt.xlabel('time')
plt.ylabel('# passengers')
plt.xlim((first_month, last_month));

In [None]:
# Reshape to wide form: one row per year, one column per month.
# The arguments are passed by keyword because DataFrame.pivot only accepts
# keyword arguments from pandas 2.0 on; positional use raises TypeError.
flights_wide = flights.pivot(index="year", columns="month", values="passengers")
flights_wide.head()

In [None]:
# Line plot of the wide-form frame (years on the index, months as columns)
sns.lineplot(data=flights_wide, palette='viridis')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)  # Move legend outside figure

In [None]:
# Line plot of the transposed frame (months on the index, years as columns)
sns.lineplot(data=flights_wide.T)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)  # Move legend outside figure

In [None]:
# Passengers by month, one line per year
g = sns.lineplot(data=flights, x='month', y='passengers', hue='year')
ax = plt.gca()
# Keep only every second x tick to reduce crowding
ax.set_xticks(ax.get_xticks()[::2]);

In [None]:
# Plot each year's time series in its own facet
g = sns.relplot(
    data=flights,
    x="month", y="passengers", col="year", hue="year",
    kind="line", palette="crest", linewidth=4, zorder=5,
    col_wrap=3, height=2, aspect=1.5, legend=False,
)

# Iterate over each subplot to customize further
for year, ax in g.axes_dict.items():

    # Add the title as an annotation within the plot
    ax.text(.8, .85, year, transform=ax.transAxes, fontweight="bold")

    # Plot every year's time series in the background
    sns.lineplot(
        data=flights, x="month", y="passengers", units="year",
        estimator=None, color=".7", linewidth=1, ax=ax,
    )

# Reduce the frequency of the x axis ticks
# `ax` is the last subplot left over from the loop above.
# NOTE(review): presumably the facets share their x axis, so this applies to all of them — confirm.
ax.set_xticks(ax.get_xticks()[::2])

# Tweak the supporting aspects of the plot
g.set_titles("")
g.set_axis_labels("", "Passengers")
g.tight_layout()

In [None]:
quartet = sns.load_dataset("anscombe")

# Scatter plot of each dataset in its own facet (no regression line is drawn here).
# `ci` is not passed: a scatter plot computes no confidence interval, and
# newer seaborn would forward the unknown keyword to matplotlib.
sns.relplot(x="x", y="y", col="dataset", hue="dataset",
            data=quartet, col_wrap=2,
            palette="muted", height=4, s=50, alpha=1);

In [None]:
# Compute the correlation matrix within each dataset separately
quartet.groupby('dataset').corr()

In [None]:
# Show the results of a linear regression within each dataset:
# one facet per dataset, with ci=None so no confidence band is drawn
sns.lmplot(x="x", y="y", col="dataset", hue="dataset", data=quartet,
           col_wrap=2, ci=None, palette="muted", height=4,
           scatter_kws={"s": 50, "alpha": 1});