In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Use seaborn's white background with grid lines for every plot below.
sns.set_style('whitegrid')

# Read Data

Today, we will use the CPS1985 data set on the determinants of hourly wages:

https://vincentarelbundock.github.io/Rdatasets/csv/AER/CPS1985.csv

The data description is available here:
https://rdrr.io/cran/AER/man/CPS1985.html

In [None]:
# Download the CPS1985 wage data directly from the Rdatasets mirror.
df_wage = pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/AER/CPS1985.csv"
)

In [None]:
# Peek at the first five rows to check the columns that were loaded.
df_wage.head(n=5)

In [None]:
# Remove the row-label column that ships with the CSV.
# pandas calls a header-less first column 'Unnamed: 0'; some snapshots of the
# file may label it 'rownames' instead (NOTE(review): confirm with
# df_wage.head()).
# Reassigning with errors='ignore' keeps this cell safe to re-run: an in-place
# drop raises KeyError the second time because the column is already gone.
df_wage = df_wage.drop(columns=['Unnamed: 0', 'rownames'], errors='ignore')

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df_wage.describe()

In [None]:
# Pairwise correlations between the numeric columns only.
# The frame also holds string columns (e.g. gender, sector); recent pandas
# versions no longer drop those silently in .corr() and raise instead, so the
# numeric columns are selected explicitly first.
df_wage.select_dtypes(include='number').corr()

# Visualise one variable

## Distribution plot

In [None]:
# Histogram of hourly wage with seaborn's automatic bin selection.
sns.histplot(data=df_wage, x='wage')

In [None]:
# Same histogram with 100 bins to show the finer structure of the distribution.
sns.histplot(data=df_wage, x='wage', bins=100)

In [None]:
# Kernel density estimate of hourly wage (default bandwidth).
sns.kdeplot(data=df_wage, x='wage')

In [None]:
# bw_adjust scales the default bandwidth; 0.1 gives a much less smoothed curve.
sns.kdeplot(data=df_wage, x='wage', bw_adjust=.1)

In [None]:
# Histogram with the KDE curve drawn on top.
sns.histplot(data=df_wage, x='wage', kde=True)

### Distribution plot, multiple

In [None]:
# Numeric columns to profile. A descriptive name is used so that the builtin
# vars() is not shadowed for the rest of the notebook session.
numeric_cols = ['wage', 'age', 'education', 'experience']

# 2x2 grid of panels; flattening the Axes array lets each column be paired
# with exactly one panel.
fig, axes = plt.subplots(2, 2, figsize=(8, 4))

for col, ax in zip(numeric_cols, axes.flatten()):
  sns.histplot(x=col, data=df_wage, ax=ax, kde=True)

# Avoid overlapping tick labels and axis titles between panels.
plt.tight_layout()

## Count plot

In [None]:
# Number of observations per sector, drawn as vertical bars.
sns.countplot(data=df_wage, x='sector')

In [None]:
# Same counts with the category on the y-axis, giving horizontal bars.
sns.countplot(data=df_wage, y='sector')

In [None]:
# Categorical columns to tabulate. A descriptive name is used so that the
# builtin vars() is not shadowed.
categorical_cols = ['gender', 'region', 'ethnicity']

# One panel per column. The grid width follows the list length, so adding or
# removing a column needs no other change. squeeze=False always returns a 2-D
# Axes array, so flatten() works for any number of columns.
fig, axes = plt.subplots(1, len(categorical_cols), figsize=(8, 4),
                         squeeze=False)

for col, ax in zip(categorical_cols, axes.flatten()):
  sns.countplot(x=col, data=df_wage, ax=ax)

# Avoid overlapping tick labels and axis titles between panels.
plt.tight_layout()


# Visualise two variables

## Scatter plot

In [None]:
# Wage against years of education, one marker per observation.
sns.scatterplot(data=df_wage, x='education', y='wage')

In [None]:
# Semi-transparent markers make overlapping points visible as darker areas.
sns.scatterplot(data=df_wage, x='education', y='wage', alpha=.5)

## 2D density plot

In [None]:
# Bivariate KDE of education and wage, drawn as filled contours.
sns.kdeplot(data=df_wage, x="education", y="wage", fill=True)

## Boxplot, violinplot

In [None]:
# Wage distribution per ethnicity group as box-and-whisker plots.
sns.boxplot(data=df_wage, x='ethnicity', y='wage')

In [None]:
# Same comparison as violins, which also show the shape of each distribution.
sns.violinplot(data=df_wage, x='ethnicity', y='wage')

## Barplot

In [None]:
# Bar height is the estimator (here the mean) of wage within each group.
sns.barplot(data=df_wage, x='ethnicity', y='wage', estimator=np.mean)

In [None]:
# Swap the estimator to compare the spread (standard deviation) across groups.
sns.barplot(data=df_wage, x='ethnicity', y='wage', estimator=np.std)

# Adding more layers

## Add a variable to scatter plots

In [None]:
# Open an 8x6 inch figure; the axes-level scatterplot draws into it.
plt.figure(figsize=(8, 6))
# Colour each point by gender (a categorical hue).
sns.scatterplot(
    data=df_wage,
    x="education",
    y="wage",
    hue='gender',
    alpha=.4,
)

In [None]:
# Open an 8x6 inch figure; the axes-level scatterplot draws into it.
plt.figure(figsize=(8, 6))
# Map wage to point colour using the "Greens" palette.
sns.scatterplot(
    data=df_wage,
    x="education",
    y="age",
    hue='wage',
    palette="Greens",
)

In [None]:
# Open an 8x6 inch figure; the axes-level scatterplot draws into it.
plt.figure(figsize=(8, 6))
# Encode wage as marker size; sizes gives the (min, max) marker size range.
sns.scatterplot(
    data=df_wage,
    x="education",
    y="age",
    size='wage',
    sizes=(40, 400),
    alpha=.3,
)

## Colours in categorical plots

In [None]:
# Open an 8x6 inch figure; the axes-level histplot draws into it.
plt.figure(figsize=(8, 6))
# Wage histogram split by gender, with the groups stacked on top of each other.
sns.histplot(
    data=df_wage,
    x="wage",
    hue='gender',
    multiple="stack",
)

In [None]:
# Open an 8x6 inch figure; the axes-level boxplot draws into it.
plt.figure(figsize=(8, 6))
# One box per gender within each ethnicity group.
sns.boxplot(
    data=df_wage,
    x="ethnicity",
    y="wage",
    hue='gender',
)

# Misc

## Joint plot

In [None]:
# Scatter of education vs. wage with marginal distributions on the sides.
# jointplot is a figure-level function and always builds its own figure, so a
# preceding plt.figure() would only leave an empty extra figure in the output.
# The square figure size is set through height (in inches) instead.
sns.jointplot(x='education', y="wage",
              data=df_wage, height=5)

In [None]:
# Joint plot coloured by gender, with semi-transparent markers.
# jointplot is a figure-level function and always builds its own figure, so a
# preceding plt.figure() would only leave an empty extra figure in the output.
# The square figure size is set through height (in inches) instead.
sns.jointplot(x='education', y="wage", hue='gender', alpha=.5,
              data=df_wage, height=5)

In [None]:
# kind='reg' adds a fitted regression line to the joint scatter.
# jointplot is a figure-level function and always builds its own figure, so a
# preceding plt.figure() would only leave an empty extra figure in the output.
# The square figure size is set through height (in inches) instead.
sns.jointplot(x='education', y="wage",
              data=df_wage, kind='reg', height=5)

In [None]:
# kind='hex' bins the points into hexagons, which helps with overplotting.
# jointplot is a figure-level function and always builds its own figure, so a
# preceding plt.figure() would only leave an empty extra figure in the output.
# The square figure size is set through height (in inches) instead.
sns.jointplot(x='education', y="wage",
              data=df_wage, kind='hex', height=5)

In [None]:
# Build the pairwise-relationship grid, then write it to pair.png in the
# current working directory.
pair_grid = sns.pairplot(df_wage)
pair_grid.savefig("pair.png")

In [None]:
# Same pair grid, coloured by gender using the "Set1" palette.
sns.pairplot(data=df_wage, hue='gender', palette="Set1")