# Visualization with Matplotlib & Seaborn

This notebook shows how to create basic plots with Matplotlib and Seaborn.
1. Data Preparation
2. Histogram
3. Count Plot
4. Box Plot
5. Violin Plot
6. Boxen Plot
7. Swarm Plot
8. Strip Plot
9. Scatter Plot
10. Bubble Plot
11. Pair Plot
12. Joint Plot (and Hex Plot)
13. Bar Plot
14. Stacked Bar Plot
15. Pie Chart (and Donut Pie Chart)
16. Mosaic Plot
17. Heatmap

## 1. Data Preparation

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the stroke dataset, report its (rows, columns) shape and preview the first rows
csv_path = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(csv_path)
print("Data Shape", data.shape)
data.head()

In [None]:
# The id column is not used in this notebook, so remove it
data = data.drop(columns='id')

In [None]:
# Descriptive statistics of `data` (displayed as the cell output)
data.describe()

In [None]:
# Print the DataFrame summary produced by DataFrame.info() for `data`
data.info()

In [None]:
# Encode the categorical columns with label encoding
from sklearn.preprocessing import LabelEncoder

# Columns that are label-encoded below
cat_cols = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# One encoder object is re-fitted on each column in turn by DataFrame.apply
le = LabelEncoder()
encoded_cols = data[cat_cols].apply(le.fit_transform)
data[cat_cols] = encoded_cols
data.head()

In [None]:
# Keep the column labels of `data`; they are re-attached after imputation,
# when the frame is rebuilt from the imputer's output
data_cols = data.columns

In [None]:
# Impute missing values with KNN
from sklearn.impute import KNNImputer
from numpy import isnan

# Build a KNN imputer with scikit-learn's default settings
imputer = KNNImputer()

# Fit on the dataset and fill its missing values in one call
imputed_values = imputer.fit_transform(data)

# Rebuild a DataFrame from the imputer output and re-attach the saved column labels
data = pd.DataFrame(imputed_values, columns=data_cols)

data.info()

In [None]:
# Reload the raw file so the categorical columns keep their original text labels,
# drop id, and replace bmi with the imputed bmi column from `data`
df = (
    pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
    .drop(columns='id')
    .assign(bmi=data['bmi'])
)

In [None]:
# Preview the first rows of the labelled frame
df.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info() for `df`
df.info()

In [None]:
# Descriptive statistics of `df` (displayed as the cell output)
df.describe()

In [None]:
# Import libraries: Matplotlib & Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 2. Histogram

In [None]:
# Show a histogram of bmi
sns.histplot(data=df, x='bmi')

In [None]:
# Horizontal histogram: pass the variable as y instead of x
sns.histplot(data=df, y='bmi')

In [None]:
# Control the number of bins with the bins argument
sns.histplot(data=df, x='bmi', bins=10)

In [None]:
# Control the width of each bin with the binwidth argument
sns.histplot(data=df, x='bmi', binwidth=10)

In [None]:
# Style the bars: linewidth sets the outline width of each bar,
# color sets the fill colour
sns.histplot(data=df, x='bmi', color="royalblue", linewidth=0.01)

In [None]:
# Draw the bmi histograms of male and female rows on the same axes
male_rows = df[df["gender"] == "Male"]
female_rows = df[df["gender"] == "Female"]
sns.histplot(data=male_rows, x='bmi', color="royalblue", linewidth=0.01)
sns.histplot(data=female_rows, x='bmi', color="red", linewidth=0.01)

The two histograms overlap, which makes them hard to read.

In [None]:
# Make the female histogram semi-transparent (alpha) so the male one stays visible
male_rows = df[df["gender"] == "Male"]
female_rows = df[df["gender"] == "Female"]
sns.histplot(data=male_rows, x='bmi', color="royalblue", linewidth=0.01)
sns.histplot(data=female_rows, x='bmi', color="red", linewidth=0.01, alpha=0.3)

## 3. Countplot

In [None]:
# Count the rows in each work_type category
sns.countplot(data=df, x="work_type", color="royalblue")

In [None]:
# Same plot drawn horizontally: pass the variable as y
sns.countplot(data=df, y='work_type', color="royalblue")
plt.show()

In [None]:
# Split each work_type bar by gender using hue
sns.countplot(data=df, x="work_type", hue="gender")

In [None]:
# Change the hue order
# The labels in hue_order must match the values in the gender column exactly.
# The dataset spells its third category "Other"; listing "Others" instead
# leaves those rows out of the plot and adds an empty legend entry.
sns.countplot(x="work_type", data=df, hue="gender",
              hue_order=["Female","Male","Other"])  # Set hue_order as a list

In [None]:
# Set the order of the categories on the x-axis (order) and within each group (hue_order)
# The labels must match the column values exactly. The dataset spells its third
# gender category "Other"; listing "Others" instead leaves those rows out of
# the plot and adds an empty legend entry.
sns.countplot(x="work_type", data=df,
              hue="gender", hue_order=["Female","Male","Other"],
              order=["Never_worked","children","Govt_job","Self-employed","Private"])  # Set order as a list

In [None]:
# Pick the bar colours from a named palette
sns.countplot(data=df, x="work_type", hue="gender", palette="Set2")

In [None]:
# Choose the seaborn background style
sns.set_style("darkgrid")

# linewidth=0.0 removes the outline around each bar
bar_options = {"hue": "gender", "palette": "Set2", "linewidth": 0.0}
sns.countplot(data=df, x="work_type", **bar_options)

# Add the legend and choose where it is placed
plt.legend(loc='upper right')

Style can be: white, dark, whitegrid, darkgrid, and ticks.

## 4. Boxplot

In [None]:
# Vertical box plot of bmi
sns.boxplot(data=df, y='bmi')

In [None]:
# Horizontal box plot: pass the variable as x
sns.boxplot(data=df, x='bmi')

In [None]:
# One bmi box per smoking status
sns.boxplot(data=df, x='smoking_status', y='bmi')

In [None]:
# palette sets the box colours; linewidth sets the width of the box outlines
plt.figure(figsize=(10, 5))
box_style = {'palette': 'winter', 'linewidth': 0.6}
sns.boxplot(data=df, x='smoking_status', y='bmi', **box_style)

In [None]:
# bmi box plot by smoking status, split further by gender via hue
plt.figure(figsize=(10, 5))
box_style = {'palette': 'winter', 'linewidth': 0.6}
sns.boxplot(data=df, x='smoking_status', y='bmi', hue='gender', **box_style)

## 5. Violin Plot

In [None]:
# Violin plot of bmi by smoking status and gender
plt.figure(figsize=(8, 6))
sns.violinplot(data=df, x="smoking_status", y="bmi", hue='gender',
               linewidth=0.1, palette='winter')

## 6. Boxen Plot

In [None]:
# Boxen plot of bmi by smoking status and gender
plt.figure(figsize=(8, 6))
sns.boxenplot(data=df, x="smoking_status", y="bmi", hue='gender',
              linewidth=0.1, palette='winter')

## 7. Swarm plot

In [None]:
# Swarm plot of bmi by smoking status, coloured by gender
plt.figure(figsize=(10, 8))
sns.swarmplot(data=df, x="smoking_status", y="bmi", hue='gender')

## 8. Strip Plot

In [None]:
# Strip plot of bmi by smoking status, coloured by gender
plt.figure(figsize=(10, 8))
sns.stripplot(data=df, x="smoking_status", y="bmi", hue='gender')

## Putting them together

In [None]:
# Draw the five categorical plot types side by side on a shared y-axis
fig, ax = plt.subplots(nrows=1, ncols=5, sharey=True, figsize=(15,5))

# Each panel pairs a seaborn plotting function with its extra keyword arguments
thin_outline = {'linewidth': 0.1}
panels = [
    (sns.boxplot, thin_outline),
    (sns.violinplot, thin_outline),
    (sns.boxenplot, thin_outline),
    (sns.swarmplot, {}),
    (sns.stripplot, {}),
]
for panel_ax, (draw, extra) in zip(ax, panels):
    draw(x='smoking_status', y='bmi', data=df, ax=panel_ax, **extra)

plt.tight_layout()  # Adjust the spacing between the plots
plt.show()

In [None]:
# 2x2 grid of boxen plots (also called letter-value plots), one per grouping column
fig, ax = plt.subplots(nrows=2, ncols=2, sharey=True, figsize=(14,7))

# ax.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right
group_cols = ['gender', 'work_type', 'Residence_type', 'smoking_status']
for panel_ax, group_col in zip(ax.flat, group_cols):
    sns.boxenplot(x=group_col, y='bmi', data=df, palette='winter',
                  linewidth=0.1, ax=panel_ax)

plt.tight_layout()
plt.show()

## 9. Scatter Plot

In [None]:
# Scatter plot of the relationship between bmi and average glucose level
sns.scatterplot(data=df, x='bmi', y='avg_glucose_level')

In [None]:
# Colour the points by gender
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x='bmi', y='avg_glucose_level', hue='gender')

The dots overlap, which makes them hard to read.

In [None]:
# Make the dots semi-transparent (alpha) so overlapping points remain visible
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df, x='bmi', y='avg_glucose_level', hue='gender', alpha=0.6)

## 10. Bubble Chart

In [None]:
# Bubble chart of three variables: age on the x-axis, glucose level on the
# y-axis, and bmi encoded as the dot size.
plt.figure(figsize=(10, 8))
ax = sns.scatterplot(
    data=df,
    x='age',
    y='avg_glucose_level',
    size='bmi',        # dot size represents the bmi level
    sizes=(10,400),    # smallest and largest dot sizes
)
ax.legend(loc='upper left')

In [None]:
# Add hue to show a fourth variable (gender) on the same bubble chart
plt.figure(figsize=(10, 8))
ax = sns.scatterplot(
    data=df,
    x='age',
    y='avg_glucose_level',
    size='bmi',
    sizes=(10,400),
    hue='gender',
)
ax.legend(loc='upper left')

## 11. Pairplot

In [None]:
# Use pairplot to draw pairwise scatter plots for the dataset
sns.pairplot(df)

In [None]:
# pairplot also accepts hue
sns.pairplot(df, hue='gender')

In [None]:
# Restrict the pair plot to a few variables (gender is kept for the hue)
pair_cols = ['age', 'avg_glucose_level', 'bmi', 'gender']
sns.pairplot(data=df[pair_cols], hue='gender')

## 12. Joint Plot

In [None]:
# A joint plot combines a scatter plot with histograms, so it shows the
# distributions of two continuous variables and their relationship at once.
sns.jointplot(data=df, x='age', y='bmi')

In [None]:
# jointplot also accepts hue
sns.jointplot(data=df, x='age', y='bmi', hue='gender')

In [None]:
# kind='hex' shows the density of the data points;
# darker colors represent high density
sns.jointplot(data=df, x='age', y='bmi', kind='hex')

## 13. Bar Plot

In [None]:
# Bar plot comparing the average bmi between gender categories
sns.barplot(data=df, x='gender', y='bmi')

In [None]:
# saturation adjusts the colour saturation of the bars
sns.barplot(data=df, x='gender', y='bmi', saturation=0.2)

In [None]:
# Pass ci=None to hide the confidence interval
sns.barplot(data=df, x='gender', y='bmi', ci=None, saturation=0.2)

In [None]:
# Write each bar's height (formatted to two decimals) above the bar
gbplot = sns.barplot(data=df, x='gender', y='bmi', saturation=0.2, ci=None)
for bar in gbplot.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    # Anchor the label at the top centre of the bar, shifted up by 10 points
    gbplot.annotate(format(bar_top, '.2f'), (bar_center, bar_top),
                    ha='center', va='center',
                    xytext=(0, 10), textcoords='offset points')

In [None]:
# Raise the upper limit of the y-axis for a better appearance
gbplot = sns.barplot(data=df, x='gender', y='bmi', saturation=0.2, ci=None)
for bar in gbplot.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    # Anchor the label at the top centre of the bar, shifted up by 10 points
    gbplot.annotate(format(bar_top, '.2f'), (bar_center, bar_top),
                    ha='center', va='center',
                    xytext=(0, 10), textcoords='offset points')
plt.ylim(0, 35)
plt.show()

In [None]:
# Add a horizontal reference line at a chosen value
gbplot = sns.barplot(data=df, x='gender', y='bmi', saturation=0.2, ci=None)
for bar in gbplot.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    # Anchor the label at the top centre of the bar, shifted up by 10 points
    gbplot.annotate(format(bar_top, '.2f'), (bar_center, bar_top),
                    ha='center', va='center',
                    xytext=(0, 10), textcoords='offset points')
ax = gbplot
ax.axhline(20, color='red')  # axhline draws the horizontal line at y=20
plt.ylim(0, 35)
plt.show()

In [None]:
# barplot also accepts hue
sns.barplot(data=df, x='gender', y='bmi', hue='stroke', ci=None, saturation=0.2)

In [None]:
# A point plot shows the same relationship in a simpler form
sns.pointplot(data=df, x='gender', y='bmi', hue="stroke")

## 14. Stacked Bar Plot

In [None]:
# Cross-tabulate gender (rows) against residence type (columns)
df_crosstab = pd.crosstab(index=df['gender'], columns=df['Residence_type'])
df_crosstab

In [None]:
# Simple stacked bar plot drawn directly from the crosstab
bar_options = {'stacked': True, 'linewidth': 0.0}
df_crosstab.plot.bar(**bar_options)

In [None]:
# Horizontal version of the stacked bar plot
bar_options = {'stacked': True, 'linewidth': 0.0}
df_crosstab.plot.barh(**bar_options)

In [None]:
# Insert the frequencies in the plot
# DataFrame.plot.bar returns the Axes it drew on. Keep that handle rather than
# calling plt.subplot() afterwards, which only works when pyplot hands back
# the existing axes instead of creating a new, empty one.
ax = df_crosstab.plot.bar(stacked=True, linewidth=0.0)

for rect in ax.patches:
    # Skip empty segments so no label is drawn for a zero count
    if rect.get_height() > 0:
        # Centre of the bar segment
        xval = rect.get_x() + rect.get_width() / 2
        yval = rect.get_y() + rect.get_height() / 2
        value = f"{rect.get_height():.0f}"
        ax.text(xval, yval, value, color="w",
                ha="center", va="center")

plt.show()

In [None]:
# 100% stacked bar plot: normalize='index' turns each row into proportions
residence_share = pd.crosstab(df['gender'], df['Residence_type'], normalize='index')
residence_share.plot.bar(stacked=True)

In [None]:
# Position the legend with bbox_to_anchor coordinates
residence_share = pd.crosstab(df['gender'], df['Residence_type'], normalize='index')
residence_share.plot.bar(stacked=True, linewidth=0.0)
plt.legend(bbox_to_anchor=(1.25, 0.9))

In [None]:
# Insert the row proportions in the plot
# DataFrame.plot.bar returns the Axes it drew on. Keep that handle rather than
# calling plt.subplot() afterwards, which only works when pyplot hands back
# the existing axes instead of creating a new, empty one.
residence_share = pd.crosstab(df['gender'], df['Residence_type'], normalize='index')
ax = residence_share.plot.bar(stacked=True, linewidth=0.0)
ax.legend(bbox_to_anchor=(1.25, 0.9))

for rect in ax.patches:
    # Skip empty segments so no label is drawn for a zero share
    if rect.get_height() > 0:
        # Centre of the bar segment
        xval = rect.get_x() + rect.get_width() / 2
        yval = rect.get_y() + rect.get_height() / 2
        value = f"{rect.get_height():.3f}"
        ax.text(xval, yval, value, color="w", ha="center", va="center")

plt.show()

## 15. Pie Chart

In [None]:
# Create a relative frequency table of smoking_status
# Name the value column explicitly. pandas 2.0 names the result of
# value_counts() "count" rather than after the source column, so wrapping the
# Series in pd.DataFrame() directly gives a column called "count" there and
# the later df_smoke['smoking_status'] lookups fail with a KeyError.
smoking_share = df['smoking_status'].value_counts() / len(df)
df_smoke = pd.DataFrame({'smoking_status': smoking_share})

# Create a column "smoking" holding the category labels (taken from the index)
df_smoke['smoking'] = df_smoke.index
df_smoke

In [None]:
# Basic pie chart of the smoking-status shares
plt.rcParams['figure.figsize'] = (6,6)  # Default figure size used from here on
fig, ax = plt.subplots()
ax.pie(df_smoke['smoking_status'],
       autopct="%1.1f%%",
       labels=df_smoke['smoking'])

plt.show()

In [None]:
# startangle sets the angle at which the first slice begins
plt.rcParams['figure.figsize'] = (6,6)
fig, ax = plt.subplots()
ax.pie(df_smoke['smoking_status'],
       startangle=90,
       autopct="%1.1f%%",
       labels=df_smoke['smoking'])
plt.show()

In [None]:
# Lay the slices out clockwise and enlarge the text
fig, ax = plt.subplots()
pie_options = {
    'labels': df_smoke['smoking'],
    'autopct': "%1.1f%%",
    'startangle': 90,
    'counterclock': False,              # order the categories clockwise
    'textprops': {'size': 'x-large'},   # larger text in the plot
}
ax.pie(df_smoke['smoking_status'], **pie_options)
plt.show()

In [None]:
# Pull the second slice slightly away from the centre
explode = (0, 0.1, 0, 0)  # one offset per slice; only the second is non-zero

fig, ax = plt.subplots()
pie_options = {
    'labels': df_smoke['smoking'],
    'autopct': "%1.1f%%",
    'startangle': 90,
    'counterclock': False,
    'explode': explode,
    'textprops': {'size': 'x-large'},
}
ax.pie(df_smoke['smoking_status'], **pie_options)

plt.title('SMOKING STATUS')  # Plot title
plt.show()

In [None]:
# Donut chart: a pie chart with a white circle drawn over its centre
fig, ax = plt.subplots()
pie_options = {
    'labels': df_smoke['smoking'],
    'autopct': "%1.1f%%",
    'startangle': 90,
    'counterclock': False,
    'textprops': {'size': 'large'},
}
ax.pie(df_smoke['smoking_status'], **pie_options)

# White disc in the middle turns the pie into a donut
donut_hole = plt.Circle((0, 0), 0.35, color='white')
ax.add_artist(donut_hole)

# Text shown in the donut centre: the number of rows in df
sumstr = 'Total = ' + str(len(df))
ax.text(0., 0., sumstr, size=14,
        horizontalalignment='center', verticalalignment='center')

ax.axis('equal')
plt.title("SMOKING STATUS", y=1)
plt.show()

## 16. Mosaic Plot

In [None]:
from statsmodels.graphics.mosaicplot import mosaic

# Mosaic plot of smoking status within each residence type
# NOTE(review): confirm against the statsmodels docs that passing this dict as
# `properties` actually changes the tile edge colour.
mosaic_cols = ['Residence_type','smoking_status']
mosaic(df, mosaic_cols,
       title='Smoking Status by Residence Type',
       gap=0.01,
       properties={'edgecolor':'white'})

plt.show()

## 17. Heatmap

In [None]:
# Create a heatmap of the pairwise correlations
plt.figure(figsize=(8, 6))
# df still holds text columns (it is re-read from the CSV and not encoded).
# pandas 2.0+ raises on df.corr() when non-numeric columns are present, so
# restrict the frame to numeric columns first; select_dtypes works on old and
# new pandas alike.
cor = df.select_dtypes(include='number').corr()
sns.heatmap(cor)

In [None]:
# Choose the colour map and fix the colour-bar range to [-1, 1]
plt.figure(figsize=(8, 6))
sns.heatmap(cor, cmap="BrBG", vmin=-1.0, vmax=1.0)

In [None]:
# annot=True writes each correlation coefficient into its cell
plt.figure(figsize=(8, 6))
sns.heatmap(cor, annot=True, cmap="BrBG", vmin=-1.0, vmax=1.0)

In [None]:
# Hide the upper triangle: np.triu keeps True on and above the diagonal,
# and that boolean array is passed to heatmap as the mask
plt.figure(figsize=(8, 6))
all_true = np.ones_like(cor, dtype=bool)
mask = np.triu(all_true)
sns.heatmap(cor, mask=mask, annot=True,
            cmap="Reds", vmin=-1.0, vmax=1.0)