In this notebook, I will share some functions that I generally use for the EDA step. Data analysis is an iterative process: you need to visualize features at every step. If you have many features, you should define some functions for easy reuse. 


This notebook is just a template for me.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)

In [None]:
# Load the example datasets used by the plotting helpers in this notebook.
# Paths are relative "../input/..." locations; each trailing comment gives the
# task type and, where a later cell uses one, the target/column plotted.
titanic = pd.read_csv("../input/titanic/train.csv") # classification (target used below: "Survived")
house_prices = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv") # regression (target used below: "SalePrice")
creditcard = pd.read_csv("../input/creditcardfraud/creditcard.csv") # classification (target used below: "Class")
heart = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv") # classification (target used below: "output")
kc_house = pd.read_csv("../input/housesalesprediction/kc_house_data.csv") # regression (target used below: "price")
customers = pd.read_csv("../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv") # clustering (not used by the cells visible here)
diabetes = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv") # classification (target used below: "Outcome")
videogames_sales = pd.read_csv("../input/videogamesales/vgsales.csv") # used below for the "Global_Sales" distribution
mushroom = pd.read_csv("../input/mushroom-classification/mushrooms.csv") # classification (target used below: "class")
mobile = pd.read_csv("../input/mobile-price-classification/train.csv") # classification (target used below: "price_range")

# 1) Target Distribution For Classification

In [None]:
def target_classification(df, target):
    """Show a pie chart of how often each value of ``df[target]`` occurs.

    Args:
        df: DataFrame holding the data.
        target: Name of the column whose value counts are plotted; it is
            also used as the chart title.

    The chart has one slice per entry of ``df[target].value_counts()``,
    labelled with the value and its share formatted to two decimals.
    Slice colours are taken from a fixed five-colour palette, truncated to
    the number of slices. The figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(6, 6))

    # Count once and reuse the result for both the wedge sizes and labels.
    class_counts = df[target].value_counts()
    class_names = class_counts.index.tolist()

    base_colors = ["#0EB8F1", "#F1480F", "#971194", "#FEE715", "#101820"]
    slice_colors = base_colors[: len(class_names)]

    ax.pie(
        class_counts,
        labels=class_names,
        autopct='%1.2f%%',
        startangle=180,
        colors=slice_colors,
    )

    ax.set_title(target)
    plt.show()

In [None]:
target_classification(titanic, "Survived")

In [None]:
target_classification(mobile, "price_range")

In [None]:
target_classification(creditcard, "Class")

# 2) Plotting Target with Two Features

In [None]:
def two_feature_classification(df, target, f1, f2):
    """Scatter ``f1`` against ``f2`` with one colour/marker per target class.

    Args:
        df: DataFrame holding the data.
        target: Name of the class-label column.
        f1: Name of the column plotted on the x axis.
        f2: Name of the column plotted on the y axis.

    Each distinct non-null value of ``df[target]`` gets its own scatter
    series, and the legend entry is the class value that was actually used
    to select the points. (Previously the legend text came from
    ``value_counts()`` ordering while the points were selected with
    hard-coded ``== 0`` / ``== 1`` masks, so the two could disagree.)
    For a 0/1 target the look is unchanged: class 0 is drawn as blue
    circles and class 1 as orange "X" markers. Colours and markers repeat
    if there are more classes than palette entries.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.set_facecolor("#393838")

    palette = ["#0EB8F1", "#F1480F", "#971194", "#FEE715", "#101820"]
    markers = ["o", "X", "s", "^", "D"]

    # Sorted class values give a stable class -> style assignment that does
    # not depend on which class happens to be more frequent.
    classes = sorted(df[target].dropna().unique())

    for idx, cls in enumerate(classes):
        mask = df[target] == cls
        ax.scatter(
            df.loc[mask, f1],
            df.loc[mask, f2],
            label=cls,
            alpha=1,
            linewidth=0,
            c=palette[idx % len(palette)],
            marker=markers[idx % len(markers)],
        )

    ax.set_title("Distribution of " + target + " w.r.t " + f1 + " and " + f2)
    ax.set_xlabel(f1)
    ax.set_ylabel(f2)
    ax.legend()
    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.show()

In [None]:
two_feature_classification(creditcard, "Class", "V8", "V11")

In [None]:
two_feature_classification(heart, "output", "thalachh", "oldpeak")

In [None]:
two_feature_classification(diabetes, "Outcome", "BMI", "Glucose")

# 3) Target Distribution For Regression

In [None]:
def feature_distribution(df, col):
    """Show a KDE plot, a boxplot and a probability plot for one column.

    Args:
        df: DataFrame holding the data.
        col: Name of the numeric column to inspect.

    The first panel's title reports the column's skewness and kurtosis
    (pandas ``skew()`` / ``kurtosis()``, rounded to 3 decimals). The third
    panel is ``scipy.stats.probplot`` of the column with missing values
    removed, so NaNs in the data do not break the fitted reference line.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Imported locally because scipy is not among the notebook-level imports.
    from scipy import stats

    skewness = np.round(df[col].skew(), 3)
    kurtosis = np.round(df[col].kurtosis(), 3)

    fig, axes = plt.subplots(1, 3, figsize=(21, 7))

    sns.kdeplot(data=df, x=col, fill=True, ax=axes[0], color="#603F83", linewidth=2)
    sns.boxplot(data=df, y=col, ax=axes[1], color="#603F83",
                linewidth=2, flierprops=dict(marker="x", markersize=3.5))
    # probplot sorts the raw values and fits a line through them; NaNs would
    # propagate into that fit, so drop them first.
    stats.probplot(df[col].dropna(), plot=axes[2])

    axes[0].set_title("Distribution \nSkewness: " + str(skewness) + "\nKurtosis: " + str(kurtosis))
    axes[1].set_title("Boxplot")
    axes[2].set_title("Probability Plot")
    fig.suptitle("For Feature:  " + col)

    for ax in axes:
        ax.set_facecolor("#C7D3D4FF")
        ax.grid(linewidth=0.1)

    # Restyle the two lines probplot drew: [0] is the sample points,
    # [1] is the fitted reference line.
    sample_points, fit_line = axes[2].get_lines()[:2]
    sample_points.set_markerfacecolor('#8157AE')
    sample_points.set_markeredgecolor('#603F83')
    sample_points.set_markeredgewidth(0.1)
    fit_line.set_color('#F1480F')
    fit_line.set_linewidth(3)

    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.show()

In [None]:
feature_distribution(house_prices, "SalePrice")

In [None]:
feature_distribution(videogames_sales, "Global_Sales")

In [None]:
feature_distribution(kc_house, "price")

# 4) Categorical Variable - Categorical Target

In [None]:
def count_percentage(df, col, hue):
    """Plot counts and within-category shares of ``hue`` for each ``col`` value.

    Args:
        df: DataFrame holding the data.
        col: Name of the categorical feature shown on the x axis.
        hue: Name of the categorical column used to split the bars.

    Left panel: a count plot of ``col`` split by ``hue``.
    Right panel: for every ``col`` category, the normalised share of each
    ``hue`` value (``groupby(col)[hue].value_counts(normalize=True)``),
    with the y axis fixed to 0-1 and each bar annotated with its percentage.

    Missing values in ``col`` are left out of the category order. Nothing
    is returned and ``plt.show()`` is not called.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(22, 6))

    # dropna() keeps sorted() from comparing NaN with the real categories.
    order = sorted(df[col].dropna().unique())
    palette = ["#0EB8F1", "#F1480F", "#971194", "#FEE715", "#101820"]
    hue_colors = palette[: df[hue].nunique()]

    # x must be passed by keyword: the first positional parameter of
    # seaborn's categorical plots is `data`, not `x`.
    sns.countplot(x=col, data=df, hue=hue, ax=ax1, order=order, palette=hue_colors)
    ax1.set_title("Counts For Feature:\n" + col)

    shares = (
        df.groupby(col)[hue]
        .value_counts(normalize=True)
        .rename("percentage")
        .reset_index()
    )

    sns.barplot(x=col, y="percentage", hue=hue, data=shares, ax=ax2,
                order=order, palette=hue_colors)
    ax2.set_ylim(0, 1)

    fontsize = 14 if len(order) <= 10 else 8
    for bar in ax2.patches:
        height = bar.get_height()
        # Skip patches that draw no visible bar (NaN height for a missing
        # col/hue combination, or zero width) instead of labelling them.
        if np.isnan(height) or bar.get_width() == 0:
            continue
        ax2.text(
            bar.get_x() + bar.get_width() / 2,
            height + 0.02,
            "{:.1f}".format(height * 100) + "%",
            fontsize=fontsize,
            ha="center",
        )

    ax2.set_title("Percentages For Feature: \n" + col)
    plt.setp(ax1.get_xticklabels(), rotation=70, horizontalalignment='right')
    plt.setp(ax2.get_xticklabels(), rotation=70, horizontalalignment='right')

    for ax in [ax1, ax2]:
        ax.set_facecolor("#C7D3D4FF")
        ax.grid(linewidth=0.1)

In [None]:
count_percentage(titanic, "Sex", "Survived")

In [None]:
count_percentage(mobile, "touch_screen", "price_range")

In [None]:
count_percentage(mushroom, "gill-color", "class")

# 5) Numerical Variable - Categorical Target

In [None]:
def feature_dist_clas(df, col, hue):
    """Compare the distribution of a numeric column across target classes.

    Args:
        df: DataFrame holding the data.
        col: Name of the numeric feature to plot.
        hue: Name of the categorical column used to split the plots.

    Draws four panels side by side, each split by ``hue``: histogram,
    KDE plot, boxplot and violin plot. All four panels use the same
    colours: the fixed palette truncated to ``df[hue].nunique()`` entries.
    (Previously the boxplot sized its palette from ``unique()``, which
    also counts NaN, so it could differ from the other panels.)

    Nothing is returned and ``plt.show()`` is not called.
    """
    fig, axes = plt.subplots(1, 4, figsize=(25, 5))

    palette = ["#0EB8F1", "#F1480F", "#971194", "#FEE715", "#101820"]
    hue_colors = palette[: df[hue].nunique()]

    # The box and violin plots need an x variable to dodge by hue; a constant
    # empty-string category puts all hue levels in one unlabeled group.
    single_group = [""] * len(df)

    sns.histplot(x=col, hue=hue, data=df, ax=axes[0], palette=hue_colors,
                 edgecolor="black", linewidth=0.5)
    sns.kdeplot(x=col, hue=hue, data=df, fill=True, ax=axes[1],
                palette=hue_colors, linewidth=2)
    sns.boxplot(y=col, hue=hue, data=df, x=single_group, ax=axes[2],
                palette=hue_colors, linewidth=2,
                flierprops=dict(marker="x", markersize=3.5))
    sns.violinplot(y=col, hue=hue, data=df, x=single_group, ax=axes[3],
                   palette=hue_colors)

    fig.suptitle("For Feature:  " + col)
    axes[0].set_title("Histogram For Feature " + col)
    axes[1].set_title("KDE Plot For Feature " + col)
    axes[2].set_title("Boxplot For Feature " + col)
    axes[3].set_title("Violinplot For Feature " + col)

    for ax in axes:
        ax.set_facecolor("#C7D3D4FF")
        ax.grid(linewidth=0.1)

In [None]:
feature_dist_clas(diabetes, "BloodPressure", "Outcome")

In [None]:
feature_dist_clas(heart, "chol", "output")

In [None]:
feature_dist_clas(titanic, "Fare", "Survived")

In [None]:
feature_dist_clas(mobile, "battery_power", "price_range")

# 6) Categorical Variable - Numerical Target

In [None]:
def bar_box(df, col, target):
    """Plot category counts and per-category target boxplots side by side.

    Args:
        df: DataFrame holding the data.
        col: Name of the categorical feature shown on the (shared) x axis.
        target: Name of the numeric column summarised by the boxplots.

    Left panel: count plot of ``col``. Right panel: boxplot of ``target``
    for each ``col`` category. Both panels use the same sorted category
    order; missing values in ``col`` are excluded from that order so that
    sorting does not fail on columns mixing strings and NaN.

    Nothing is returned and ``plt.show()`` is not called.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharex=True)

    order = sorted(df[col].dropna().unique())
    # NOTE(review): "#F1480F" appears twice in this palette, so two
    # categories can end up with the same colour.
    palette = ["#0EB8F1", "#F1480F", "#971194", "#FEE715", "#101820", "#008B97", "#F1480F",
               "#9D9301",  "#4C00FF", "#FF007B", "#00EAFF", "#9736FF", "#FFEE00", "#8992F3",
               "#282828", "#FFEF63", "#80004C", "#CFF839"]
    category_colors = palette[: len(order)]

    sns.countplot(data=df, x=col, ax=axes[0], order=order, palette=category_colors)
    sns.boxplot(data=df, x=col, ax=axes[1], y=target, order=order, palette=category_colors,
                flierprops=dict(marker="x", markersize=3.5))

    fig.suptitle("For Feature:  " + col)
    axes[0].set_title("Countplot For " + col)
    axes[1].set_title(col + " --- " + target)

    for ax in axes:
        ax.set_facecolor("#C7D3D4FF")
        ax.grid(linewidth=0.1)
        # Rotate via the Axes API rather than switching pyplot's current axes.
        ax.tick_params(axis="x", labelrotation=90)

In [None]:
bar_box(house_prices, "MSSubClass", "SalePrice")

In [None]:
bar_box(kc_house, "grade", "price")

# 7) Numerical Variable - Numerical Target

In [None]:
def _correlation_color(corr):
    """Return the single-element colour list used for a correlation value.

    Reds are used for non-negative correlations and blues for negative
    ones, with separate shades at the 0.3 and 0.7 magnitude thresholds.
    A NaN correlation fails every comparison and gets the final fallback
    colour.
    """
    if corr >= 0.7:
        return ["#EB0000"]
    if corr >= 0.3:
        return ["#800000"]
    if corr >= 0:
        return ["#FF6363"]
    if corr <= -0.7:
        return ["#000EAA"]
    if corr <= -0.3:
        return ["#3845D3"]
    return ["#6CAAFA"]


def plot_scatter(df, col, target):
    """Scatter ``col`` against ``target``, coloured by their correlation.

    Args:
        df: DataFrame holding the data.
        col: Name of the numeric feature plotted on the x axis.
        target: Name of the numeric column plotted on the y axis.

    The correlation between the two columns (pandas default method) is
    shown in the title, rounded to 4 decimals, and selects the marker
    colour via ``_correlation_color``.

    Nothing is returned and ``plt.show()`` is not called.
    """
    # Series.corr gives the scalar directly; the previous
    # DataFrame.corr()[col][1] relied on positional indexing of a
    # label-indexed Series.
    corr = df[col].corr(df[target])
    c = _correlation_color(corr)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_facecolor("#C7D3D4FF")
    ax.grid(linewidth=0.1)

    sns.scatterplot(x=col, y=target, data=df, c=c, ax=ax, edgecolor="black")
    ax.set_title("Correlation between " + col + " and " + target + " is: " + str(np.round(corr, 4)))

In [None]:
plot_scatter(house_prices, "EnclosedPorch", "SalePrice")

In [None]:
plot_scatter(house_prices, "LotArea", "SalePrice")

In [None]:
plot_scatter(kc_house, "sqft_above", "price")

In [None]:
plot_scatter(kc_house, "sqft_living", "price")

In [None]:
feature_distribution(house_prices, "GrLivArea")

In [None]:
feature_distribution(kc_house, "sqft_above")

# 8) Heatmap

In [None]:
def heatmap(df):
    """Draw an annotated correlation heatmap of the numeric columns of ``df``.

    Args:
        df: DataFrame holding the data. Only numeric and boolean columns
            enter the correlation matrix; other columns (e.g. strings) are
            ignored rather than making ``corr()`` fail.

    Cells are annotated with the correlation to two decimals and coloured
    on a fixed -1..1 "coolwarm" scale. Nothing is returned and
    ``plt.show()`` is not called.
    """
    fig, ax = plt.subplots(figsize=(15, 15))

    # Select numeric columns explicitly: newer pandas no longer drops
    # non-numeric columns silently in DataFrame.corr().
    numeric_df = df.select_dtypes(include=["number", "bool"])

    sns.heatmap(numeric_df.corr(), cmap="coolwarm", annot=True, fmt=".2f",
                annot_kws={"fontsize": 9}, vmin=-1, vmax=1, square=True,
                linewidths=0.01, linecolor="black", cbar=False, ax=ax)

    sns.despine(top=True, right=True, left=True, bottom=True)

In [None]:
heatmap(kc_house)