In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style='darkgrid')

## Function for count plot 

In [2]:
def count_plot(x, data):
    """Draw and display a count plot for one column.

    Parameters
    ----------
    x : hashable
        Label of the column in ``data`` whose category counts are plotted.
    data : pandas.DataFrame
        Table holding column ``x`` (passed straight through to seaborn).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    sns.catplot(x=x, data=data, kind='count', aspect=1.3)
    # f-string rather than "+" so a non-string column label (e.g. an int)
    # does not raise TypeError while building the title.
    plt.title(f"Count plot of {x}")
    plt.show()

## Function for histogram and box plot

In [3]:
def hist_box_plot(x, data, bins=30):
    """Show a histogram and a box plot of one column side by side.

    Parameters
    ----------
    x : hashable
        Label of the column in ``data`` to plot.
    data : pandas.DataFrame
        Table holding column ``x`` (passed straight through to seaborn).
    bins : int, default 30
        Forwarded to ``sns.histplot`` as its ``bins`` argument.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))
    sns.histplot(x=x, data=data, ax=hist_ax, bins=bins, color='red')
    sns.boxplot(x=x, data=data, ax=box_ax, color='#c8a2c8')
    # f-string rather than "+" so a non-string column label (e.g. an int)
    # does not raise TypeError while building the title.
    fig.suptitle(f'Histogram and boxplot of {x}', fontsize=16)
    plt.show()

## Function for line plot

In [4]:
def line_plot(x, y, data):
    """Draw and display a line plot of column ``y`` against column ``x``.

    Parameters
    ----------
    x, y : str
        Column labels in ``data`` used for the horizontal and vertical axes.
    data : pandas.DataFrame
        Table holding both columns (passed straight through to seaborn).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    # With no explicit ``ax`` seaborn draws on the current axes and returns it,
    # so labelling through ``axes`` targets the same axes as the pyplot calls.
    axes = sns.lineplot(data=data, x=x, y=y, color='#963634')
    plt.xticks(rotation=45)
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    plt.show()


## Remove features with 80% null values

In [5]:
def null_80(df, threshold=0.8):
    """Drop columns whose share of missing values is at least ``threshold``.

    The original implementation hard-coded a 0.7 cut-off even though the
    function name and the notebook heading both say 80%; the cut-off is now
    a parameter that defaults to 0.8.  Pass ``threshold=0.7`` to get the old
    behaviour.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.  It is not modified.
    threshold : float, default 0.8
        Fraction (0-1) of null entries at or above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame without the mostly-empty columns.
    """
    # Mean of the boolean null mask == fraction of missing values per column.
    missing_fraction = df.isnull().mean()
    columns_to_drop = missing_fraction[missing_fraction >= threshold].index
    return df.drop(columns=columns_to_drop)

## Fill the missing values

In [6]:
def fill_numeric_missing_values(df, columns):
    """Fill missing values in numeric columns with each column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to fix.  The listed columns are overwritten on this object.
    columns : iterable of hashable
        Labels of the columns to fill.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for column in columns:
        column_median = df[column].median()
        # Assign the result back instead of ``df[column].fillna(..., inplace=True)``:
        # that chained in-place call is deprecated and has no effect on ``df``
        # when pandas Copy-on-Write is enabled.
        df[column] = df[column].fillna(column_median)
    return df

In [7]:
def fill_cat_missing_values(df, columns):
    """Fill missing values in categorical columns with each column's mode.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used.  A column with no non-null values has no mode
    and is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to fix.  The listed columns are overwritten on this object.
    columns : iterable of hashable
        Labels of the columns to fill.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for column in columns:
        modes = df[column].mode()
        if modes.empty:
            # Entirely-null column: there is nothing to fill with.
            continue
        # Assign the result back instead of ``df[column].fillna(..., inplace=True)``:
        # that chained in-place call is deprecated and has no effect on ``df``
        # when pandas Copy-on-Write is enabled.
        df[column] = df[column].fillna(modes.iloc[0])
    return df


## Remove outliers from a column 

In [9]:
def remove_outliers_iqr(df, columns, threshold=1.5):
    """Drop rows that are IQR outliers in any of the given columns.

    For each column the accepted range is
    ``[Q1 - threshold * IQR, Q3 + threshold * IQR]`` (bounds inclusive), with
    the quartiles computed on the unfiltered input.  Rows whose value in a
    listed column is NaN fail the range check and are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.  It is not modified.
    columns : str or iterable of hashable
        Numeric column(s) to check.  A single column name is accepted too.
    threshold : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie inside the range for every listed column.
    """
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)
    if not columns:
        return df

    # Quantiles are taken only over the requested columns so that unrelated
    # non-numeric columns in ``df`` cannot make ``quantile`` raise, and both
    # quartiles are computed in a single pass.
    quartiles = df[columns].quantile([0.25, 0.75])
    q1 = quartiles.loc[0.25]
    q3 = quartiles.loc[0.75]
    iqr = q3 - q1
    lower_bounds = q1 - threshold * iqr
    upper_bounds = q3 + threshold * iqr

    filtered = df
    for column in columns:
        in_range = filtered[column].between(lower_bounds[column], upper_bounds[column])
        filtered = filtered[in_range]
    return filtered

## Correlation heatmap

In [10]:
def corr_heatmap(df, columns, threshold=0.8):
    """Plot a lower-triangle correlation heatmap and drop highly correlated columns.

    Absolute pairwise correlations are computed for ``columns``.  The upper
    triangle and the diagonal are masked, so each pair is considered once.
    A column is dropped when its absolute correlation with any column listed
    after it in ``columns`` is at least ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.  It is not modified.
    columns : list of hashable
        Columns to correlate (they must be accepted by ``DataFrame.corr``).
    threshold : float, default 0.8
        Absolute-correlation cut-off at or above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the dropped columns.  The heatmap is shown
        with ``plt.show()`` as a side effect.
    """
    corr_df = df[columns].corr().abs()
    # True on and above the diagonal -> those cells become NaN in ``tri_df``.
    mask = np.triu(np.ones_like(corr_df, dtype=bool))
    tri_df = corr_df.mask(mask)
    # NaN cells compare False, so only genuine lower-triangle values count.
    to_drop = [c for c in tri_df.columns if (tri_df[c] >= threshold).any()]
    reduced_df = df.drop(columns=to_drop)
    sns.heatmap(tri_df, annot=True, fmt='.2g')
    plt.show()
    return reduced_df