# An Assortment of Utility functions

### Data Analysis functions

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def check_missing_data(df):
    '''
    Reports every column of df that contains at least one missing value.

    For each such column, prints the number of missing entries and the
    percentage of rows they represent. When no column has missing values,
    prints a single notice instead. Nothing is returned.
    '''
    null_counts = df.isnull().sum()
    flagged = null_counts[null_counts > 0]

    # Guard clause: nothing to report.
    if flagged.empty:
        print("No missing data found.")
        return

    row_total = len(df)
    print("Missing data found:")
    for column, count in flagged.items():
        percent_missing = (count / row_total) * 100
        print(f"Column '{column}': {count} missing values ({percent_missing:.2f}% of total)")

In [None]:
def draw_correlation_matrix(data, title='Correlation Matrix - Titanic Dataset'):
    '''
    Plots an annotated heatmap of the pairwise correlations between the
    numeric and boolean columns of the dataset.

    data: DataFrame to analyse. Columns of any other dtype (strings,
        categories, ...) are left out of the correlation.
    title: heading drawn above the heatmap. The default keeps the title this
        function has always used, so existing callers see no change.

    Displays the figure with plt.show(); nothing is returned.
    '''

    # Select the numeric columns explicitly instead of relying on corr() to
    # skip the rest: since pandas 2.0 DataFrame.corr() defaults to
    # numeric_only=False and raises on string columns.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    correlation_matrix = numeric_data.corr()

    plt.figure(figsize=(14, 12))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title(title)
    plt.show()

In [None]:
def print_unique_values(df, column_name):
    '''
    Shows the distinct values found in column_name.

    Prints a header line naming the column, followed by the array returned
    by the column's unique() on the next line. Nothing is returned.
    '''
    distinct = df[column_name].unique()
    header = f"Unique values in column '{column_name}':"
    print(header, distinct, sep="\n")

In [None]:
def display_col_distribution(df, column_name):
    '''
    Draws and shows a histogram of the values in column_name.

    The histogram always uses 20 bins; the x axis is labelled with the
    column name and the y axis with 'Frequency'. Nothing is returned.
    '''
    plt.figure(figsize=(8, 6))

    # Histogram appearance is fixed; change the bin count here if needed.
    hist_style = dict(bins=20, color='skyblue', edgecolor='black')
    values = df[column_name]
    plt.hist(values, **hist_style)

    # Titles, axis labels and grid.
    plt.title(f'Distribution of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.grid(True)

    plt.show()

In [4]:
from tabulate import tabulate
def get_column_value_statistics(df, column_name):
    '''
    Prints a two-column table of summary statistics for column_name.

    Rows, in order: mean, std (standard deviation), min, the 5%, 25%, 50%
    (median), 75%, 90%, 95% and 99% percentiles, and max.
    Nothing is returned.
    '''
    values = df[column_name]

    # Built step by step so the table rows keep this exact order.
    summary = {"mean": values.mean(), "std": values.std(), "min": values.min()}
    for label, fraction in (("5%", 0.05), ("25%", 0.25)):
        summary[label] = values.quantile(fraction)
    summary["50%"] = values.median()
    for label, fraction in (("75%", 0.75), ("90%", 0.90), ("95%", 0.95), ("99%", 0.99)):
        summary[label] = values.quantile(fraction)
    summary["max"] = values.max()

    table = tabulate(summary.items(), headers=["Statistic", "Value"], tablefmt="plain")
    print(f'{column_name} stats:\n{table}')

In [None]:
def print_outliers(df, column_name, threshold=10):
    '''
    Prints the values of column_name that fall outside the IQR fences
    [Q1 - threshold * IQR, Q3 + threshold * IQR].

    threshold: multiplier applied to the interquartile range. The default of
        10 is much wider than the conventional 1.5, so only extreme values
        are listed.

    Prints a header line followed by the outliers as a Python list.
    Nothing is returned.
    '''
    values = df[column_name]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - threshold * spread
    too_high = values > third_quartile + threshold * spread
    outliers = values[too_low | too_high].tolist()

    print("Outliers in column '{}':".format(column_name))
    print(outliers)


### Data processing functions

In [None]:
def remove_outliers(df, column_name, threshold=10):
    '''
    Returns the rows of df whose column_name value lies inside the IQR fences
    [Q1 - threshold * IQR, Q3 + threshold * IQR] (both ends inclusive).

    Warning: the whole row is dropped even when column_name is the only
    column holding an outlier in that row. Rows where column_name is missing
    are dropped as well, because NaN fails both comparisons.
    df itself is left untouched; the original index labels are kept.
    '''
    values = df[column_name]
    q_low, q_high = values.quantile(0.25), values.quantile(0.75)
    spread = q_high - q_low

    floor_value = q_low - threshold * spread
    ceiling_value = q_high + threshold * spread

    within_fences = (values >= floor_value) & (values <= ceiling_value)
    return df[within_fences]

In [None]:
def winsorize_outliers(df, column_name, lower_pct=0.03, upper_pct=0.97):
    '''
    Returns a copy of df in which column_name is capped to its
    [lower_pct, upper_pct] quantile range.

    Values below the lower_pct quantile are raised to that quantile and
    values above the upper_pct quantile are lowered to that quantile; all
    other columns and the input df are left unchanged.
    '''
    result = df.copy()

    floor_value = result[column_name].quantile(lower_pct)
    cap_value = result[column_name].quantile(upper_pct)
    result[column_name] = result[column_name].clip(lower=floor_value, upper=cap_value)

    return result

In [None]:
def winsorize2_outliers(df, column_name, lower_pct=0.01, upper_pct=0.99):
    '''
    Returns a copy of df with column_name clipped to the values found at its
    lower_pct and upper_pct quantiles (1% and 99% by default).

    Anything below the lower quantile is set to it and anything above the
    upper quantile is set to it. The input df is not modified.
    '''
    source = df[column_name]
    limits = {
        "lower": source.quantile(lower_pct),
        "upper": source.quantile(upper_pct),
    }

    capped = df.copy()
    capped[column_name] = source.clip(**limits)
    return capped

In [None]:
def discretize_column(df, column_name, ranges, labels):
    '''
    Returns a copy of df in which column_name is replaced by the label of the
    bin each value falls into.

    ranges: ascending cut points [e1, e2, ..., en]. Any iterable is accepted
        (list, tuple, numpy array, ...).
    Bins are closed on the left and open on the right, with unbounded outer
    bins:
        below e1, [e1, e2), ..., [e(n-1), en), en and above
    labels: one label per bin, so it must have length n + 1, where n is the
        length of ranges.
        NOTE(review): earlier guidance for this function said not to use
        strings as labels. pd.cut itself accepts them, so this is presumably
        a requirement of downstream code - confirm before relying on it.

    Missing values stay missing. The input df is not modified.
    '''

    # Unpack rather than concatenate with "+": list + tuple raises TypeError,
    # and list + numpy array broadcasts element-wise, yielding NaN bin edges.
    bins = [-float('inf'), *ranges, float('inf')]

    # Discretize on a copy so the caller's DataFrame keeps its raw values.
    df_discretized = df.copy()
    df_discretized[column_name] = pd.cut(df[column_name], bins=bins, labels=labels, include_lowest=True, right=False)

    return df_discretized