In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy.stats as stats
import missingno as msno

# Silence every warning from here on: no category, message or module filter is
# given, so this also hides deprecation and numerical warnings raised by
# pandas/scipy. NOTE(review): consider narrowing this to specific categories.
warnings.filterwarnings('ignore')

In [17]:
def norm(x: np.ndarray, alpha: float = 0.05) -> bool:
    """Tell whether a sample is compatible with a normal distribution.

    Runs the Shapiro-Wilk test on ``x`` and compares its p-value with ``alpha``.

    Args:
        x: One-dimensional sample. Missing values are not removed here, so
            the caller is expected to drop them first.
        alpha: Significance level; the default of 0.05 keeps the historical
            behaviour of this helper.

    Returns:
        ``True`` when the p-value is strictly greater than ``alpha`` (normality
        is not rejected), ``False`` otherwise. A NaN p-value yields ``False``.
    """
    _, p_value = stats.shapiro(x)
    # Coerce numpy.bool_ to a plain bool so the result matches the annotation.
    return bool(p_value > alpha)


def correlation_ratio(categorical_series, numerical_series):
    """Return the correlation ratio (eta) of a numeric Series given a categorical one.

    eta is the square root of ``SS_between / SS_total``: the share of the
    numeric variable's variance that is explained by the category means.

    Rows where either the category or the numeric value is missing are
    excluded before anything is computed; otherwise a single NaN would turn
    the whole result into NaN.

    Args:
        categorical_series: pandas Series of category labels.
        numerical_series: pandas Series of numeric values sharing the same
            index as ``categorical_series``.

    Returns:
        A float in ``[0, 1]``. ``0.0`` is returned when no complete rows remain
        or when the numeric values have zero variance.
    """
    # Keep pairwise-complete observations only.
    valid = categorical_series.notna() & numerical_series.notna()
    categories = categorical_series[valid]
    values = numerical_series[valid]

    total_mean = values.mean()
    ss_total = ((values - total_mean) ** 2).sum()
    # Covers both the empty sample (sum of nothing is 0) and constant values.
    if not ss_total > 0:
        return 0.0

    # One pass over the data instead of filtering the Series twice per category.
    # observed=True skips unused levels of a categorical dtype.
    per_group = values.groupby(categories, observed=True, sort=False).agg(["count", "mean"])
    ss_between = (per_group["count"] * (per_group["mean"] - total_mean) ** 2).sum()

    # Guard against rounding pushing the ratio marginally above 1.
    eta_squared = min(ss_between / ss_total, 1.0)
    return float(np.sqrt(eta_squared))


def corr_matrix(df: pd.DataFrame, numeric: list[str], nominal: list[str]) -> pd.DataFrame:
    """Compute, plot and return a pairwise association matrix for mixed-type columns.

    The value stored for a pair depends on the column kinds:

    * numeric / numeric: Pearson's r when both columns pass ``norm``
      (Shapiro-Wilk), otherwise Spearman's rho. The coefficient is computed on
      rows where *both* columns are present, so the two samples stay aligned.
    * nominal / nominal: the p-value of the chi-squared independence test on
      the contingency table. NOTE(review): this is a p-value, not an effect
      size, so small numbers mean *strong* evidence of association, which is
      the opposite reading of the other cells.
    * numeric / nominal: ``correlation_ratio`` on pairwise-complete rows.

    The diagonal is 1.0. The matrix is drawn as an annotated heatmap before
    being returned.

    Args:
        df: Source data.
        numeric: Names of the numeric columns.
        nominal: Names of the nominal (categorical) columns.

    Returns:
        Square float DataFrame indexed and labelled by ``numeric + nominal``.
    """
    cols = numeric + nominal
    corr_df = pd.DataFrame(np.nan, index=cols, columns=cols, dtype=float)

    normality: dict[str, bool] = {}

    def _is_normal(col: str) -> bool:
        # The normality verdict depends on the column alone, so test it once.
        if col not in normality:
            normality[col] = norm(df[col].dropna().values)
        return normality[col]

    for position, col1 in enumerate(cols):
        corr_df.loc[col1, col1] = 1.0

        # Every measure used here is symmetric: visit each unordered pair once.
        for col2 in cols[position + 1:]:
            if col1 == col2:
                continue

            if col1 in numeric and col2 in numeric:
                # Drop rows jointly; dropping per column would misalign the samples.
                pair = df[[col1, col2]].dropna()
                if len(pair) < 2:
                    corr_value = np.nan
                elif _is_normal(col1) and _is_normal(col2):
                    corr_value, _ = stats.pearsonr(pair[col1], pair[col2])
                else:
                    corr_value, _ = stats.spearmanr(pair[col1], pair[col2])
            elif col1 in nominal and col2 in nominal:
                contingency_table = pd.crosstab(df[col1], df[col2])
                _, corr_value, _, _ = stats.chi2_contingency(contingency_table)
            else:
                num_col, cat_col = (col1, col2) if col1 in numeric else (col2, col1)
                pair = df[[num_col, cat_col]].dropna()
                # Pass the data itself, not the column names.
                corr_value = correlation_ratio(pair[cat_col], pair[num_col])

            corr_df.loc[col1, col2] = corr_value
            corr_df.loc[col2, col1] = corr_value

    plt.figure(figsize=(10, 8))
    cmap = sns.diverging_palette(220, 20, as_cmap=True)
    sns.heatmap(corr_df, annot=True, fmt=".2f", cmap=cmap, center=0, linewidths=0.5)
    plt.title("Матрица корреляции")
    plt.show()

    return corr_df


def IQR_outliers_remove(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Drop rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are processed in the order given, and each column's quartiles are
    taken from the frame as already filtered by the preceding columns. Rows
    whose value in a processed column is NaN are dropped as well, because a
    NaN never satisfies the range check.

    Args:
        df: Input frame; it is not modified.
        columns: Names of the numeric columns to filter on.

    Returns:
        The filtered frame (``df`` itself when ``columns`` is empty).
    """
    filtered = df
    for name in columns:
        lower_quartile, upper_quartile = (filtered[name].quantile(q) for q in (0.25, 0.75))
        spread = upper_quartile - lower_quartile
        within_fences = filtered[name].between(
            lower_quartile - 1.5 * spread,
            upper_quartile + 1.5 * spread,
        )
        filtered = filtered[within_fences]
    return filtered


In [4]:
# Load the dataset and normalise its headers: lower-case, spaces -> underscores.
df = pd.read_csv("data.csv")
df.columns = [header.lower().replace(" ", "_") for header in df.columns]