# Event log correlations 

In [None]:
### IMPORT ###
from pathlib import Path
import pandas as pd
from scipy.stats import pearsonr, shapiro, kstest, norm, probplot, spearmanr
import matplotlib.pyplot as plt
import seaborn as sns


### LOCAL IMPORT ###
from config import config_reader

In [None]:
### GLOBALS ###
# INPUT: name of the event log file to analyse (set the file name here)
log_file_name = "edu_event_log_PAGE_raw_filtered_terziles-DEF-332-cases.csv"
# Field separator used when reading the event log CSV
csv_sep = ","
# Project configuration; the log directory is taken from its LOG_DIR entry
yaml_config = config_reader.config_read_yaml("config.yml", "config")
log_dir = str(yaml_config["LOG_DIR"])

# FUNCTIONS

In [None]:
def analyse_correlation(df: pd.DataFrame, col1: str, col2: str, qqplot: bool = False) -> None:
    """
    Plot the distributions of two columns and print their Pearson correlation.

    The number of missing values in each column is printed first. Rows where
    either column is missing are then excluded from the plots and from the
    correlation; the caller's dataframe is left untouched because the
    filtered frame is a new object.

    Parameters:
        df (pd.DataFrame): The input dataframe.
        col1 (str): Name of the first column.
        col2 (str): Name of the second column.
        qqplot (bool): If true, also draw a normal Q-Q plot for each column.
            Defaults to False.

    Returns:
        None
    """
    cols = (col1, col2)

    # Report missing values before they are filtered out
    for col in cols:
        print(f"Missing values in {col}: {df[col].isna().sum()}")

    # Keep only the rows where both columns are present
    df = df.dropna(subset=[col1, col2])

    # Histogram with density estimate, one panel per column
    plt.figure(figsize=(12, 5))
    for position, col in enumerate(cols, start=1):
        plt.subplot(1, 2, position)
        sns.histplot(df[col], kde=True)
        plt.title(f"Distribution of '{col}'")
    plt.show()

    # Optional Q-Q plot against the normal distribution, one panel per column
    if qqplot:
        plt.figure(figsize=(12, 5))
        for position, col in enumerate(cols, start=1):
            plt.subplot(1, 2, position)
            probplot(df[col], dist="norm", plot=plt)
            plt.title(f"Q-Q Plot of {col}")
        plt.show()

    # Pearson correlation on the complete pairs
    r, p = pearsonr(df[col1], df[col2])

    # Display the results; very small p-values are reported as "< .001"
    p_text = "< .001" if p < 0.001 else f"= {p:.3f}"
    print(f"Pearson correlation between '{col1}' and '{col2}': r = {r:.3f}, p {p_text}")

In [None]:
def compute_spearman_correlation(df: pd.DataFrame, col1: str, col2: str) -> None:
    """
    Print the Spearman rank correlation between two columns of a dataframe.

    The number of missing values in each column is printed first. Rows where
    either column is missing are then excluded from the computation; the
    caller's dataframe is left untouched because the filtered frame is a new
    object.

    Parameters:
        df (pd.DataFrame): The input dataframe.
        col1 (str): Name of the first column.
        col2 (str): Name of the second column.

    Returns:
        None
    """
    # Report missing values before they are filtered out
    for col in (col1, col2):
        print(f"Missing values in {col}: {df[col].isna().sum()}")

    # Keep only the rows where both columns are present
    complete = df.dropna(subset=[col1, col2])

    # Spearman correlation on the complete pairs
    rho, p = spearmanr(complete[col1], complete[col2])

    # Display the results; very small p-values are reported as "< .001".
    # The Greek letter rho is written as an escape so the message cannot be
    # garbled again by a wrong source-file encoding.
    p_text = "< .001" if p < 0.001 else f"= {p:.3f}"
    print(f"Spearman correlation between '{col1}' and '{col2}' (\u03c1) = {rho:.3f}, p {p_text}")

# MAIN

In [None]:
print(">> Settings")
print("Input file:", log_file_name)
# Locate the event log inside the configured log directory
path_log_file = Path(log_dir) / log_file_name
print("Path file:", path_log_file)

In [None]:
print(">> Reading")
# Force the type of these columns when the CSV is parsed
dic_t = {
    'Case ID': object,
    'CaseLength': int,
    'SUS_Tercile': int,
    'Apprendimento percepito_Tercile': int,
    'UEQ - Overall_Tercile': int,
}
df_log = pd.read_csv(path_log_file, sep=csv_sep, dtype=dic_t)

In [None]:
print(">> Removing columns from event log")
col_del_list = ['Variant', 'Variant index']
# Drop each listed column only if it is present, so the cell can be re-run
# on a dataframe from which the columns were already removed.
for col_del in col_del_list:
    if col_del in df_log.columns:
        print("Removing column:", col_del)
        df_log.drop(columns=col_del, inplace=True)

In [None]:
# Dimensions of the event log as (rows, columns)
df_log.shape

In [None]:
# Number of distinct values in the "Case ID" column
print("Distinct cases:", df_log["Case ID"].nunique())

In [None]:
# Preview the first five rows of the event log
df_log.head(5)

In [None]:
# Column labels of the event log
df_log.columns

## Classes

In [None]:
# Distinct values found in the "Class" column
df_log["Class"].unique()

## Pearson

In [None]:
# Distributions and Pearson correlation: "SUS" vs "Apprendimento percepito" (Q-Q plots off by default)
analyse_correlation(df_log, "SUS", "Apprendimento percepito")

In [None]:
# Distributions and Pearson correlation: "UEQ - Overall" vs "Apprendimento percepito"
analyse_correlation(df_log, "UEQ - Overall", "Apprendimento percepito")

In [None]:
# Distributions and Pearson correlation: "QuizAnswerCorrectRatioOverAll" vs "Apprendimento percepito"
analyse_correlation(df_log, "QuizAnswerCorrectRatioOverAll", "Apprendimento percepito")

## Spearman

In [None]:
# Spearman correlation: "SUS" vs "Apprendimento percepito"
compute_spearman_correlation(df_log, "SUS", "Apprendimento percepito")

In [None]:
# Spearman correlation: "UEQ - Overall" vs "Apprendimento percepito"
compute_spearman_correlation(df_log, "UEQ - Overall", "Apprendimento percepito")

In [None]:
# Spearman correlation: "QuizAnswerCorrectRatioOverAll" vs "Apprendimento percepito"
# (this cell previously repeated the Pearson analysis instead of computing Spearman)
compute_spearman_correlation(df_log, "QuizAnswerCorrectRatioOverAll", "Apprendimento percepito")