In [1]:


from corner import corner
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns


def plot_relational_plot(df):
    """Save a scatter plot of math score against reading score.

    Points are coloured by the 'gender' column and given a marker style
    per 'test preparation course' value. The figure is written to
    'relational_plot.png' and then closed; nothing is returned.
    """
    figure, axes = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        data=df,
        x='math score',
        y='reading score',
        hue='gender',
        style='test preparation course',
        ax=axes,
    )
    axes.set(
        title='Math vs Reading Scores by Gender and Test Prep Course',
        xlabel='Math Score',
        ylabel='Reading Score',
    )
    # Work on the explicit figure handle rather than pyplot's current figure.
    figure.tight_layout()
    figure.savefig('relational_plot.png')
    plt.close(figure)


def plot_categorical_plot(df):
    """Save a grouped bar plot of math score by gender.

    Bars are grouped on the x axis by 'gender' and split by
    'test preparation course' via the hue. The figure is written to
    'categorical_plot.png' and then closed; nothing is returned.
    """
    figure, axes = plt.subplots(figsize=(8, 6))
    sns.barplot(
        data=df,
        x='gender',
        y='math score',
        hue='test preparation course',
        ax=axes,
    )
    axes.set(
        title='Average Math Score by Gender and Test Prep Course',
        xlabel='Gender',
        ylabel='Average Math Score',
    )
    # Work on the explicit figure handle rather than pyplot's current figure.
    figure.tight_layout()
    figure.savefig('categorical_plot.png')
    plt.close(figure)


def plot_statistical_plot(df):
    """Save an annotated correlation heatmap of the three score columns.

    The correlation matrix of 'math score', 'reading score' and
    'writing score' is drawn with the 'coolwarm' colour map and cell
    values formatted to two decimals. The figure is written to
    'statistical_plot.png' and then closed; nothing is returned.
    """
    score_columns = ['math score', 'reading score', 'writing score']
    figure, axes = plt.subplots(figsize=(8, 6))
    correlations = df[score_columns].corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=axes)
    axes.set_title('Correlation Heatmap of Scores')
    # Work on the explicit figure handle rather than pyplot's current figure.
    figure.tight_layout()
    figure.savefig('statistical_plot.png')
    plt.close(figure)


def statistical_analysis(df, col: str):
    """Return the first four moments of one numeric column.

    Missing values are dropped once, up front, so that all four statistics
    describe the same observations. Previously pandas skipped NaN for the
    mean and standard deviation while scipy propagated it, so a single
    missing value produced NaN skewness and kurtosis.

    Args:
        df: DataFrame containing the column to analyse.
        col: Name of the column in ``df``.

    Returns:
        A tuple ``(mean, stddev, skew, excess_kurtosis)`` of floats.
        ``stddev`` is the pandas default (sample, ddof=1); ``skew`` and
        ``excess_kurtosis`` are the scipy defaults, with kurtosis using the
        Fisher definition (0 for a normal distribution).
    """
    observed = df[col].dropna()
    values = observed.to_numpy()

    mean = float(observed.mean())
    stddev = float(observed.std())
    skew = float(ss.skew(values))
    excess_kurtosis = float(ss.kurtosis(values))  # Fisher definition
    return mean, stddev, skew, excess_kurtosis


def preprocessing(df):
    """Print a quick overview of the data and return it unchanged.

    Prints the first rows, the ``describe()`` summary, and the correlation
    matrix of the 'math score', 'reading score' and 'writing score'
    columns. The input frame is not modified.

    Args:
        df: DataFrame holding at least the three score columns.

    Returns:
        The same ``df`` object that was passed in.
    """
    score_columns = ['math score', 'reading score', 'writing score']

    print("===== Data Overview =====")
    preview = df.head()
    print(preview, "\n")
    summary = df.describe()
    print(summary, "\n")

    print("===== Correlation Matrix =====")
    correlations = df[score_columns].corr()
    print(correlations, "\n")
    return df


def writing(moments, col):
    """Print the four moments of a column and a plain-language reading.

    Args:
        moments: Sequence ``(mean, stddev, skew, excess_kurtosis)``.
        col: Name of the attribute, used only in the printed text.

    Skewness beyond +/-0.5 is reported as right- or left-skewed, and excess
    kurtosis beyond +/-1 as leptokurtic or platykurtic. Anything in between
    is reported as approximately symmetric or mesokurtic respectively.
    """
    def _classify(value, limit, above, below, within):
        # Three-way split on a symmetric threshold around zero.
        if value > limit:
            return above
        if value < -limit:
            return below
        return within

    mean, stddev, skew, kurt = moments

    summary = (
        f"Mean = {mean:.2f}, "
        f"Standard Deviation = {stddev:.2f}, "
        f"Skewness = {skew:.2f}, and "
        f"Excess Kurtosis = {kurt:.2f}."
    )
    print(f"For the attribute '{col}':")
    print(summary)

    skewness_type = _classify(
        skew, 0.5,
        "right-skewed", "left-skewed", "approximately symmetric",
    )
    kurtosis_type = _classify(
        kurt, 1,
        "leptokurtic (peaked)", "platykurtic (flat)",
        "mesokurtic (normal-like)",
    )

    print(f"The data is {skewness_type} and {kurtosis_type}.\n")


def main():
    """Run the full analysis on 'data.csv'.

    Loads the CSV, prints the overview, writes the relational, statistical
    and categorical plots (in that order), then prints the moments and
    their interpretation for the 'math score' column.
    """
    frame = preprocessing(pd.read_csv('data.csv'))
    target = 'math score'

    plotters = (
        plot_relational_plot,
        plot_statistical_plot,
        plot_categorical_plot,
    )
    for make_plot in plotters:
        make_plot(frame)

    writing(statistical_analysis(frame, target), target)


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


===== Data Overview =====
   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75   

       math score  reading score  writing score
count  1000.00000    1000.000000    1000.000000
mean     66.08900      69.169000   