Imports

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import base64
import scipy.stats

Methods

In [97]:
# Dist TU
def dist_T_U(col1, col2, a1, a2):
    """Compare column ``a2`` between two groups with a t-test or a Mann-Whitney U test.

    Each group's ``a2`` column has its missing values replaced by 0 and is then
    checked with ``scipy.stats.normaltest``. If both p-values exceed 0.05 an
    independent two-sample t-test is run, otherwise a Mann-Whitney U test.

    Parameters
    ----------
    col1, col2 : objects indexable by ``a2`` that return something with
        ``.fillna`` (in this notebook: the per-group DataFrames).
    a1 : label of the grouping attribute; only copied into the result row.
    a2 : name of the column to compare.

    Returns
    -------
    list
        ``[a1, a2, test name, test statistic, both-normal flag, p-value,
        p-value <= 0.05]``.
    """
    samples = []
    normalityP = []
    for frame in (col1, col2):
        # NOTE(review): missing values become 0 instead of being dropped, so they
        # take part in both the normality check and the comparison - confirm intended.
        values = np.array(frame[a2].fillna(0))
        samples.append(values)
        normalityP.append(
            scipy.stats.normaltest(np.nan_to_num(values), axis=0, nan_policy='propagate')[1]
        )

    firstSample, secondSample = samples
    bothNormal = normalityP[0] > 0.05 and normalityP[1] > 0.05

    if bothNormal:
        typeOfTest = "T-TEST"
        R, P = scipy.stats.ttest_ind(firstSample, secondSample)
    else:
        typeOfTest = "U-TEST"
        R, P = scipy.stats.mannwhitneyu(firstSample, secondSample)

    return [a1, a2, typeOfTest, R, bothNormal, P, P <= 0.05]

# ANOVA
def dist_ANOVA_KRUSKAL(data, a1, a2):
    """Compare column ``a2`` across all groups with one-way ANOVA or Kruskal-Wallis.

    Each group's ``a2`` column has its missing values replaced by 0 and is then
    checked with ``scipy.stats.normaltest``. If every group's p-value exceeds
    0.05 a one-way ANOVA (``f_oneway``) is run, otherwise a Kruskal-Wallis test.
    All groups in ``data`` take part in the test, however many there are.

    Parameters
    ----------
    data : dict whose values are indexable by ``a2`` and return something with
        ``.fillna`` (in this notebook: one DataFrame per category).
    a1 : label of the grouping attribute; only copied into the result row.
    a2 : name of the column to compare.

    Returns
    -------
    list
        ``[a1, a2, test name, test statistic, all-normal flag, p-value,
        p-value <= 0.05]``.

    Raises
    ------
    ValueError
        If ``data`` holds fewer than two groups.
    """
    # NOTE(review): missing values become 0 instead of being dropped, so they
    # take part in both the normality check and the comparison - confirm intended.
    arrOfData = [np.array(d[a2].fillna(0)) for d in data.values()]

    if len(arrOfData) < 2:
        raise ValueError(
            "dist_ANOVA_KRUSKAL needs at least two groups, got %d" % len(arrOfData)
        )

    normalities = [
        scipy.stats.normaltest(np.nan_to_num(rowData), axis=0, nan_policy='propagate')[1]
        for rowData in arrOfData
    ]

    normal = all(number > 0.05 for number in normalities)
    # Pass every group: indexing [0], [1], [2] failed for two groups and
    # silently ignored any group beyond the third.
    if normal:
        R, P = scipy.stats.f_oneway(*arrOfData)
        typeOfTest = "ANOVA-TEST"
    else:
        R, P = scipy.stats.kruskal(*arrOfData)
        typeOfTest = "KRUSKAL_WALLIS-TEST"

    tableRow = [a1, a2, typeOfTest, R, normal, P, P <= 0.05]
    return tableRow

Reading data

In [98]:
# Load the workbook once; every grouping below is a filtered view of this frame.
df = pd.read_excel('data/BazaIzobrazevanje.xls')

# Two-way split on the first letter of "Spol".
# na=False: a missing cell would otherwise put NaN into the boolean mask, and
# pandas refuses to index with a mask containing NaN. Such rows now simply
# belong to neither group.
dfGender = {'man': df[df["Spol"].str.startswith("m", na=False)],
                    'woman': df[df["Spol"].str.startswith("ž", na=False)]
                    }

# Two-way split on the first letter of "Rac_doma_pred" ('d...' vs 'n...'),
# with the same handling of missing cells.
dfRac_doma = {'da': df[df["Rac_doma_pred"].str.startswith("d", na=False)],
                      'ne': df[df["Rac_doma_pred"].str.startswith("n", na=False)]
                    }

# Score columns compared in every test below.
# NOTE(review): names suggest pre-test / post-test results for modules 1-4 - confirm.
modules = ["M1Predtest", "M2Predtest", "M3Predtest", "M4Predtest", "M1Potest", "M2Potest", "M3Potest", "M4Potest"]

Dist TU

In [99]:
boxplots1 = []
table1 = []

# One entry per two-group comparison:
# (group frames, the two group keys in plotting order,
#  attribute label for the table, prefix for plot title and file name).
twoGroupComparisons = [
    (dfGender, ['man', 'woman'], "Gender", "gender"),
    (dfRac_doma, ['da', 'ne'], "Rac_doma_pred", "Rac_doma"),
]

# Outer loop over comparisons, inner loop over modules: all gender rows/plots
# are produced first, then all Rac_doma ones.
for groupFrames, groupNames, attrLabel, plotPrefix in twoGroupComparisons:
    firstGroup = groupFrames[groupNames[0]]
    secondGroup = groupFrames[groupNames[1]]
    for module in modules:
        # Statistical comparison -> one row of the summary table.
        tableRow = dist_T_U(firstGroup, secondGroup, attrLabel, module)
        table1.append(tableRow)

        # Matching boxplot, saved to disk and remembered for the HTML report.
        plotPath = "modules/" + plotPrefix + "_" + module + ".png"
        ax = sns.boxplot(data=[firstGroup[module], secondGroup[module]])
        ax.set_xticklabels(list(groupNames))
        plt.title(plotPrefix + " - " + module)
        plt.savefig(plotPath)
        plt.close()  # release the figure so figures do not pile up across iterations
        boxplots1.append(plotPath)

# Plotly table of the two-group test results.
# table1 holds one list per test; go.Table wants one sequence per column,
# hence the transpose.
T1Header = ["Atribut1", "Atribut2", "Uporabljen test", "Vrednost", "Normalna porazdelitev", "P", "Razlika med skupinama"]
T1Cells = np.array(table1).T
T1 = go.Figure(data=[go.Table(header={"values": T1Header}, cells={"values": T1Cells})])
T1.update_layout(margin={"r": 5, "l": 5, "t": 1, "b": 1}, height=350)

# Build one sub-frame per distinct "Uporaba_rac_pred" answer.
# Missing answers are skipped: NaN is not a valid prefix for str.startswith,
# and na=False keeps NaN out of the boolean mask so it can be used for indexing.
dfUporaba_Rac_Pred_vals = df['Uporaba_rac_pred'].dropna().unique()
dfUporaba_Rac_Pred = dict()

for value in dfUporaba_Rac_Pred_vals:
    # NOTE(review): this is a prefix match, so an answer that is a prefix of
    # another answer also collects that other answer's rows - confirm that
    # exact equality was not intended.
    dfUporaba_Rac_Pred[value] = df[df["Uporaba_rac_pred"].str.startswith(value, na=False)]


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... continuing anyway, n=18


kurtosistest only valid for n>=20 ... c

ANOVA

In [100]:
table2 = []
boxplots2 = []

# Categories shown in each boxplot, in plotting order; they must be keys of
# dfUporaba_Rac_Pred.
usageGroups = ['nikoli', 'vsak dan', 'vsaj enkrat na teden']

for module in modules:
    # Statistical comparison across the groups -> one row of the summary table.
    tableRow = dist_ANOVA_KRUSKAL(dfUporaba_Rac_Pred, "Uporaba_rac_pred", module)
    table2.append(tableRow)

    # Matching boxplot, saved to disk and remembered for the HTML report.
    plotPath = "modules/Uporaba_rac_" + module + ".png"
    ax3 = sns.boxplot(data=[dfUporaba_Rac_Pred[group][module] for group in usageGroups])
    ax3.set_xticklabels(list(usageGroups))
    plt.title("Uporaba_rac - " + module)
    plt.savefig(plotPath)
    plt.close()  # release the figure so figures do not pile up across iterations
    boxplots2.append(plotPath)

# Plotly table of the multi-group test results.
# table2 holds one list per test; go.Table wants one sequence per column,
# hence the transpose.
T2Header = ["Atribut1", "Atribut2", "Uporabljen test", "Vrednost", "Normalna porazdelitev", "P", "Razlika med skupinama"]
T2Cells = np.array(table2).T
T2 = go.Figure(data=[go.Table(header={"values": T2Header}, cells={"values": T2Cells})])
T2.update_layout(margin={"r": 5, "l": 5, "t": 1, "b": 1}, height=350)

# Write the report: each results table followed by its boxplots, embedded
# inline as base64 data URIs so the HTML file is self-contained.
with open("html/t-test_anova.html", "w") as f:
    for table, plotPaths in ((T1, boxplots1), (T2, boxplots2)):
        f.write(table.to_html(full_html=False, include_plotlyjs='cdn'))
        for p in plotPaths:
            # Context manager closes each image file right after reading it.
            with open(p, 'rb') as imageFile:
                data = imageFile.read()
            data_base64 = base64.b64encode(data).decode()  # bytes -> base64 text
            # The plots are saved as .png, so the data URI must say image/png.
            f.write('<img src="data:image/png;base64,' + data_base64 + '">')





kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anyway, n=8


kurtosistest only valid for n>=20 ... continuing anywa