Data path and imports

In [161]:
import pandas as pd
from scipy.stats import chisquare
from scipy.stats import chi2_contingency
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import numpy as np
import plotly.graph_objects as go
# Relative path of the Excel workbook that is loaded with pd.read_excel below.
DATA_PATH = "data/BazaIzobrazevanje.xls"

Setting the significance level and reading the linearity labels

In [162]:
# Significance threshold compared against the p-values computed further down.
sig = 0.05

# type.txt holds one space-separated record per line; the second field is the
# label that later selects the correlation test ("lin" -> Pearson, anything
# else -> Spearman). Labels are consumed in file order, one per attribute pair.
linearities = []
with open("type.txt", "r") as f:
    for line in f:
        record = line.strip()
        if not record:
            # A blank line (e.g. a trailing empty line) carries no label and
            # previously crashed the cell with IndexError; skip it.
            continue
        # A non-blank line without a second field still raises IndexError on
        # purpose: silently dropping it would shift every later label.
        linearities.append(record.split(" ")[1])

Reading XLS file

In [163]:
# Load the workbook at DATA_PATH into a DataFrame using pandas' default options.
df = pd.read_excel(io=DATA_PATH)

Building columns comparison

In [164]:
# Labels of every column from position 12 onward, as a plain Python list.
columns = list(df.columns[12:])

Preparing cross table array

In [165]:
# Attributes that are cross-tabulated against "Spol", in report order.
spol_targets = [
    "Rac_doma_pred",
    "Rac_doma_sedaj",
    "Internet_pred",
    "Internet_sedaj",
    "Namen1_IskanjeInf",
    "Namen2_Komuniciranje",
    "Namen3_Nakupovanje",
    "Namen4_eStoritve",
    "E_posta",
    "Koristnost_tecaja",
    "Dodatno_izpop",
    "Priporocilo_brezpos",
    "Vsec_nacin",
]

# Each entry is (row attribute, column attribute, crosstab). margins=True adds
# the "All" totals as the last row and last column of every crosstab.
ct = [
    ("Spol", target, pd.crosstab(df["Spol"], df[target], margins=True))
    for target in spol_targets
]

Iterating cross tables

In [166]:
# Experiment with scipy.stats.chisquare: one result row per crosstab in ct,
# stored as (row attribute, column attribute, statistic, p-value, p <= sig).
table_1 = []
for row_attr, col_attr, crosstab in ct:
    try:
        # First two rows x first two columns of the crosstab. This is the table
        # without its "All" margins only when both attributes have exactly two
        # categories -- TODO confirm for every attribute listed in ct.
        observed = crosstab.iloc[0:2, 0:2].values
        # axis=None flattens the counts into a single sample, so this is a
        # goodness-of-fit test against chisquare's default expected
        # frequencies, not a test of independence.
        ch, p = chisquare(observed, axis=None)
        table_1.append((row_attr, col_attr, ch, p, p <= sig))
    except Exception as exc:
        # Keep the best-effort behaviour (skip the pair, continue), but no
        # longer swallow KeyboardInterrupt/SystemExit, and report the cause.
        print("Skipping " + row_attr + " - " + col_attr + ": " + repr(exc))

# Chi-square test of independence for every crosstab in ct, stored as
# (row attribute, column attribute, statistic, p-value, p <= sig).
table_2 = []
for row_attr, col_attr, crosstab in ct:
    try:
        # The crosstabs are built with margins=True, so their last row and last
        # column hold the "All" totals. Passing those to chi2_contingency
        # inflates the degrees of freedom (4 instead of 1 for a 2x2 table) and
        # therefore distorts the p-value; test the observed counts only.
        observed = crosstab.iloc[:-1, :-1]
        stat, p, _, _ = chi2_contingency(observed, correction=False)
        table_2.append((row_attr, col_attr, stat, p, p <= sig))
    except Exception as exc:
        # Keep the best-effort behaviour (skip the pair, continue), but no
        # longer swallow KeyboardInterrupt/SystemExit, and report the cause.
        print("Skipping " + row_attr + " - " + col_attr + ": " + repr(exc))

Computing pairwise correlations

In [167]:
# Correlate every ordered pair of distinct columns taken from positions 2..9.
# The label at linearities[counter] selects the test for the counter-th pair:
# "lin" -> Pearson, anything else -> Spearman.
correlations = []
cor_headers = df.columns[2:10]
ordered_pairs = [
    (first, second)
    for first in cor_headers
    for second in cor_headers
    if first != second
]

counter = 0
for first, second in ordered_pairs:
    if linearities[counter] == "lin":
        method_name, corr_test = "pearson", pearsonr
    else:
        method_name, corr_test = "spearman", spearmanr
    outcome = corr_test(df[first], df[second])  # (coefficient, p-value)
    correlations.append((first, second, outcome[0], outcome[1], method_name))
    counter += 1

Building tables

In [168]:
# Correlation table: one row per entry of `correlations`, laid out as
# attribute 1, attribute 2, test name, coefficient, p-value, significance flag.
# The flag uses p <= sig, the same criterion as table_1 and table_2
# (it was p < sig here before).
correlation_rows = [
    [attr_a, attr_b, method, coef, p_value, p_value <= sig]
    for attr_a, attr_b, coef, p_value, method in correlations
]
# go.Table expects one sequence per column, hence the transpose. NumPy coerces
# the mixed-type rows to a common dtype on the way.
T2 = go.Figure(data=[
    go.Table(header=dict(values=["Atribut1", "Atribut2", "Korelacijski koef", "R", "P", "Povezava"]),
             cells=dict(values=np.array(correlation_rows).T))])
T2.update_layout(margin=dict(r=5, l=5, t=3, b=0))

# chi2_contingency table: the tuples in table_2 are already in column order.
contingency_rows = [list(row) for row in table_2]
T3 = go.Figure(data=[
    go.Table(header=dict(values=["Atribut1", "Atribut2", "Hi2", "P", "Povezava"]),
             cells=dict(values=np.array(contingency_rows).T))])
T3.update_layout(margin=dict(r=5, l=5, t=1, b=2))

AttributeError: module 'plotly.validators.table' has no attribute 'LegendgroupValidator'

Writing HTML file

In [None]:
# Write the report: the chi2_contingency summary (T3), one small table per
# crosstab in ct, then the correlation table (T2).
# encoding="utf-8" is explicit because the report contains non-ASCII text
# ("Moški", "Ženske"); the platform default encoding is not guaranteed to
# handle it.
with open("correlation.html", "w", encoding="utf-8") as f:
    #f.write("Poskus funkcije 'Chisquare'")
    #f.write(T1.to_html(full_html=False, include_plotlyjs='cdn'))
    f.write("Poskus funkcije 'chi2_contingency'")
    f.write(T3.to_html(full_html=False, include_plotlyjs='cdn'))
    for row_attr, col_attr, crosstab in ct:
        # Rows of the crosstab as a plain array. The cell layout below reads
        # the first three rows and columns as two categories plus the totals
        # -- assumes both attributes are binary; TODO confirm.
        ct_info = crosstab.iloc[0:5].values
        ct_temp = go.Figure(data=[
            go.Table(header=dict(values=[row_attr + "/" + col_attr, "Da", "Ne", "Skupaj"]),
                     cells=dict(values=[["Moški", "Ženske", "Skupaj"],
                                        [ct_info[0][0], ct_info[1][0], ct_info[2][0]],
                                        [ct_info[0][1], ct_info[1][1], ct_info[2][1]],
                                        [ct_info[0][2], ct_info[1][2], ct_info[2][2]]]))])
        ct_temp.update_layout(height=100, margin=dict(r=5, l=5, t=1, b=1))

        f.write(ct_temp.to_html(full_html=False, include_plotlyjs='cdn'))

    f.write("Pearsonov test")
    f.write(T2.to_html(full_html=False, include_plotlyjs='cdn'))