# Spurious St. Gallen - Odd Correlations
Rafael James Novotny, Linus Flury, Dominik Castelberg

In [2]:
import math
import os
import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import datetime as dt

## Helpers

In [None]:
# Config
# Threshold on the correlation coefficient used for selecting dataset pairs.
# NOTE(review): this value is not referenced anywhere in the visible code;
# presumably pairs whose correlation exceeds it are selected later - confirm.
conf_correlation_selection_threshold = 0.8

In [None]:

def getcrossminlevel(dataframes1, dataframes2):
    """Return the finest resolution level that both datasets can provide.

    Levels: y: 3, m: 2, d: 1, h: 0, error: -1

    Each argument is one dataset entry, i.e. a sequence whose first four
    items are the hourly, daily, monthly and yearly frames (in that order).

    Returns:
        The coarser (numerically larger) of the two datasets' finest levels,
        or -1 if at least one of the datasets has no usable level at all.
    """
    level1 = getminlevel(dataframes1)
    level2 = getminlevel(dataframes2)
    # A plain max() would hide the error marker: max(-1, 2) == 2, and the
    # caller would then try to correlate against a dataset without data.
    if level1 == -1 or level2 == -1:
        return -1
    return max(level1, level2)


def getminlevel(dataframes):
    """Return the finest resolution level at which a dataset has data.

    Levels: y: 3, m: 2, d: 1, h: 0, error: -1

    Args:
        dataframes: Sequence whose items 0..3 are the frames for the hourly,
            daily, monthly and yearly resolution (in that order).

    Returns:
        The index of the first frame whose top-left cell is not null, or -1
        if no frame qualifies. Only the first row is inspected, so a frame is
        considered populated when its first date value is present.
    """
    for level in range(4):
        frame = dataframes[level]
        # An empty frame (e.g. a header-only CSV) has no cell [0, 0]; treat
        # it as "no data at this level" instead of raising IndexError.
        if frame.empty:
            continue
        if not pd.isnull(frame.iloc[0, 0]):
            return level

    return -1


def cropdates(dataframes1, dataframes2):
    """Crop two frames to the date range that both of them cover.

    The first column of each frame is taken as its date column. Rows are kept
    only if their date lies within [latest start, earliest end] of the two
    frames; rows with a missing date are dropped because they cannot be
    placed in any range.

    Args:
        dataframes1: First frame; column 0 holds the dates.
        dataframes2: Second frame; column 0 holds the dates.

    Returns:
        A tuple ``(df_1, df_2)`` of cropped copies. The inputs are not
        modified. Both results are empty if either frame has no valid date.
    """
    dates1 = dataframes1.iloc[:, 0]
    dates2 = dataframes2.iloc[:, 0]

    # Determine the bounds from the valid dates only: the first/last row may
    # hold a missing value (padding rows), and any comparison against NaT is
    # False, which would silently crop everything away.
    valid1 = dates1.dropna()
    valid2 = dates2.dropna()
    if valid1.empty or valid2.empty:
        return dataframes1.iloc[0:0].copy(), dataframes2.iloc[0:0].copy()

    start = max(valid1.min(), valid2.min())
    end = min(valid1.max(), valid2.max())

    # Filter whole rows (not just the date column) so dates and values stay
    # aligned; missing dates compare False and are dropped here as well.
    keep1 = (dates1 >= start) & (dates1 <= end)
    keep2 = (dates2 >= start) & (dates2 <= end)

    return dataframes1[keep1].copy(), dataframes2[keep2].copy()


def combinedf(df_1, df_2):
    """Join two frames on their first (date) column.

    Each frame's first column becomes its index and is removed from the
    columns; the remaining columns are inner-joined on that index.

    Args:
        df_1: First frame; column 0 is the join key.
        df_2: Second frame; column 0 is the join key.

    Returns:
        A new frame indexed by the shared keys, containing the value columns
        of both inputs, with every row that has a missing value removed.
        The inputs are left untouched.
    """
    def keyed_by_first_column(frame):
        # set_index without inplace returns a new frame, so the caller's
        # frame keeps its original index and columns.
        keyed = frame.set_index(frame.iloc[:, 0])
        keyed = keyed.drop(keyed.columns[[0]], axis=1)
        # Rows without a key cannot be aligned; a merge would otherwise pair
        # every missing key on the left with every missing key on the right.
        return keyed[keyed.index.notna()]

    left = keyed_by_first_column(df_1)
    right = keyed_by_first_column(df_2)
    df = pd.merge(left, right, how='inner', left_index=True, right_index=True)
    return df.dropna()


def loadcsvlevels(path):
    """Read one dataset CSV and split it into its four resolution levels.

    The file is read with ';' as separator. Column pairs (0, 1), (2, 3),
    (4, 5) and (6, 7) are taken as (date, value) for the hourly, daily,
    monthly and yearly level; the date column of each pair is parsed with
    the format of its level.

    Args:
        path: Path of the CSV file.

    Returns:
        A list ``[df_h, df_d, df_m, df_y]`` of two-column frames.
    """
    # (first column of the pair, date format), ordered h, d, m, y so that the
    # list position equals the level number used by getminlevel.
    level_layout = (
        (0, '%Y-%m-%dT%H'),
        (2, '%Y-%m-%d'),
        (4, '%Y-%m'),
        (6, '%Y'),
    )
    raw = pd.read_csv(path, sep=';')
    frames = []
    for first_col, date_format in level_layout:
        frame = raw.iloc[:, [first_col, first_col + 1]].copy()
        date_column = frame.columns[0]
        # Replace the column by label so it really takes the datetime dtype;
        # assigning through iloc may keep the original dtype.
        frame[date_column] = pd.to_datetime(frame[date_column], format=date_format)
        frames.append(frame)
    return frames


directory = './'

directory = os.fsencode(directory)

dataframes_list = []
counter = 0
# Sorted so that the row/column order of the correlation matrix is stable
# across runs (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(directory)):
    filename = os.fsdecode(file)
    # The correlation matrix is written into the working directory as a CSV;
    # it must not be picked up as an input dataset on the next run.
    if filename.endswith(".csv") and filename != 'correlationMatrix.csv':
        # Build the path from the scanned directory rather than relying on
        # it being the current working directory.
        csv_path = os.path.join(os.fsdecode(directory), filename)
        df_h, df_d, df_m, df_y = loadcsvlevels(csv_path)
        dataframes_list.append([df_h, df_d, df_m, df_y, filename])
        counter += 1


# Correlate every pair of datasets once. Only the cells above the diagonal
# (i < j) are written; the diagonal and the lower triangle keep their initial 0.
corrMatrix = np.zeros((counter, counter))
for i in range(counter):
    for j in range(i + 1, counter):
        # Levels: y: 3, m:2, d: 1, h:0, error: -1
        level = getcrossminlevel(dataframes_list[i], dataframes_list[j])
        if level != -1:
            # Compare both datasets at the shared level, restricted to the
            # dates they have in common.
            dframe1, dframe2 = cropdates(
                dataframes_list[i][level],
                dataframes_list[j][level],
            )
            df_for_corr = combinedf(dframe1, dframe2)
            first_values = df_for_corr.iloc[:, 0]
            second_values = df_for_corr.iloc[:, 1]
            corrMatrix[i, j] = first_values.corr(second_values)


np.savetxt('correlationMatrix.csv', corrMatrix, delimiter=';')




In [4]:
# PLOT HELPERS
def plot_stats(df: pd.DataFrame, title: str, x_label: str, y1_label: str, y2_label: str):
    """Plot the 'y1' and 'y2' columns of *df* against its 'x' column.

    'y1' is drawn in blue on the first axes; 'y2' is drawn in red with
    ``secondary_y=True`` on a twin of those axes, so each series gets its
    own y-scale. The figure is shown immediately.

    Args:
        df: Frame with the columns 'x', 'y1' and 'y2'.
        title: Title passed to ``plt.title``.
        x_label: Label of the x-axis.
        y1_label: Y-axis label for the 'y1' series.
        y2_label: Y-axis label for the 'y2' series.
    """
    first_axes = df.plot(x='x', y='y1', color='b')
    first_axes.set_xlabel(x_label)
    first_axes.set_ylabel(y1_label)

    second_axes = df.plot(
        x='x',
        y='y2',
        ax=first_axes.twinx(),
        secondary_y=True,
        color='r',
    )
    second_axes.set_ylabel(y2_label)

    plt.title(title)

    plt.show()

#Sample
#test_df = pd.DataFrame({'x' : ['1991-01-01T02','1991-01-02T02','1991-01-04T02','1991-01-05T02'],'y1' : [10,20,30,40],'y2' : [100,50,-30,-50]})
#plot_stats(test_df, 'This is a Title', 'X axis', 'Y1 axis', 'Y2 axis')