In [None]:
from csv import DictReader
from collections import Counter
from functools import partial
from math import isnan
from pathlib import Path
import re
import sys

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
from pandas import DataFrame, read_csv, concat
import seaborn as sns


# Use the interactive notebook backend for matplotlib figures.
%matplotlib notebook
# Pretty-print floating point values with 4 digits of precision.
%precision 4

# Directory the notebook reads prs.csv from; the path is relative, so it
# is resolved against the kernel's current working directory.
BASEDIR = Path('../data')

In [None]:
# Read the whole prs.csv table into memory. low_memory=False makes pandas
# parse the file in one pass instead of chunk by chunk.
prs_path = BASEDIR / 'prs.csv'
with prs_path.open() as fd:
    df = read_csv(fd, low_memory=False)

# Show (rows, columns) as the cell output.
df.shape

In [None]:
# Split the table by ReportType. For each type keep only the columns
# that hold at least one non-null value among that type's rows.
report_types = df.ReportType.unique()

documents_by_type = {}
for report_type in report_types:
    rows_of_type = df.loc[df.ReportType == report_type]
    documents_by_type[report_type] = rows_of_type.dropna(axis=1, how='all')


In [None]:
# For every report type, print the type followed by the number of
# distinct Project_Code values found in its rows.
for t, docs in documents_by_type.items():
    n_project_codes = len(docs.Project_Code.unique())
    print(t, n_project_codes)

In [None]:
# Presence matrix per report type: one row per column of the type's
# frame, one entry per row of that frame, True where the value is not null.
exists_by_type = \
    {t: documents_by_type[t].notna().values.T
     for t in documents_by_type
    }

# Column names per report type, in the same order as the rows of the
# presence matrix. The plotting loop below needs them as tick labels;
# without this definition the cell fails with a NameError on a fresh kernel.
labels_by_type = \
    {t: documents_by_type[t].columns.values
     for t in documents_by_type
    }

# Pairwise correlation between the presence rows. Rows that never vary
# (e.g. a column that is always filled) have zero variance and come out
# as NaN. atleast_2d keeps the result a matrix even when a type has a
# single column, where corrcoef would otherwise return a scalar.
cross_corr_by_type = \
    {t: np.atleast_2d(np.corrcoef(exists_by_type[t]))
     for t in exists_by_type
    }

# One large heatmap per report type.
for t in cross_corr_by_type:
    fig = plt.figure(figsize=(20,20))
    m = cross_corr_by_type[t]
    # The tick labels must line up one-to-one with the matrix rows.
    assert m.shape[0] == len(labels_by_type[t]), \
        f'{t}: {m.shape[0]} matrix rows vs {len(labels_by_type[t])} labels'
    _ = sns.heatmap(m, xticklabels=labels_by_type[t], yticklabels=labels_by_type[t])
    _ = plt.xticks(rotation=70)
    _ = plt.title(f'{t} ({len(labels_by_type[t])} labels)')

In [None]:
# Bind the per-type arrays this cell (and the cells after it) work on.
# `t` is the report type left over from the last loop that ran above;
# assign another key of documents_by_type to `t` to inspect a different
# type. Without these bindings `e` and `labels` are undefined on a
# fresh kernel.
e = exists_by_type[t]
m = cross_corr_by_type[t]
labels = documents_by_type[t].columns.values

# Number of non-null entries per column of the selected type.
counts = np.sum(e, axis=1)

indexes = np.arange(m.shape[0])

width = .8
# Start a new figure: with the interactive notebook backend the previous
# heatmap figure is still current and would otherwise receive the bars.
fig = plt.figure()
plt.bar(indexes, counts, width)
plt.xticks(indexes - 0.5 * width, labels, rotation=70)
plt.show()

In [None]:

# Keep only the entries whose count, scaled by the largest count, lies
# inside the closed interval [lb, ub].
lb, ub = .45, .99
scaled = counts / counts.max()
idx = np.logical_and(scaled >= lb, scaled <= ub)

width = .8
indexes = np.arange(idx.sum())
kept_labels = labels[idx]

# Bar chart of the counts that survived the filter.
fig = plt.figure()
plt.bar(indexes, counts[idx], width, align='center')
plt.xticks(indexes - .5 * width, kept_labels, rotation=70)
plt.show()

# Heatmap of the matrix restricted to the surviving rows and columns.
fig = plt.figure()
_ = sns.heatmap(m[np.ix_(idx, idx)],
                xticklabels=kept_labels,
                yticklabels=kept_labels)
_ = plt.xticks(rotation=70)

In [None]:
# Rank labels by their strongest correlation with any *other* label.
# The diagonal of a correlation matrix is each label's correlation with
# itself (about 1.0), so it has to be masked out first; otherwise every
# row has the same maximum and the ordering is floating-point noise.
off_diagonal = np.array(m, dtype=float)  # copy, so `m` stays untouched
np.fill_diagonal(off_diagonal, np.nan)

# Rows that are entirely NaN (zero-variance labels) yield NaN here, with
# a RuntimeWarning from nanmax, and argsort places them last.
idx = np.argsort(np.nanmax(off_diagonal, axis=1))

# The 15 labels whose best match among the other labels is weakest.
labels[idx[:15]]