In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from matplotlib.colors import hex2color
import matplotlib.patches as mpatches
from mpl_toolkits.mplot3d import Axes3D

import mpld3
from mpld3 import plugins
mpld3.disable_notebook()

# Define some CSS to control our custom labels
css = """th {color: #000000; background-color: #ffffff;} td {color: #000000; background-color: #ffffff;}"""
from IPython.display import display, HTML

import numpy as np
import pandas as pd
from time import time

from sklearn import manifold
from sklearn.utils import check_random_state
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import spectral_clustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS

from sklearn.datasets import load_digits

from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
import sklearn.preprocessing as preprocessing

from sklearn.decomposition import *
from sklearn import manifold

import scipy.spatial.distance as dist
from scipy import stats
from scipy.sparse import csr_matrix

# Shared min-max scaler targeting the range [-1, 1]; project() re-fits it
# (fit_transform) on every call when rescale_dims is True.
rescale = preprocessing.MinMaxScaler(feature_range=(-1,1))

In [None]:
def plot_meps(data, title):
    """Show a 2-D scatter plot of MEPs with a fixed political-group legend.

    Parameters
    ----------
    data : object exposing ``x`` and ``y`` attributes and a ``'colour'`` item
        (presumably a pandas DataFrame -- confirm against callers).
    title : str
        Text appended to the figure title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Legend entries as (hex colour, group label), in display order.
    group_colours = (
        ('#80BFFF', 'EPP'),
        ('#FF0000', 'S&D'),
        ('#FFFF00', 'ALDE'),
        ('#0000FF', 'ECR'),
        ('#009900', 'G/EFA'),
        ('#FF9900', 'IND_DEM'),
        ('#990000', 'EUL/NGL'),
        ('#DDDDDD', 'NI'),
    )

    fig, ax = plt.subplots(figsize=(9, 9))
    # `points` is only consumed by the disabled tooltip code kept below.
    points = ax.scatter(data.x, data.y, c=data['colour'], s=40, lw=0.05)

    heading = str(len(data.x)) + " MEPs in 'Vote Space', using " + title
    ax.set_title(heading, size=16)

    legend_handles = [
        mpatches.Patch(color=hex_code, label=group_name)
        for hex_code, group_name in group_colours
    ]
    ax.legend(framealpha=0, bbox_to_anchor=(0, 1), loc=2, handles=legend_handles)

    plt.axis('tight')
    #plt.savefig(title+'.pdf', format='pdf', bbox_inches='tight')
    plt.show()

    # Disabled mpld3 tooltip export, kept for reference:
    #tooltips = []
    #for r in data.iterrows():
    #    label = data[['group','country']].ix[[r[0]], :].T
    #    #print(type(r), len(r), r)
    #    label.columns = ['{0}'.format(r[1][3])]
    #    #print(label)
    #    # .to_html() is unicode; so make leading 'u' go away with str()
    #    tooltips.append(str(label.to_html()))

    #tooltip = plugins.PointHTMLTooltip(points, tooltips, voffset=10, hoffset=10, css=css)
    #plugins.connect(fig, tooltip)

    #return mpld3.fig_to_html(fig, template_type='general')

In [None]:
def plot_meps3d(data, title):
    """Show a 3-D scatter plot of MEPs with a fixed political-group legend.

    Parameters
    ----------
    data : mapping-like with ``'x'``, ``'y'``, ``'z'`` and ``'colour'`` items
        (presumably a pandas DataFrame -- confirm against callers).
    title : str
        NOTE(review): currently unused; both titles are hard-coded to "nmf".

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Legend entries as (hex colour, group label), in display order.
    group_colours = (
        ('#FFFF00', 'ALDE'),
        ('#0000FF', 'UEN'),
        ('#FF9900', 'IND_DEM'),
        ('#80BFFF', 'EPP-ED'),
        ('#990000', 'EUL_NGL'),
        ('#009900', 'G_EFA'),
        ('#FF0000', 'SOC(S_D)'),
        ('#DDDDDD', 'NI'),
    )

    fig = plt.figure(figsize=(9, 9))
    ax = fig.add_subplot(111, projection='3d')
    plt.suptitle("%i MEPs in '3d Space', with nmf" % (len(data)), fontsize=14)

    # Materialise the colours as a plain list before handing them to scatter.
    colours = list(np.array(data['colour']))
    ax.scatter(data['x'], data['y'], data['z'], c=colours, marker='o', s=20)

    plt.title("3d NMF ")
    plt.axis('tight')

    legend_handles = [
        mpatches.Patch(color=hex_code, label=group_name)
        for hex_code, group_name in group_colours
    ]
    ax.legend(bbox_to_anchor=(1.2, 1.00), handles=legend_handles)
    #plt.savefig(term+'-3d-nmf.png', format='png')
    plt.show()

In [None]:
def load_matrix(f):
    """Load a CSR sparse matrix from an ``.npz`` archive.

    Parameters
    ----------
    f : str, path or file-like
        Anything ``np.load`` accepts. The archive must contain the arrays
        ``data``, ``indices``, ``indptr`` and ``shape``.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    # np.load returns an NpzFile holding an open file handle; the context
    # manager closes it once the arrays have been read into memory.
    with np.load(f) as loader:
        return csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=tuple(loader['shape']),
        )

In [None]:
def between_within_dispersion(X, labels):
    """Per-group within- and between-group sums of squares.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample coordinates. Coerced to a float ndarray so that DataFrame or
        list input yields the same scalar sums as ndarray input.
    labels : array-like of length n_samples
        Group label of every sample.

    Returns
    -------
    pandas.DataFrame
        Columns ``group``, ``meps`` (group size), ``wgss`` (sum of squared
        distances of members to their group mean) and ``bgss`` (group size
        times squared distance of the group mean to the global mean), both
        rounded to 2 decimals. Group rows are sorted by size, largest first,
        followed by a final ``"total"`` row.
    """
    X = np.asarray(X, dtype=float)
    # Sorted unique labels plus an integer code per sample.
    classes, codes = np.unique(np.asarray(labels).ravel(), return_inverse=True)
    n_labels = len(classes)

    mean = np.mean(X, axis=0)
    extra_disp, intra_disp = 0., 0.

    rows = []
    for k in range(n_labels):
        cluster_k = X[codes == k]
        mean_k = np.mean(cluster_k, axis=0)
        bgss = len(cluster_k) * np.sum((mean_k - mean) ** 2)
        wgss = np.sum((cluster_k - mean_k) ** 2)
        extra_disp += bgss
        intra_disp += wgss
        rows.append((classes[k], len(cluster_k), np.round(wgss, 2), np.round(bgss, 2)))

    # Build the frame once instead of growing it row by row.
    results = pd.DataFrame(rows, columns=["group", "meps", "wgss", "bgss"])
    results["meps"] = results["meps"].astype(int)
    results = results.sort_values(by="meps", ascending=False)
    # The total is appended after sorting so it always stays the last row.
    results.loc[n_labels + 1] = ("total", X.shape[0], np.round(intra_disp, 2), np.round(extra_disp, 2))

    return results

In [None]:
def silhouette_score(data, labels):
    """Return the mean euclidean silhouette coefficient of ``data`` grouped by ``labels``.

    Both arguments are converted to numpy arrays; ``sample_size`` is set to the
    number of labels.
    """
    points = np.array(data)
    groups = np.array(labels)
    return metrics.silhouette_score(
        points,
        groups,
        metric='euclidean',
        sample_size=len(groups),
    )

In [None]:
def silhouette_scores(data, labels):
    """Tabulate the average euclidean silhouette score per group and overall.

    Returns a DataFrame with columns ``group``, ``meps`` (number of samples)
    and ``score`` (rounded to 2 decimals), including a ``"total"`` row, sorted
    by ``meps`` in descending order.
    """
    points = np.array(data)
    groups = np.array(labels)

    overall = metrics.silhouette_score(points, groups, metric='euclidean', sample_size=len(groups))
    per_sample = metrics.silhouette_samples(points, groups, metric='euclidean')

    distinct_groups = set(groups)
    table = pd.DataFrame(columns=["group","meps","score"])
    for row_id, group_name in enumerate(distinct_groups):
        member_scores = per_sample[groups == group_name]
        group_mean = np.round(np.average(member_scores), 2)
        table.loc[row_id] = (group_name, len(member_scores), group_mean)

    # Summary row over all samples.
    table.loc[len(distinct_groups) + 1] = ("total", len(points), np.round(overall, 2))
    table["meps"] = table["meps"].astype(int)
    return table.sort_values(by="meps", ascending=False)

In [None]:
def project(method, vec_data, dim=2, norm=False, rescale_dims=True, stable=True, model=None):
    """Project ``vec_data`` to ``dim`` dimensions with the chosen method.

    Parameters
    ----------
    method : str
        One of ``"PCA"``, ``"MDS"``, ``"NMF"``, ``"TSNE"``, ``"TSVD"``. Only
        used for the progress print when ``model`` is given.
    vec_data : array-like
        Input vectors, one row per sample.
    dim : int
        Number of output dimensions.
    norm : bool
        L2-normalise every row before projecting.
    rescale_dims : bool
        Min-max rescale the projected data with the module-level ``rescale``.
    stable : bool
        Use deterministic initialisation / fixed seeds for NMF and t-SNE.
    model : estimator or None
        Optional pre-built object exposing ``fit_transform``; overrides
        ``method``.

    Returns
    -------
    The transformed data as returned by ``fit_transform`` (rescaled when
    ``rescale_dims`` is True).

    Raises
    ------
    KeyError
        If ``model`` is None and ``method`` is not a known name.
    """
    if norm:
        vec_data = preprocessing.normalize(vec_data, norm='l2')

    if model is None:
        # Estimators are built lazily: only the requested one is constructed,
        # so a constructor problem in an unused method cannot break this call.
        def make_nmf():
            if stable:
                return NMF(init='nndsvd', n_components=dim, random_state=1, alpha=.1, l1_ratio=.5)
            return NMF(init='random', n_components=dim, alpha=.1, l1_ratio=.5)

        def make_tsne():
            if stable:
                return manifold.TSNE(n_components=dim, init='pca', random_state=42, n_iter=1000,
                                     perplexity=60, metric='euclidean', learning_rate=200)
            return manifold.TSNE(n_components=dim, n_iter=1000, perplexity=30,
                                 metric='euclidean', learning_rate=200)

        builders = {
            "PCA": lambda: PCA(n_components=dim),
            "MDS": lambda: manifold.MDS(n_components=dim),
            "NMF": make_nmf,
            "TSNE": make_tsne,
            "TSVD": lambda: TruncatedSVD(n_components=dim),
        }
        model = builders[method]()

    t_data = model.fit_transform(vec_data)

    if rescale_dims:
        t_data = rescale.fit_transform(t_data)

    print(method, t_data.shape)
    return t_data

In [None]:
def eval_show(vec_data, show=False):
    """Print the silhouette score of ``vec_data`` and return its dispersion table.

    Labels come from the module-level ``party_l``. With ``show=True`` the full
    table and then its ``"total"`` row are displayed as well.
    """
    print("S:", silhouette_score(vec_data, party_l))
    dispersion = between_within_dispersion(vec_data, party_l)
    if not show:
        return dispersion

    display(dispersion)
    display(dispersion[dispersion['group']=="total"])
    return dispersion

In [None]:
# Load Data:
term = 'term7'
variant = 'topic' # Yes, No, abstain 1-hot encoded, topic
vote_encoding = 'count'

# Load MEP Data: one CSV per term; keep the first row per MEP ident and
# drop non-attached ('NI') members.
data = pd.read_csv('../data.processed/meps_' + term + '.csv')
data = data[~data.duplicated(subset=['ident'])]
data = data[data['grp'] != 'NI'] # drop Non attached

# Load Vectors:
votes_dir = '../data.processed/' + term + variant + '-votes/'
if vote_encoding == 'sgns':
    vocab_file = votes_dir + 'sgns.meps'
    vector_file = votes_dir + 'sgns.npy'
    vec_data = np.load(vector_file).astype(np.float64)
elif vote_encoding == 'count':
    vocab_file = votes_dir + 'count.meps'
    vector_file = votes_dir + 'count.csr.npz'
    # Transposed after loading -- presumably so that rows line up with the
    # MEP idents listed in vocab_file (confirm against the data export).
    vec_data = load_matrix(vector_file).astype(np.float64).T
    vec_data = vec_data.toarray()
else:
    # Fail here rather than with a NameError on vocab_file further down.
    raise ValueError("Unknown vote_encoding: %r (expected 'sgns' or 'count')" % (vote_encoding,))

party_c = list() # colours
party_l = list() # labels

# One dict lookup per MEP instead of repeated full-frame scans.
mep_meta = data.set_index('ident')[['grp', 'name', 'col', 'coord1D', 'coord2D']].to_dict('index')
eval_meps = set(data['ident'])

rmids = list()      # row positions in vec_data without usable MEP metadata
wnom_rows = list()
with open(vocab_file) as f:
    for i, line in enumerate(f):
        mepident = int(line.strip())
        if mepident not in eval_meps:
            rmids.append(i)
            continue
        # 'NI' members were already filtered out of `data`, so every MEP that
        # reaches this point belongs to a political group.
        meta = mep_meta[mepident]
        party_l.append(meta['grp'])
        party_c.append(meta['col'])
        # The second coordinate is negated, as in the original notebook.
        wnom_rows.append((mepident, meta['grp'], meta['name'], meta['col'],
                          meta['coord1D'], -meta['coord2D']))

# Index starts at 1, matching the original row numbering.
wnom_data = pd.DataFrame(wnom_rows,
                         columns=['ident','group','name','colour','x','y'],
                         index=np.arange(1, len(wnom_rows) + 1))

print ("Deleted", len(set(rmids)), "meps: NI or not in w-nominate")
e_vec_data = np.delete(vec_data, rmids, axis=0)
assert e_vec_data.shape[0] == len(party_l), "vector rows and MEP labels are out of sync"

print ("Loaded", vote_encoding, vec_data.shape, "vectors")
print ('Votes:', vote_encoding, e_vec_data.shape, len(party_l), ' MEPs', len(set(party_l)), ' groups')

In [None]:
# Pair the stored x / y columns of wnom_data into an (n, 2) array.
wnominate_vec = np.stack([wnom_data[axis_name] for axis_name in ('x', 'y')], axis=-1)

In [None]:
#count
# Default (2-D, stable, rescaled) projections of the loaded vectors.
pca_cvec, nmf_cvec, tsne_cvec = [
    project(method=method_name, vec_data=e_vec_data)
    for method_name in ("PCA", "NMF", "TSNE")
]

In [None]:
#sgns
# NOTE(review): this projects the same e_vec_data as the count cell. The
# label suggests e_vec_data is expected to hold SGNS vectors at this point
# (i.e. the load cell re-run with vote_encoding = 'sgns') -- confirm.
pca_svec, tsne_svec = [
    project(method=method_name, vec_data=e_vec_data)
    for method_name in ("PCA", "TSNE")
]

In [None]:
# Evaluate and plot every 2-D projection.
# The fifth entry used the undefined name `tsnec_cvec`; it is paired with the
# "sgns" label, so tsne_svec is used here (assumption -- confirm).
vecsshow = [wnominate_vec, pca_cvec, nmf_cvec, tsne_cvec, tsne_svec]
vecsnames = ["wnominate", "pca", "nmf", "tsne", "sgns"]

for name, vec in zip(vecsnames, vecsshow):
    eval_show(vec, show=True)
    # plot_meps reads data.x, data.y and data['colour'], so wrap the array in
    # a DataFrame (string-indexing the ndarray directly raises).
    plot_data = pd.DataFrame({'x': vec[:, 0], 'y': vec[:, 1], 'colour': party_c})
    plot_meps(plot_data, term + " " + name + " " + variant) # 7th

In [None]:
# wgss = within group 
# bgss = between group

show_s, hide_s = "bgss", "wgss"
#show_s, hide_s = "wgss", "bgss"

def _dispersion_column(vec, prefix, drop_cols):
    """Evaluate `vec`, drop `drop_cols` and prefix the shown statistic's column name."""
    return eval_show(vec).drop(drop_cols, axis=1).rename(columns={show_s: prefix + ' ' + show_s})

# The first table keeps 'group' and 'meps' as row descriptors; the others
# contribute only their statistic column.
stat_only = ['meps', 'group', hide_s]
wnominate_displ = _dispersion_column(wnominate_vec, 'wnom', [hide_s])
pca_displ = _dispersion_column(pca_cvec, 'pca', stat_only)
nmf_displ = _dispersion_column(nmf_cvec, 'nmf', stat_only)
# Was `tsnec_cvec`, a name that is never defined in this notebook.
tsne_displ = _dispersion_column(tsne_cvec, 'tsne', stat_only)
sgns_displ = _dispersion_column(tsne_svec, 'sgns', stat_only)

res = pd.concat([wnominate_displ, pca_displ, nmf_displ, tsne_displ, sgns_displ], axis=1)
display(res)
#print(res.to_latex())

In [None]:
print (term, variant)

# scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score;
# use whichever this installation provides (the bare name was never imported).
ch_score = getattr(metrics, "calinski_harabasz_score", None) or metrics.calinski_harabaz_score

score_rows = []
for i in range(1,6): # dimensions
    for l in range(0,5): # initializations
        print ("Dimension: ", i , l)
        # Barnes-Hut t-SNE (the default) only supports fewer than 4 output
        # dimensions, so switch to the exact method for higher ones.
        tsne_model = None
        if i >= 4:
            tsne_model = manifold.TSNE(n_components=i, n_iter=1000, perplexity=30,
                                       metric='euclidean', learning_rate=200, method='exact')
        # Sweep-local names: the 2-D pca_cvec / nmf_cvec / tsne_cvec used by
        # other cells must not be overwritten by these projections.
        pca_dvec = project(method="PCA", vec_data=e_vec_data, dim=i, stable=False)
        nmf_dvec = project(method="NMF", vec_data=e_vec_data, dim=i, stable=False)
        tsne_dvec = project(method="TSNE", vec_data=e_vec_data, dim=i, stable=False, model=tsne_model)
        nmfscore = ch_score(nmf_dvec, party_l)
        pcascore = ch_score(pca_dvec, party_l)
        tsnescore = ch_score(tsne_dvec, party_l)
        score_rows.append((i, l, "PCA", pcascore))
        score_rows.append((i, l, "NMF", nmfscore))
        score_rows.append((i, l, "TSNE", tsnescore))
        print (i, l, pcascore, nmfscore, tsnescore)

# Build the frame once instead of growing it row by row.
results = pd.DataFrame(score_rows, columns=["dimension","run","method","score"])
results.to_csv("dimensions"+term+variant+".csv")
results

In [None]:
#wnominate_vec, pca_cvec, nmf_cvec, tsne_cvec

fig = plt.figure(figsize=(15, 7))
plt.suptitle("Term 7",fontsize=14)

pt_lw=0.05
pt_size=30

# One entry per panel: (log label, panel title, projection, orientation).
# The orientation picks and flips the two plotted columns of each projection.
# The tSNE panel used the undefined name `tsnec_cvec`; tsne_cvec is used instead.
panels = [
    ("w-nominate", "wnominate", wnominate_vec, lambda Y: ( Y[:, 0],  Y[:, 1])),
    ("PCA",        "PCA",       pca_cvec,      lambda Y: ( Y[:, 1], -Y[:, 0])),
    ("NMF",        "NMF",       nmf_cvec,      lambda Y: (-Y[:, 1],  Y[:, 0])),
    ("tSNE",       "tSNE",      tsne_cvec,     lambda Y: (-Y[:, 1], -Y[:, 0])),
    ("tSNE(SGNS)", "sgns tsne", tsne_svec,     lambda Y: (-Y[:, 1],  Y[:, 0])),
]

for position, (log_label, panel_title, Y, orient) in enumerate(panels, start=1):
    print(log_label)
    ax = fig.add_subplot(2, 5, position)
    xs, ys = orient(Y)
    ax.scatter(xs, ys, c=party_c, s=pt_size, lw=pt_lw)
    ax.set_title(panel_title)
    # Hide tick labels on both axes.
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    ax.axis('tight')

#plt.savefig('/home/igor/git/parl2vec/reports/aics/figures/term7.pdf', format='pdf', bbox_inches='tight')
plt.show()