In [96]:
import os
import json
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Parsing the Whole Folder

In [97]:
# Reading the whole folder

entities = []

# The index of these four arrays corresponds to one organization

orgs = []
texts = []
years = []
occurences = []

# The template counts the occurencies for each organization

years_template = {
    2011: 0,
    2012: 0,
    2013: 0,
    2014: 0,
    2015: 0,
    2016: 0,
    2017: 0,
    2018: 0,
    2019: 0,
    2020: 0
}

dir = 'data/biomass/'
files = os.listdir(dir)

for index, filename in enumerate(files):

    # Read one file
    
    f = open(dir + filename)
    r = f.read()
    data = json.loads(r)
    
    
    # Set basic metadata

    try:
        year = int(data[0]['publish_date'].split(' ')[0].split('-')[0]) # Set year
        records = data[0]['story_tags'] # Set tags
        # if len(tags) > 1000: continue # Limit the maxiumum number of tags
        
        clear_output(wait=True)
        print(index, '/', len(files), end=' ')
    
    except:
        continue


    # Collect entities
    
    tags = []

    for record in records:
        # If the record is an organization
        if (record['tag_set'] == 'cliff_organizations'):
            tag = record['tag']
            tag = tag.replace('.', '')
            # If the tag starts with lowercase
            if tag[0].islower():
                continue
            tags.append(tag)
    
    
    # Collect entities by name

    for t in tags:

        related = tags.copy()
        related.remove(t)    
    
        if t not in orgs:

            orgs.append(t)
            i = orgs.index(t)

            texts.append(related)

            occurences.append(1)
            
            years.append(years_template.copy())
            years[i][year] += 1
                
        else:

            i = orgs.index(t)

            # print(texts[i])
            texts[i] += related
            # print(texts[i])

            occurences[i] += 1
            
            years[i][year] += 1

    # if index > 10:
    #     raise SystemExit("Stop right there!")


26449 / 26450 

In [98]:
# tests

n = 100

orgs[n]
# texts[n]
# len(texts[n])
# years[n]
occurences[n]


67

# Cleaning

In [99]:

# Remove less citates organizations

for index, occurrence in reversed(list(enumerate(occurences))):

    min = 50

    if occurrence < min:
        orgs.pop(index)
        texts.pop(index)
        years.pop(index)
        occurences.pop(index)
    # else:
        # print()
        # print(index, occurrence, orgs[index])
        # print()

# order years by key in an array of tuples

for index, y in enumerate(years):
    sortedDict = dict( sorted(y.items(), key=lambda x: x[0]) )
    _temp = {}
    for k,v in sortedDict.items():
        _temp[k] = v
    years[index] = _temp

print(len(orgs), len(texts), len(years), len(occurences))

# print(occurences[0])
# print(years[0][:, 0])

257 257 257 257


In [100]:
# Total linear regression

import numpy as np
from sklearn.linear_model import LinearRegression

total_years = {}

for year in years:
    # print(year)
    for k,v in year.items():
        # print(k, v)
        if k in total_years:
            total_years[k] += v
        else:
            total_years[k] = v

y = list(year.values())
x = list(year.keys())
x = np.array(x).reshape((-1, 1))

model = LinearRegression().fit(x, y)
slope = model.coef_
total_slope = slope[0]
# score = model.score(x, y)

total_years, total_slope


({2011: 1197,
  2012: 1135,
  2013: 1945,
  2014: 2613,
  2015: 3265,
  2016: 4031,
  2017: 2663,
  2018: 5059,
  2019: 6712,
  2020: 7735},
 -0.024242424242424242)

In [101]:
# Linear regression

import matplotlib.colors

slopes = []
colors = []

_min = 0
_max = 0


# Slope

for year in years:

    y = list(year.values())
    x = list(year.keys())
    x = np.array(x).reshape((-1, 1))
    
    model = LinearRegression().fit(x, y)
    slope = model.coef_
    slope = slope[0] - total_slope
    # slope = slope[0]
    score = model.score(x, y)
    slopes.append(slope)

    if slope > _max: _max = slope
    if slope < _min: _min = slope

    # print()
    # print(list(year.keys()), y)
    # print('slope', slope, 'score', score)

print('min', _min, 'max', _max)

# Colors

cmap = plt.cm.RdYlBu_r
cmap = plt.cm.coolwarm
# norm = matplotlib.colors.Normalize(vmin=_min, vmax=_max)
norm = matplotlib.colors.DivergingNorm(vmin=_min, vcenter=0, vmax=_max)
# norm = matplotlib.colors.DivergingNorm(vmin=-10, vcenter=0, vmax=10)

for slope in slopes:
    color = cmap(norm(slope))
    colors.append(color)


min -1.739393939393939 max 26.933333333333323


The DivergingNorm class was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use TwoSlopeNorm instead.
  norm = matplotlib.colors.DivergingNorm(vmin=_min, vcenter=0, vmax=_max)


In [102]:
# Term Frequency Matrix

import textacy

doc_term_matrix, dictionary = textacy.representations.build_doc_term_matrix(texts, tf_type="linear", idf_type="smooth")

In [112]:
# UMAP

import umap
from pointgrid import align_points_to_grid

reducer = umap.UMAP(n_components=2, n_neighbors=3, min_dist=0.001, metric='cosine')
# reducer = umap.UMAP(random_state=2, n_components=2, n_neighbors=2, min_dist=0.01, metric='hellinger')

embedding = reducer.fit_transform(doc_term_matrix)
embedding = align_points_to_grid(embedding)
x = embedding[:, 0]; y = embedding[:, 1]

 * creating mesh with size 51 51
 * filling mesh


In [113]:
# Clustering on embedding

import hdbscan

# clusterer = hdbscan.HDBSCAN(min_cluster_size=4, min_samples=3, cluster_selection_epsilon=.5)
clusterer = hdbscan.HDBSCAN(min_cluster_size=3, min_samples=2)
# clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon=0.3, cluster_selection_method='leaf')
# min_samples is to consier all the elements that owtherwide will be classified as noise
# cluster_selection_epsilon extends clusters
clusterer.fit(embedding)
clusters = clusterer.labels_

# Grouping by cluster

values = set(clusters)
if -1 in values: values.remove(-1)

clusters = [[index for index, cluster in enumerate(clusters) if cluster==value] for value in values]

len(clusters)

11

In [114]:
# Plot

import math
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from scipy import interpolate




# Frame

plt.figure(figsize=(20,20), dpi=300)
plt.axis('off')


# Hulls

for cluster in clusters:

    # Average color
    
    background_color = []
    
    for i, index in enumerate(cluster):
        for occurence in range(occurences[index]):
            background_color.append([colors[index][0], colors[index][1], colors[index][2]])

    r = [i[0] for i in background_color]; r = sum(r) / len(r)
    g = [i[1] for i in background_color]; g = sum(g) / len(g)
    b = [i[2] for i in background_color]; b = sum(b) / len(b)

    background_color = (r, g, b, 1)

    # Hull

    points = []
    for index in cluster:
        points.append([embedding[index][0], embedding[index][1]])
    points = np.array(points)

    # print(points)

    hull = ConvexHull(points)
    
    x_hull = np.append(points[hull.vertices,0], points[hull.vertices,0][0]) # Collect the xs + first x
    y_hull = np.append(points[hull.vertices,1], points[hull.vertices,1][0])

    # print(x_hull)

    # break
    
    # interpolate
    dist = np.sqrt((x_hull[:-1] - x_hull[1:])**2 + (y_hull[:-1] - y_hull[1:])**2)
    dist_along = np.concatenate(([0], dist.cumsum()))
    spline, u = interpolate.splprep([x_hull, y_hull], u=dist_along, s=0)
    interp_d = np.linspace(dist_along[0], dist_along[-1], 50)
    interp_x, interp_y = interpolate.splev(interp_d, spline)
    
    # plot shape
    plt.fill(interp_x, interp_y, '--', c=background_color, alpha=.2)


# Scatterplot

# plt.scatter(x, y, s=occurences, c=colors)
plt.scatter(x, y, s=40, c=colors)


# Labels

for i, txt in enumerate(orgs):
    # text = plt.annotate(orgs[i], xy=(x[i], y[i] - math.sqrt(occurences[i]/math.pi)/40), ha='center', va='bottom')
    text = plt.annotate(orgs[i], xy=(x[i], y[i]), ha='center', va='bottom')
    text.set_fontsize(2)

plt.savefig('/Users/dario/Desktop/download.png')
plt.savefig('download.png')

    



In [107]:
# Contours

from random import random


def f(x, y):
    return np.sin(x) ** 10 + np.cos(10 + y * x) * np.cos(x)

X, Y = np.meshgrid(x, y)

Z = np.zeros((len(X),len(Y)))


for indexX, elX in enumerate(X):
    for indexY, elY in enumerate(Y):
        # continue
        # print(random() * 1000)
        Z[indexX][indexY] = random() * 100
Z

# The proble is that I want to use "slopes" values which are monodimensional and not bidimensional

plt.contour(X, Y, Z, colors='black', linewidths=.1)



<matplotlib.contour.QuadContourSet at 0x7f80746be2e0>

In [108]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="darkgrid")
iris = sns.load_dataset("iris")

# Set up the figure
f, ax = plt.subplots(figsize=(8, 8))
ax.set_aspect("equal")

# Draw a contour plot to represent each bivariate density
sns.kdeplot(
    # data=iris.query("species != 'versicolor'"),
    x=x,
    y=y,
    # hue="species",
    thresh=.1,
)

<AxesSubplot:>