# Network Analysis on the Embeddings of Sentences Generated by GEM
The analyses in this notebook use the results generated by the analyses on SPARK in the notebook sent-subsent-embs-graph-analysis-SPARK.ipynb


In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

%matplotlib inline

## Set a Timer

In [None]:
# For timing
from contextlib import contextmanager
from timeit import default_timer
import time 

@contextmanager
def elapsed_timer():
    """Context manager yielding a zero-argument callable that reports elapsed seconds.

    While the ``with`` body is running, the callable returns the time since
    entry. Once the body exits -- normally or through an exception -- the
    reading is frozen at the body's total duration.
    """
    start = default_timer()
    # The yielded lambda looks up `elapser` on every call, so rebinding it
    # below switches the callable from a live reading to a frozen one.
    elapser = lambda: default_timer() - start
    try:
        yield lambda: elapser()
    finally:
        # Freeze the reading even when the body raised; without the finally
        # the callable would keep counting after a failed block.
        end = default_timer()
        elapser = lambda: end - start

# Load in the Embeddings of the Sentences Generated by GEM

In [None]:
import pickle

In [None]:
import torch
embs = torch.load('NYT_train_gem_embs.pt')

In [None]:
embs.shape

# Compute the Pair-Wise Distance

In [None]:
from scipy.spatial.distance import pdist, squareform

In [None]:
# Test: time pdist on a handful of rows before committing to the full run.
embs_sample = embs[:5]
with elapsed_timer() as elapsed:
    embs_pdist = pdist(embs_sample)
    duration = '%.1f' % elapsed()
# Report the number of points actually timed (the full dataset size was printed before).
print(" Computing the distances between {} points takes {} seconds".format(embs_sample.shape[0], duration))

In [None]:
with elapsed_timer() as elapsed:
    embs_pdist = pdist(embs)
    duration = '%.1f' % elapsed()
print(" Computing the distances between {} points takes {} seconds".format(embs.shape[0], duration))  

In [None]:
#np.save('NYT_train_gem_embs_pdist.np', embs_pdist)

In [None]:
###############################

# Compute Histogram of the pdist

In [None]:
#embs_pdist = np.load('NYT_train_gem_embs_pdist.np.npy')

In [None]:
embs_pdist.shape

In [None]:
np.histogram(embs_pdist, bins=20, density=False)

In [None]:
count = [    253866,   99766893, 2238693005, 3009138784,  758713639,
          96808108,   19175032,    3387572,    1163346,     663750,
            237940,       3338,        208,       8185,     323331,
              3185,         45,         16,          0,          2]
bin = [  0.        ,  11.648274  ,  23.296548  ,  34.944822  ,
         46.593096  ,  58.24137   ,  69.889644  ,  81.537918  ,
         93.18619199, 104.83446599, 116.48273999, 128.13101399,
        139.77928799, 151.42756199, 163.07583599, 174.72410999,
        186.37238399, 198.02065799, 209.66893199, 221.31720599,
        232.96547999]

In [None]:
import math

# Integer bar positions: left edge plus the floor-divided bin width, floored again.
midpoints = [
    math.floor((right - left) // 2 + left)
    for left, right in zip(bin, bin[1:])
]

In [None]:
midpoints

In [None]:
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(midpoints, count, width=10)
plt.show()

In [None]:
plt.figure(figsize=(15,9))
plt.bar(midpoints, count, width=10)
plt.xticks(midpoints)
plt.xlabel('Euclidean Distance')
plt.ylabel('Counts (10^10)')
plt.savefig("GEM_embs_pdist_hist.pdf")

In [None]:
hist_df = pd.DataFrame({'count':count, 'bin_right':bin[1:]})
hist_df.head()

In [None]:
hist_df.to_csv('GEM_embs_pdist_hist.csv', index=False)

# Compute Mean, Max, Min

In [None]:
embs_pdist.mean()

In [None]:
embs_pdist.max()

In [None]:
embs_pdist.min()

# Compute a Graph by dist < mean

In [None]:
m = 111610
(m * m) /2 - m /2 # 6228340245

In [None]:
i = 111608
j = 111609
m * i + j - ((i+2)*(i+1))//2

In [None]:
embs_pdist[m * i + j - ((i+2)*(i+1))//2]

In [None]:
i = 0
j = 0

In [None]:
embs_pdist_test = embs_pdist[20:50]

In [None]:
np.where(embs_pdist_test < 39)

In [None]:
np.triu_indices(4, 1)

In [None]:
np.triu_indices(4)

In [None]:
# from upper triangular index back to matrix indices
from math import floor, sqrt, ceil
def coor_to_idx(n, i, j):
    """Flat row-major index of (i, j) in the upper triangle, diagonal included, of an n x n matrix."""
    # Number of stored entries that precede row i.
    row_start = i * (2 * n - i + 1) // 2
    return row_start + j - i
def idx_to_coor(n, k):
    """Inverse of ``coor_to_idx``: flat upper-triangle index k (diagonal included) -> (row, col)."""
    discriminant = (2 * n + 1) * (2 * n + 1) - 8 * k
    # Root of the row-start quadratic, floored to the row that contains k.
    row = floor((-sqrt(discriminant) + 2 * n + 1) / 2)
    col = k + row - row * (2 * n - row + 1) // 2
    return row, col

def coor_to_idx_shift(n, i, j):
    """Index of the pair (i, j), i < j, in the diagonal-free upper triangle of an n x n matrix stored row by row."""
    # Entries on or below the diagonal in rows 0..i are not stored.
    skipped = ((i + 1) * (i + 2)) // 2
    return n * i + j - skipped
    
def idx_to_coor_shift(n, k):
    """Map a condensed (diagonal-free upper triangle) index back to its (i, j) pair.

    Inverse of ``coor_to_idx_shift``: ``k`` counts the entries (i, j) with
    i < j of an ``n x n`` matrix, row by row.

    Computed in closed form instead of scanning candidate positions, so the
    cost per call no longer depends on ``n``.

    Returns ``(-1, -1)`` when ``k`` lies outside ``[0, n*(n-1)/2)``.
    """
    n = int(n)
    k = int(k)
    total = n * (n - 1) // 2
    if k < 0 or k >= total:
        return -1, -1

    # Count from the end: `remaining` entries sit at or after k, and the last
    # `length` rows together hold length*(length+1)/2 entries. The row holding
    # k is the one for the smallest such `length` that covers `remaining`.
    remaining = total - k
    length = int((sqrt(8 * remaining - 7) - 1) // 2) + 1
    # sqrt works in floating point; nudge to the exact integer answer.
    while length * (length + 1) // 2 < remaining:
        length += 1
    while (length - 1) * length // 2 >= remaining:
        length -= 1

    i = n - 1 - length
    row_start = i * (2 * n - i - 1) // 2  # condensed index of (i, i + 1)
    j = k - row_start + i + 1
    return i, j

In [None]:
# embs_pdist_test is the slice embs_pdist[20:50]; shift the hits back to
# positions in the full condensed vector before they are decoded into pairs.
embs_graph_edges = (np.nonzero(embs_pdist_test < 39)[0] + 20,)

In [None]:
def idx_to_coor_vec(k):
    return idx_to_coor_shift(111610, k)

In [None]:
idx_vec = np.vectorize(idx_to_coor_vec)

In [None]:
edges = idx_vec(embs_graph_edges[0])

In [None]:
np.save('GEM_graph_edges_end1.npy', edges[0])

In [None]:
np.save('GEM_graph_edges_end2.npy', edges[1])

In [None]:
np.load('GEM_graph_edges_end2.npy')

In [None]:
idx_vec(embs_graph_edges[0])

In [None]:
idx_vec(embs_graph_edges[0])

In [None]:
idx_vec([6228222014, 6228340244])

In [None]:
embs_graph_edges = np.nonzero(embs_pdist < 23)

In [None]:
embs_graph_edges[0].shape

In [None]:
np.save("GEM_embs_graph_edges_dist_less_23.npy", embs_graph_edges[0])

In [None]:
edges = idx_vec(embs_graph_edges[0])

In [None]:
np.save('GEM_graph_edges_end1.npy', edges[0])

In [None]:
np.save('GEM_graph_edges_end2.npy', edges[1])

# Make Graph Edge List

In [None]:
end1 = np.load('GEM_graph_edges_end1.npy')

In [None]:
end1.shape

In [None]:
end2 = np.load('GEM_graph_edges_end2.npy')

In [None]:
end2.shape

In [None]:
end2.max()

In [None]:
all_edges_df = pd.DataFrame({'end1': end1, 'end2':end2})

In [None]:
all_edges_df.head()

In [None]:
all_edges_df.to_csv("GEM_all_edges.csv", header=False, index=False)

In [None]:
all_edges_df.shape

# networkX and iGraph Analysis

In [None]:
# The file was written with header=False, so name the columns explicitly;
# otherwise pandas consumes the first edge as the header row.
all_edges_df = pd.read_csv("GEM_all_edges.csv", header=None, names=["end1", "end2"])

In [None]:
all_edges_df.shape

In [None]:
# save all edges as nodeId nodeId
all_edges_df.to_csv("GEM_all_edges_space.csv", sep=" ", header=False, index=False)

In [None]:
edges = all_edges_df[:1000]

In [None]:
edges.to_csv('edges_test.list', header=False, index=False)

In [None]:
edges.head()

In [None]:
import networkx as nx

In [None]:
with open("edges_test.list", "rb") as fh:
    G = nx.read_edgelist(fh, delimiter=",")

In [None]:
G.number_of_nodes(), G.number_of_edges()

In [None]:
len(list(nx.connected_components(G)))

In [None]:
plt.figure(figsize=(15,12))
nx.draw(G)

In [None]:
nx.average_clustering(G)

In [None]:
import itertools as it

In [None]:
# Size of the largest connected components
len(max(nx.connected_components(G), key=len))

In [None]:
with open("GEM_all_edges_space.csv", "rb") as fh:
    Gall = nx.read_edgelist(fh, delimiter=" ")

In [None]:
Gall.number_of_nodes(), Gall.number_of_edges()

In [None]:
nx.number_connected_components(Gall)

In [None]:
# Size of the largest connected components
len(max(nx.connected_components(Gall), key=len))

# Degree and Size Analysis

In [None]:
import pandas as pd
degrees = pd.read_csv("degrees/degrees.csv/degrees.csv")

In [None]:
degrees.head()

In [None]:
import numpy as np

In [None]:
degree_cuts = pd.cut(degrees['degree'], bins=24, labels=np.arange(1, 25))

In [None]:
degree_cuts.value_counts()

In [None]:
# NOTE(review): train_df_relgrp is first assigned much further down this
# notebook (in the "Relation Distribution in the LCC" section), so this cell
# raises NameError on a fresh top-to-bottom run.
train_df_relgrp

In [None]:
degrees['degree'].unique()

In [None]:
degrees[degrees.degree > 100]

In [None]:
import seaborn as sns

In [None]:
type(degrees.degree[0])

In [None]:
sns.histplot(degrees, x='degree')

In [None]:
import matplotlib
# 'normal' is not a font family name; matplotlib warns and falls back to its
# default font. Ask for the generic sans-serif family instead.
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 14}

matplotlib.rc('font', **font)

In [None]:
# Create the figure and axes explicitly so the histogram, the labels and the
# saved file all refer to the same axes, whatever the current figure was.
fig, ax = plt.subplots(figsize=(12, 6))
degrees.degree.hist(ax=ax)
ax.grid(False)
ax.set_xlabel("Degree")
ax.set_ylabel("Number of Vertices")
ax.set_yticks(np.arange(0, 90000, 10000))
fig.savefig("figures/GEM_sentence_degrees.pdf")

In [None]:
degrees.groupby('degree').count().sort_values('degree', ascending=False)

In [None]:
degrees.groupby('degree').size().reset_index(name='counts').sort_values('degree', ascending=False)

In [None]:
degrees[degrees.degree==25627]

In [None]:
degrees[degrees.degree >= 31620].shape

In [None]:
bdegrees = [35146, 34882, 34757, 34494, 34281]
id_bdegree = []
for deg in bdegrees:
    id_bdegree.append(degrees[degrees.degree == deg])

In [None]:
pd.concat(id_bdegree)

In [None]:
bdegrees = [35146, 34882, 34757, 34494, 34281]
degrees[degrees.degree.isin(bdegrees)]

# Load the NYT train Data

In [None]:
train_df = pd.read_csv("NYT_train_df_nn.csv", index_col=0)

In [None]:
train_df.columns

In [None]:
for idx in degrees[degrees.degree.isin(bdegrees)].id:
    print(train_df.iloc[idx].sentText, train_df.iloc[idx].relIdx)

In [None]:
train_df['relation'].unique(), train_df['relIdx'].unique()

## Relation Distribution

In [None]:
train_df.groupby('relIdx').size().reset_index(name='count').sort_values('count', ascending=False)

# Connected Components

In [None]:
cc = pd.read_csv('cc/connectedComponents.csv/cc.csv')

In [None]:
cc.head()

In [None]:
cc.shape

In [None]:
ccsizes = cc.groupby('component').size().reset_index(name='counts').sort_values('counts', ascending=False)

In [None]:
ccsizes[1:].hist('counts')

In [None]:
ccsizes.head()

In [None]:
# distplot is deprecated in seaborn; histplot (already used for the degree
# histogram in this notebook) with a density scale and a KDE overlay replaces it.
sns.histplot(ccsizes[1:]['counts'], kde=True, stat="density")

In [None]:
# Largest cc: derive its label from the data instead of pasting the id in.
largest_component = cc['component'].value_counts().idxmax()
lgst_cc_ids = cc[cc.component == largest_component].id

In [None]:
lgst_cc_ids.shape

In [None]:
lgst_cc_ids.to_csv("Larget_Connected_Component_IDs.csv", index=False)

In [None]:
# GEM_all_edges.csv is written with header=False; without header=None the
# first edge would be swallowed as column labels.
all_edges = pd.read_csv('GEM_all_edges.csv', header=None, names=['end1', 'end2'])

In [None]:
all_edges.columns = ['end1', 'end2']
all_edges.head()

In [None]:
all_edges.end2.nunique()

In [None]:
len(set(all_edges.end1).union(set(all_edges.end2)))

In [None]:
all_edges.shape

In [None]:
allcc_edges = all_edges[(all_edges['end1'].isin(lgst_cc_ids))]

In [None]:
allcc_edges.shape

In [None]:
allcc_edges.end1.nunique()

In [None]:
allcc_edges = all_edges[(all_edges['end1'].isin(lgst_cc_ids)) | (all_edges['end2'].isin(lgst_cc_ids))]

In [None]:
allcc_edges.shape

In [None]:
len(set(allcc_edges.end1).union(set(allcc_edges.end2)))

In [None]:
allcc_edges.end2.nunique()

In [None]:
allcc_edges.to_csv("GEM_largest_CC_edges.csv", sep=' ', header=False, index=False)

# Number of Edges in Each CC

In [None]:
cc.component.nunique(), cc.shape

In [None]:
cc_num = cc.component.unique()

In [None]:
cc_num

In [None]:
%%time
#cc_nums = []
#cc_sizes = []
#i = 1
#for n in cc_num:
#    print(i)
#    i = i + 1
#    cc_nums.append(n)
#    cc_ids = cc[cc.component == n].id
#    cc_edges = all_edges[(all_edges['end1'].isin(cc_ids)) | (all_edges['end2'].isin(cc_ids))]
#    cc_sizes.append(cc_edges.shape[0])

In [None]:
len(cc_nums),len(cc_sizes)

In [None]:
cc_num_sizes_df = pd.DataFrame({'cc':cc_num, 'size':cc_sizes})

In [None]:
cc_num_sizes_df.shape

In [None]:
cc_num_sizes_df.to_csv('GEM_graph_cc_sizes.csv', index=False)

In [None]:
cc_size_counts = cc_num_sizes_df.groupby('size').size().reset_index(name='counts').sort_values('counts', ascending=False)

In [None]:
cc_size_counts.shape

In [None]:
plt.bar(cc_size_counts[:-1]['size'], cc_size_counts[:-1].counts, width=20)

# Largest Connected Component Degrees

In [None]:
lccdegrees = pd.read_csv("largest_cc_degrees.csv")

In [None]:
lccdegrees.head()

In [None]:
lccdegrees.groupby('degree').size().reset_index(name='counts').sort_values('degree', ascending=False)

In [None]:
lccdegrees[lccdegrees.degree > 10000]

# Page Rank Analysis

In [None]:
ranks = pd.read_csv('largest_cc_vertices_pageRank.csv')

In [None]:
ranks.shape

In [None]:
ranks.head()

In [None]:
ranks.pagerank.min(), ranks.pagerank.max(), ranks.pagerank.mean()

In [None]:
cuts = pd.cut(ranks.pagerank, 20)

In [None]:
cuts.value_counts()

In [None]:
ranks[ranks.pagerank > 1550]

In [None]:
train_df.iloc[111605]

In [None]:
lccdegrees[lccdegrees.id == 111605]

# Shortest Paths

In [None]:
import json

In [None]:
with open('point1percent.json', 'r') as f:
    p11 = json.loads(f.readline())

In [None]:
for it in p11['distances'].keys():
    print(p11['id'], it, p11['distances'][it])

In [None]:
p11['distances']

In [None]:
p11

In [None]:
files = ['point1percent.json', 'point2percent.json', 'point3percent.json', 'point4percent.json', 'point5percent.json', \
        'point6percent.json', 'point7percent.json', 'point8percent.json', 'point9percent.json', 'point10percent.json']

In [None]:
# Flatten every JSON-lines record into parallel (source, target, distance) lists.
src, tgt, dist = [], [], []
for path in files:
    with open(path, 'r') as handle:
        for raw in handle.readlines():
            record = json.loads(raw)
            for target, hops in record['distances'].items():
                src.append(record['id'])
                tgt.append(target)
                dist.append(hops)

In [None]:
dists = pd.DataFrame({'src':src, 'tgt':tgt, 'dist':dist})

In [None]:
dists.shape

In [None]:
dists = pd.read_csv('shortestpaths_estimates_point1percent.csv')

In [None]:
dists.head()

In [None]:
# Share of rows at each distance; dividing by the actual row count keeps the
# shares summing to 1 whichever file `dists` was loaded from.
dists.groupby('dist').size() / len(dists)

In [None]:
dists.groupby('dist').size()

In [None]:
dists['dist'].mean()

# Largest Degrees in Largest Connected Components

In [None]:
lccdegrees.groupby(['degree', 'id']).size().reset_index(name='counts').sort_values('degree', ascending=False)

In [None]:
lccbdegrees = [35146, 34882, 34757, 34494]

In [None]:
# Rows whose degree is one of the values listed in lccbdegrees.
lccdegrees[lccdegrees.degree.isin(lccbdegrees)]

In [None]:
lccdegrees

In [None]:
allcc_edges.columns

In [None]:
allcc_edges.shape

In [None]:
allcc_edges.head()

In [None]:
allcc_edges[(allcc_edges.end1 == 44557) & (allcc_edges.end2 == 5508)]

In [None]:
allcc_edges[(allcc_edges.end2 == 44557) & (allcc_edges.end1 == 5508)]

In [None]:
edges_44557 = allcc_edges[(allcc_edges.end1 == 44557) | (allcc_edges.end2 == 44557)]

In [None]:
edges_44557.shape

In [None]:
edges_44557_20 = edges_44557.sample(20)

In [None]:
edges_44557_20.end1.unique(), edges_44557_20.end2.unique()

In [None]:
edges_44557_nbs = [44557, 5508,  7998, 44204,  2766,  5427,  4603, 37639,  1724,\
                   109381,  49108,  73918,  82284,  71196,  56459,  68281, \
                   51405,  62678, 110805,  84617,  54303, 106143]

In [None]:
graph_44557 = allcc_edges[(allcc_edges.end1.isin(edges_44557_nbs)) & (allcc_edges.end2.isin(edges_44557_nbs))]

In [None]:
graph_44557.shape

In [None]:
graph_44557 = allcc_edges[(allcc_edges.end2.isin(edges_44557_nbs)) & (allcc_edges.end1.isin(edges_44557_nbs))]

In [None]:
graph_44557.shape

In [None]:
graph_44557.to_csv('graph_44557.csv', header=False, index=False)

In [None]:
with open("graph_44557.csv", "rb") as fh:
    G = nx.read_edgelist(fh, delimiter=",")

In [None]:
#plt.figure(figsize=(15,12))
nx.draw(G, with_labels=True)

In [None]:
train_df.iloc[edges_44557_nbs[4]].relation

In [None]:
import re
pat = r"/.*/"

# Node id (as the string key used by the edge-list graph) -> relation name
# with everything up to its last "/" removed.
vid_vlabel = {
    str(node_id): re.sub(pat, "", train_df.iloc[node_id].relation)
    for node_id in edges_44557_nbs
}

In [None]:
vid_vlabel

In [None]:
Glabel = nx.relabel_nodes(G, vid_vlabel)

In [None]:
Glabel.nodes

In [None]:
#nx.write_gml(G, 'graph_44557.gml')

In [None]:
# networkx has no `load`; rebuild the 44557 neighbourhood graph from the
# edge list written to graph_44557.csv above.
with open("graph_44557.csv", "rb") as fh:
    G = nx.read_edgelist(fh, delimiter=",")

In [None]:
G.nodes['5508']['name'] = 'contains'

In [None]:
for idx in edges_44557_nbs:
    G.nodes[str(idx)]['name'] = vid_vlabel[str(idx)]

In [None]:
G.nodes['2766']

In [None]:
nx.draw(G, with_labels=True)

In [None]:
edges_5508 = allcc_edges[(allcc_edges.end1 == 5508) | (allcc_edges.end2 == 5508)]

In [None]:
edges_5508.shape

In [None]:
edges_5508_20 = edges_5508.sample(20)

In [None]:
edges_5508_20.end1.unique(), edges_5508_20.end2.unique()

In [None]:
edges_5508_nbs = [5508, 77857,  58389,  93219,  27899,   9989,  83892, 111080,  78963, \
         64850,  54399,  65131,  98035,  36134,  81407,  37868,  75605, \
         40146,  84165,  44570,  57772]

In [None]:
graph_5508 = allcc_edges[(allcc_edges.end1.isin(edges_5508_nbs)) & (allcc_edges.end2.isin(edges_5508_nbs))]

In [None]:
graph_5508.shape

In [None]:
graph_5508 = allcc_edges[(allcc_edges.end2.isin(edges_5508_nbs)) & (allcc_edges.end1.isin(edges_5508_nbs))]

In [None]:
graph_5508.shape

In [None]:
graph_5508.to_csv('graph_5508.csv', header=False, index=False)

In [None]:
# merge graph_44557 and graph_5508
graph_44557_5508 = pd.concat([graph_44557, graph_5508])

In [None]:
graph_44557_5508.to_csv('graph_44557_5508.csv', header=False, index=False)

In [None]:
with open("graph_5508.csv", "rb") as fh:
    G5508 = nx.read_edgelist(fh, delimiter=",")

In [None]:
#plt.figure(figsize=(15,12))
nx.draw(G5508, with_labels=True)

In [None]:
train_df.iloc[edges_5508_nbs[4]].relation

In [None]:
import re
pat = r"/.*/"

# Node id (as a string key) -> relation name with everything up to its last
# "/" removed, for the 5508 neighbourhood.
vid_vlabel = {
    str(node_id): re.sub(pat, "", train_df.iloc[node_id].relation)
    for node_id in edges_5508_nbs
}

In [None]:
vid_vlabel

In [None]:
#nx.write_gml(G, 'graph_44557.gml')

In [None]:
# networkx has no `load`; the cells below index G with 44557-neighbourhood
# node ids, so rebuild that graph from its saved edge list.
with open("graph_44557.csv", "rb") as fh:
    G = nx.read_edgelist(fh, delimiter=",")

In [None]:
G.nodes['5508']['name'] = 'contains'

In [None]:
# vid_vlabel was rebuilt just above from edges_5508_nbs, so it no longer has
# keys for these nodes; derive each label directly from train_df instead.
for idx in edges_44557_nbs:
    G.nodes[str(idx)]['name'] = re.sub(pat, "", train_df.iloc[idx].relation)

In [None]:
G.nodes['2766']

In [None]:
with open('graph_44557_5508.csv', 'rb') as fh:
    Gboth = nx.read_edgelist(fh, delimiter=",")

In [None]:
Gboth.nodes

In [None]:
Gboth.number_of_edges()

In [None]:
# Union of both neighbourhood lists, keeping the first occurrence of each id
# so the original order is preserved.
edges_both = list(dict.fromkeys(edges_44557_nbs + edges_5508_nbs))

In [None]:
import re
pat = r"/.*/"

# Node id (as a string key) -> relation name with everything up to its last
# "/" removed, for the combined neighbourhood.
vid_vlabel = {
    str(node_id): re.sub(pat, "", train_df.iloc[node_id].relation)
    for node_id in edges_both
}

In [None]:
for idx in edges_both:
    Gboth.nodes[str(idx)]['name'] = vid_vlabel[str(idx)]

In [None]:
nx.write_gml(Gboth, 'graph_44557_5508.gml')

In [None]:
gg = nx.read_gml('graph_44557_5508.gml')

In [None]:
gg.nodes

In [None]:
train_df.iloc[44557]

In [None]:
train_df.iloc[5508]

# Relation Distribution in the LCC

In [None]:
lgst_cc_ids.shape

In [None]:
lgst_cc_ids.head()

In [None]:
train_df_lcc = train_df.iloc[lgst_cc_ids]

In [None]:
train_df_lcc.shape

In [None]:
train_df_lcc.head()

In [None]:
train_df_relgrp = train_df.groupby('relIdx').size().reset_index(name='count').sort_values('count', ascending=False)

In [None]:
train_df_lcc_relgrp = train_df_lcc.groupby('relIdx').size().reset_index(name='count').sort_values('count', ascending=False)

In [None]:
plt.figure(figsize=(15, 12))
plt.pie(train_df_relgrp['count'] / train_df.shape[0], labels=train_df_relgrp.relIdx)

In [None]:
plt.figure(figsize=(15, 12))
plt.pie(train_df_lcc_relgrp['count'] / train_df_lcc.shape[0], labels=train_df_lcc_relgrp.relIdx)

In [None]:
train_df_relgrp

In [None]:
train_df_lcc_relgrp

# Fit Degree Power Law

In [None]:
%pip install powerlaw

In [None]:
import numpy as np
import powerlaw
import scipy
from scipy import stats
import networkx as nx

def fit_x(x):
    """Fit a discrete power law to ``x + 1`` and print KS tests for two candidate models.

    Prints one line for a power-law model and one for a log-normal model,
    each a ``scipy.stats.kstest`` result computed on the same shifted data
    that was fitted.

    The KS tests compare integer data against continuous distributions whose
    parameters were estimated from that same data, so treat the p-values as
    rough indications only.

    Parameters
    ----------
    x : array-like of non-negative numbers (e.g. vertex degrees).
    """
    # Use one shifted sample for both the fit and the tests.
    data = np.array(x) + 1
    fit = powerlaw.Fit(data, xmin=1, discrete=True)
    alpha = fit.power_law.alpha
    xmin = fit.power_law.xmin
    # A density ~ x^-alpha on x >= xmin is SciPy's Pareto with shape
    # b = alpha - 1 and scale = xmin. SciPy's "powerlaw" is a different
    # family, with density a * x^(a - 1) on [0, 1].
    print('powerlaw', scipy.stats.kstest(data, "pareto", args=(alpha - 1, 0, xmin)))
    # SciPy's lognorm takes s = std of log(x) and scale = exp(mean of log(x)),
    # not the mean and std of x itself.
    log_data = np.log(data)
    print('lognorm', scipy.stats.kstest(data, "lognorm", args=(log_data.std(), 0, np.exp(log_data.mean()))))

In [None]:
fit_x(degrees.degree)

In [None]:
fit = powerlaw.Fit(np.array(degrees.degree)+1,xmin=1,discrete=True)

fit.power_law.plot_pdf( color= 'b',linestyle='--',label='fit ccdf')
fit.plot_pdf( color= 'b')

print('alpha= ',fit.power_law.alpha,'  sigma= ',fit.power_law.sigma)

In [None]:
fit.xmin

In [None]:
fit.distribution_compare('power_law', 'lognormal')

In [None]:
degrees.degree.min(), degrees.degree.max()

In [None]:
# 2000-wide degree bins: edges at 0, 2000, ..., 38000.
bins = list(range(0, 40000, 2000))
sum_cuts = pd.cut(degrees.degree, bins)
# value_counts() orders by frequency by default; sort back into bin order so
# the counts line up with the bin positions they are later paired with.
sum_counts = sum_cuts.value_counts().sort_index()

In [None]:
from scipy.optimize import leastsq

In [None]:
# NOTE(review): this rebinds the name `powerlaw`, hiding the `powerlaw`
# package imported earlier in this section; re-running the earlier
# powerlaw.Fit cells after this one will fail.
powerlaw = lambda x, amp, coef, index: amp * (x**index) + coef
# x: 2000, 4000, ..., 38000; y: the values of sum_counts in their stored order.
x = np.arange(2000, 40000, 2000)
y = np.array(sum_counts)

In [None]:
fitfunc = lambda p, x: p[0] * (x**p[1]) + p[2]
errfunc =  lambda p, x, y: (y - fitfunc(p, x))
pinit = [1, -1, 0.]
out = leastsq(errfunc, pinit, args = (x, y), full_output = 1)

In [None]:
pfinal = out[0]
print(pfinal[0])
print(pfinal[1])
print(pfinal[2])

In [None]:
import matplotlib.pyplot as plt

In [None]:
amp =  pfinal[0]
index = pfinal[1]
coef = pfinal[2]

plt.figure(figsize = (15, 9))
plt.plot(x, powerlaw(x, amp, coef, index), c = 'b', label = "Fit Power Law")
plt.scatter(x, y, label = "Raw Counts")
plt.text(22000, 20000, "$fit = %5.2fx^{%5.2f} + (%5.2f$)" % (amp, index, coef), color = 'b')
plt.xticks(np.arange(0, 44000, 2000), rotation=30)
plt.title("Sentence Embedding Similarity Graph Degree Best Fit Power Law")
plt.ylabel("Number of Degrees")
plt.xlabel("Degrees")
plt.grid(False)
plt.legend(loc = 'best')