In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Dataset folder name, and the path of article_core.csv inside that folder.
DATASET = 'DatasetsWtime'
ARTICLE_CORE = DATASET + '/article_core.csv'

In [None]:
# ARTICLE_CORE is already a string path, so it is handed to pandas as-is.
core = pd.read_csv(ARTICLE_CORE)

In [None]:
# Load the stored embeddings table. Later cells read its 'embedding',
# 'section', 'article_id' and 'title' columns.
df_embeddings = pd.read_csv('Embeddings/embeddings.csv')
# Preview the first rows only instead of rendering the whole frame.
df_embeddings.head()

In [None]:
def convert_to_nd_array(sequence):
    """Parse the text form of a flat numeric list into a 1-D float array.

    Two layouts are accepted:

    * comma-separated, e.g. ``'[0.1, 0.2,\\n 0.3]'``
    * whitespace-separated, e.g. ``'[0.1 0.2\\n 0.3]'``

    Square brackets are ignored wherever they appear. An input that holds no
    numbers at all (``'[]'`` or blank text) yields an empty array instead of
    raising.

    Parameters
    ----------
    sequence : str
        Text representation of the vector.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with float dtype.

    Raises
    ------
    ValueError
        If any token cannot be converted with ``float``.
    """
    # Brackets carry no information for a flat list, so drop them up front.
    body = sequence.replace('[', '').replace(']', '')

    # Nothing but brackets/whitespace: float('') would raise, so return early.
    if not body.strip():
        return np.array([], dtype=float)

    if ',' in body:
        # Comma-separated: whitespace (including line breaks) is insignificant.
        tokens = ''.join(body.split()).split(',')
    else:
        # No commas at all: treat any run of whitespace as the separator.
        tokens = body.split()

    return np.array([float(token) for token in tokens], dtype=float)

In [None]:
# Convert each stored embedding into a NumPy array. Values that are already
# arrays are passed through untouched, so re-running this cell after a
# successful conversion does not raise; anything else still goes to the parser.
df_embeddings['embedding'] = df_embeddings['embedding'].apply(
    lambda value: value if isinstance(value, np.ndarray) else convert_to_nd_array(value)
)

In [None]:
# Sections listed here are removed from the embeddings table before plotting.
exclude_sections = ['Bedriftsroboten', 'Bolig', 'Boligsalg', 'Direkte', 'Hyttesalg']
# Every section value present in the data that is not in the exclusion list.
include = list(set(df_embeddings['section']).difference(exclude_sections))
df_embeddings = df_embeddings.loc[df_embeddings['section'].isin(include)]

In [None]:
# Gather the per-row embedding values into a single NumPy array.
embeddings = np.array(df_embeddings['embedding'].tolist())

In [None]:
# t-SNE is stochastic; pinning the seed makes the layout identical on every
# Restart & Run All instead of changing from run to run.
TSNE_RANDOM_STATE = 42

# Perplexity and learning rate are scaled with the number of remaining rows.
n_articles = len(df_embeddings)
tsne = TSNE(
    n_components=2,
    perplexity=n_articles / 100,
    learning_rate=n_articles / 12,
    early_exaggeration=1,
    random_state=TSNE_RANDOM_STATE,
)
embeddings_tsne = tsne.fit_transform(embeddings)

In [None]:
# RGB colour (three floats per section) used for that section's points and
# for its legend entry.
colors_by_section = {
    'A-Magasinet': (0.8087954113106306, 0.5634700050056693, 0.19502642696727285),
    'BT Magasinet': (0.9656056642634557, 0.4245907603266889, 0.6579786740552919),
    'Bedriftsroboten': (0.22335772267769388, 0.6565792317435265, 0.8171355503265633),
    'Bil': (0.4768773964929644, 0.5974418160509446, 0.9584992622400258),
    'Bolig': (0.49382662140640926, 0.6649121332643736, 0.19300804648700284),
    'Boligsalg': (0.6423044349219739, 0.5497680051256467, 0.9582651433656727),
    'Debatt': (0.5920891529639701, 0.6418467016378244, 0.1935069134991043),
    'Digital': (0.9699521567340649, 0.4569882390259858, 0.36385324448493633),
    'Familie og oppvekst': (0.20703735729643508, 0.6824290013722435, 0.5885318893529169),
    'Fotball': (0.9633321742064956, 0.40643825645731757, 0.7592537599568671),
    'Hyttesalg': (0.21044753832183283, 0.6773105080456748, 0.6433941168468681),
    'Innenriks': (0.21786710662428366, 0.6656671601322255, 0.7482809385065813),
    'Karriere': (0.9082572436765556, 0.40195790729656516, 0.9576909250290225),
    'Kommentar': (0.20312757197899856, 0.6881249249803418, 0.5177618167447304),
    'Kultur': (0.3126890019504329, 0.6928754610296064, 0.1923704830330379),
    'Leder': (0.7350228985632719, 0.5952719904750953, 0.1944419133847522),
    'Lokalt': (0.19783576093349015, 0.6955516966063037, 0.3995301037444499),
    'Minneord': (0.6666319352625271, 0.6197366714155128, 0.19396267878823373),
    'Nyheter': (0.9603888539940703, 0.3814317878772117, 0.8683117650835491),
    'Reise': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701),
    'Sport': (0.903599057664843, 0.511987276335809, 0.19588350060161624),
    'Sprek': (0.21387918628643265, 0.6720135434784761, 0.693961140878689),
    'Utenriks': (0.23299120924703914, 0.639586552066035, 0.9260706093977744),
    'Økonomi': (0.774710828527837, 0.49133823414365724, 0.9580114121137316),
}

In [None]:
# Matplotlib marker symbol used for each section's points and legend entry.
marker_by_section = {
    'A-Magasinet': 'v',
    'BT Magasinet': '*',
    'Bedriftsroboten': 'v',
    'Bil': '.',
    'Bolig': 'x',
    'Boligsalg': 'v',
    'Debatt': 'v',
    'Digital': 'x',
    'Familie og oppvekst': 'x',
    'Fotball': '.',
    'Hyttesalg': 'v',
    'Innenriks': 'x',
    'Karriere': 'v',
    'Kommentar': '*',
    'Kultur': '+',
    'Leder': '+',
    'Lokalt': '.',
    'Minneord': '^',
    'Nyheter': '.',
    'Reise': 'v',
    'Sport': '+',
    'Sprek': 'v',
    'Utenriks': '*',
    'Økonomi': '+',
}

In [None]:
# Drop the excluded sections from both style lookups. pop() with a default
# ignores names that are absent from a dict, so re-running is harmless.
for excluded_section in exclude_sections:
    colors_by_section.pop(excluded_section, None)
    marker_by_section.pop(excluded_section, None)

In [None]:
# Plotting table: identifying columns from df_embeddings, the two t-SNE
# coordinates, and the per-section colour/marker looked up from the style dicts.
# (The 'keyword' column is not carried over.)
new_emb_df = df_embeddings[['article_id', 'title', 'section']].assign(
    x=embeddings_tsne[:, 0],
    y=embeddings_tsne[:, 1],
    color=lambda frame: frame['section'].map(colors_by_section),
    marker=lambda frame: frame['section'].map(marker_by_section),
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
new_emb_df.head()

In [None]:
from matplotlib.lines import Line2D

fig, ax = plt.subplots(figsize=(25, 15))

# Legend proxies: one marker-only Line2D per entry of colors_by_section, with
# the matching label kept at the same position in section_name.
marker_and_color = []
section_name = []
for name, rgb_c in colors_by_section.items():
    marker_and_color.append(
        Line2D([], [], marker=marker_by_section.get(name), markersize=10,
               color=rgb_c, linestyle='None')
    )
    section_name.append(name)

# One scatter call per section rather than one per article: every row of a
# section carries the same colour and marker, so the first row's values are
# used for the whole group. This avoids creating one artist per point.
for _, section_rows in new_emb_df.groupby('section', sort=False):
    ax.scatter(
        section_rows['x'],
        section_rows['y'],
        color=section_rows['color'].iloc[0],
        marker=section_rows['marker'].iloc[0],
    )

ax.legend(marker_and_color, section_name, loc='lower left', fontsize=13)
#plt.savefig('../Figures_tSNE/tsne_filtered_new_fs_13.png', format='png', dpi=200,bbox_inches='tight')
plt.show()