In [1]:
from tqdm.notebook import tqdm
import visualize as vis
import utils as ut
import warnings
warnings.filterwarnings('ignore')

## t-SNE Dimensionality Reduction of Documents


### Patents

In [2]:
# Pick the LDA-Mallet run by its topic count and locate its doc-topics file
n_topics = 20
docs = f'results/patents/lda_mallet_model_{n_topics}/mallet_output/doc-topics.txt'

# Read the doc-topics file and pass it through the project's t-SNE helper
tsne_input = ut.read_doc_topics(docs)
tsne = ut.tsne_model(tsne_input)

# Load the pickled patents table, keep one row per 'Patent ID' and index by
# that ID; drop=False keeps 'Patent ID' available as a regular column too
patents = (
    ut.unpickler('app_data/data/patents_dataframe.pkl')
    .drop_duplicates('Patent ID')
    .set_index('Patent ID', drop=False)
)

# Attach the patent columns to the t-SNE result and persist the combined frame
# (assumes `tsne` is a DataFrame indexed by patent ID -- TODO confirm in utils)
joined_df = tsne.join(patents)
ut.pickler('app_data/data/patents_tsne.pkl', joined_df)

# Hover labels for the plot
HOVER_TEXT = vis.create_hovertexts(joined_df, 'patents')

# Build the figure
fig1 = vis.display_documents_tsme(joined_df, n_topics, HOVER_TEXT)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1961 samples in 0.000s...
[t-SNE] Computed neighbors for 1961 samples in 0.151s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1961
[t-SNE] Computed conditional probabilities for sample 1961 / 1961
[t-SNE] Mean sigma: 0.099918
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.508530
[t-SNE] KL divergence after 1000 iterations: 0.689350


In [3]:
# Export the patents figure as a static PNG using the kaleido engine
fig1.write_image(
    'img/tsne_patents.png',
    format='png',
    engine='kaleido',
)

### Cordis

In [4]:
# Pick the LDA-Mallet run by its topic count and locate its doc-topics file
n_topics = 15
docs = f'results/cordis/lda_mallet_model_{n_topics}/mallet_output/doc-topics.txt'

# Read the doc-topics file and pass it through the project's t-SNE helper
tsne_input = ut.read_doc_topics(docs)
tsne = ut.tsne_model(tsne_input)

# Load the pickled CORDIS table, keep one row per 'Project ID' and index by
# that ID; drop=False keeps 'Project ID' available as a regular column too.
# Chained (non-inplace) calls leave the freshly loaded frame unmodified.
cordis = (
    ut.unpickler('app_data/data/cordis_dataframe.pkl')
    .drop_duplicates('Project ID')
    .set_index('Project ID', drop=False)
)

# 'Year' is the text before the first '-' of the stringified 'Start Year'.
# Mapping over that single column avoids constructing a Series for every row,
# which a row-wise DataFrame.apply(axis=1) would do.
cordis['Year'] = cordis['Start Year'].map(lambda start: str(start).split('-')[0])

# Attach the project columns to the t-SNE result and persist the combined frame
# (assumes `tsne` is a DataFrame indexed by project ID -- TODO confirm in utils)
joined_df = tsne.join(cordis)
ut.pickler('app_data/data/cordis_tsne.pkl', joined_df)

# Hover labels for the plot
HOVER_TEXT = vis.create_hovertexts(joined_df, 'cordis')

# Build the figure
fig2 = vis.display_documents_tsme(joined_df, n_topics, HOVER_TEXT)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 19838 samples in 0.021s...
[t-SNE] Computed neighbors for 19838 samples in 8.143s...
[t-SNE] Computed conditional probabilities for sample 1000 / 19838
[t-SNE] Computed conditional probabilities for sample 2000 / 19838
[t-SNE] Computed conditional probabilities for sample 3000 / 19838
[t-SNE] Computed conditional probabilities for sample 4000 / 19838
[t-SNE] Computed conditional probabilities for sample 5000 / 19838
[t-SNE] Computed conditional probabilities for sample 6000 / 19838
[t-SNE] Computed conditional probabilities for sample 7000 / 19838
[t-SNE] Computed conditional probabilities for sample 8000 / 19838
[t-SNE] Computed conditional probabilities for sample 9000 / 19838
[t-SNE] Computed conditional probabilities for sample 10000 / 19838
[t-SNE] Computed conditional probabilities for sample 11000 / 19838
[t-SNE] Computed conditional probabilities for sample 12000 / 19838
[t-SNE] Computed conditional probabilities for sam

In [5]:
# Export the CORDIS figure as a static PNG using the kaleido engine
fig2.write_image(
    'img/tsne_cordis.png',
    format='png',
    engine='kaleido',
)

### Semantic Scholar

In [6]:
# Pick the LDA-Mallet run by its topic count and locate its doc-topics file
n_topics = 20
docs = f'results/semantic_scholar/lda_mallet_model_{n_topics}/mallet_output/doc-topics.txt'

# Read the doc-topics file and pass it through the project's t-SNE helper
tsne_input = ut.read_doc_topics(docs)
tsne = ut.tsne_model(tsne_input)

# Load the pickled Semantic Scholar table, keep one row per 'Publication ID'
# and index by that ID; drop=False keeps 'Publication ID' as a regular column.
# Chained (non-inplace) calls leave the freshly loaded frame unmodified.
semantic_scholar = (
    ut.unpickler('app_data/data/semantic_scholar_dataframe.pkl')
    .drop_duplicates('Publication ID')
    .set_index('Publication ID', drop=False)
)

# Attach the publication columns to the t-SNE result and persist the result
# (assumes `tsne` is a DataFrame indexed by publication ID -- TODO confirm in utils)
joined_df = tsne.join(semantic_scholar)
ut.pickler('app_data/data/semantic_scholar_tsne.pkl', joined_df)

# Hover labels for the plot
HOVER_TEXT = vis.create_hovertexts(joined_df, 'semantic_scholar')

# Build the figure
fig3 = vis.display_documents_tsme(joined_df, n_topics, HOVER_TEXT)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 306844 samples in 0.019s...
[t-SNE] Computed neighbors for 306844 samples in 226.706s...
[t-SNE] Computed conditional probabilities for sample 1000 / 306844
[t-SNE] Computed conditional probabilities for sample 2000 / 306844
[t-SNE] Computed conditional probabilities for sample 3000 / 306844
[t-SNE] Computed conditional probabilities for sample 4000 / 306844
[t-SNE] Computed conditional probabilities for sample 5000 / 306844
[t-SNE] Computed conditional probabilities for sample 6000 / 306844
[t-SNE] Computed conditional probabilities for sample 7000 / 306844
[t-SNE] Computed conditional probabilities for sample 8000 / 306844
[t-SNE] Computed conditional probabilities for sample 9000 / 306844
[t-SNE] Computed conditional probabilities for sample 10000 / 306844
[t-SNE] Computed conditional probabilities for sample 11000 / 306844
[t-SNE] Computed conditional probabilities for sample 12000 / 306844
[t-SNE] Computed conditional proba

In [None]:
# Export the Semantic Scholar figure as a static PNG using the kaleido engine
fig3.write_image(
    'img/tsne_semantic_scholar.png',
    format='png',
    engine='kaleido',
)

