In [53]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer

In [54]:
#df = pd.read_pickle('lemmatized.pkl')
df = pd.read_pickle('nltk_stemmed.pkl')

In [55]:
# Binarize text stored in list
mlb = MultiLabelBinarizer()
expandedLabelData = mlb.fit_transform(df['Title_Processed'])

In [56]:
#Tfi df encoding
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(expandedLabelData)

In [86]:
#NMF topic modeling
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 50
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(X_train_tfidf.T)

#nmf = LatentDirichletAllocation(n_components=no_topics, random_state=100).fit(X_train_tfidf.T)

(50, 1945)

In [85]:
#Dimensionality reduction
from sklearn.manifold import TSNE
nmf_embedded = TSNE(n_components=3).fit_transform(nmf.components_.T)

(1945, 3)

In [None]:
#join all data back together
reduced = pd.DataFrame(nmf_embedded)
ogdf = pd.read_excel('Night Shift Work and Light at Night_ Human cancer and biomonitoring studies (2018)-refs.xlsx')
jdf = reduced.join(ogdf)

In [70]:
#Get top words
vocab = mlb.classes_.tolist()
n_top_words = 5
topic_words = {}

for topic, comp in enumerate(nmf.components_):
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    topic_words[topic] = [vocab[i] for i in word_idx]
    
tw = pd.DataFrame.from_dict(topic_words, orient = 'index')
tw = tw + ', '
twc = pd.DataFrame()
twc['top words'] = tw[tw.columns[1:]].apply(
    lambda x: ''.join(x.dropna().astype(str)),
    axis=1
)
twc

Unnamed: 0,top words
0,"pituitary-adrenocort, 1960-1997, phenotyp, 24.0,"
1,"e3n, balanc, aid, chick,"
2,"2007, poor, 1956-68, lutein,"
3,"major, final, evalu, 10-year,"
4,"chondroitin, acoust, dmba-induc, endogen,"
5,"lutein, diagram, perspect, hiv/aid,"
6,"load, firefight, from, actigraphi,"
7,"aid, head-down, axe, axi,"
8,"45-year-old, 1960-1997, domest, fatigu,"
9,"obstetr, 4-androstenedion, 2-week, 2013,"


In [89]:
#Match categories to original titles
topic_values = nmf.fit_transform(X_train_tfidf)
topic_values.shape
nmf.components_.shape
jdf['categories'] = topic_values.argmax(axis=1)
export = jdf.join(twc, on = 'categories')

(50, 2635)

In [81]:
export.to_pickle('clustered.pkl')

In [82]:
export

Unnamed: 0,0,1,2,HAWC ID,HERO ID,PubMed ID,Citation,Full Citation,Title,Authors,...,Inclusion|New|New not included,Inclusion|New|New not included|Shift work and health effects,Inclusion|New|Included in mech,Inclusion|SW or Light Qx study,Pending PDF review,Exclusion,Exclusion|Previously included,Exclusion|Foreign language,categories,top words
0,3.502392,-12.773194,12.490341,211657,,,Kantermann T et al. 2012,"Kantermann T et al. Noisy and individual, but ...","Noisy and individual, but doable: Shift-work r...",Kantermann T et al.,...,False,False,False,False,False,False,False,False,43,"adenosin, ddm-spain/var-ddm, diencephalon, mer..."
1,4.208245,-13.998688,-2.022749,216525,,3038708.0,Arduini D et al. 1987,Arduini D et al. Loss of circadian rhythms of ...,Loss of circadian rhythms of fetal behaviour i...,Arduini D et al.,...,False,False,False,False,False,True,False,False,1,"e3n, balanc, aid, chick,"
2,25.682995,-0.418867,1.076639,216820,,9507438.0,Arendt 1998,Arendt J. Biological rhythms: the science of c...,Biological rhythms: the science of chronobiology.,Arendt J,...,False,False,False,False,False,True,False,False,44,"prior, night-induc, apnoea, actigraphi,"
3,5.154023,5.574293,27.332537,216554,,3732539.0,"Armeanu, Frölich M, and Lequin 1986","Armeanu MC, Frölich M, and Lequin RM. Circadia...",Circadian rhythm of prolactin during the menst...,"Armeanu MC, Frölich M, and Lequin RM",...,False,False,False,False,False,True,False,False,28,"', body-temperatur, dim, capac,"
4,1.576665,-10.669566,4.064666,216214,,19606092.0,Aeschbach D et al. 2009,Aeschbach D et al. Use of transdermal melatoni...,Use of transdermal melatonin delivery to impro...,Aeschbach D et al.,...,False,False,False,False,False,True,False,False,19,"anaesthesiologist, agent, adjuv, domest,"
5,-10.672046,-1.849153,-8.001163,215884,,10188140.0,Akerstedt T et al. 1999,Akerstedt T et al. A 50-Hz electromagnetic fie...,A 50-Hz electromagnetic field impairs sleep.,Akerstedt T et al.,...,False,False,False,False,False,True,False,False,40,"4-androstenedion, dip, dissatisfi, dehydroepia..."
6,-12.387359,24.914564,-4.813682,216705,,7510912.0,"Akimoto S, Masai M, and Shimazaki J 1994","Akimoto S, Masai M, and Shimazaki J. Relations...",Relationship between diurnal rhythm of serum t...,"Akimoto S, Masai M, and Shimazaki J",...,False,False,False,False,False,True,False,False,24,"acromegali, 're, oc-2-kb, patholog,"
7,-9.110147,0.256828,13.384620,216202,,19241804.0,Anisimov 2008,"Anisimov VN. [Pineal gland, biorhythms and agi...","[Pineal gland, biorhythms and aging of an orga...",Anisimov VN,...,False,False,False,False,False,True,False,False,42,"coffe, end-spurt, contemporari, circadian-rhyt..."
8,4.968685,6.227458,5.002771,216751,,8473410.0,Apter D et al. 1993,Apter D et al. Gonadotropin-releasing hormone ...,Gonadotropin-releasing hormone pulse generator...,Apter D et al.,...,False,False,False,False,False,True,False,False,46,"better, lower, cutan, amino,"
9,3.364913,-14.859250,9.559494,216465,,26059855.0,Archer and Oster H 2015,Archer SN and Oster H. How sleep and wakefulne...,How sleep and wakefulness influence circadian ...,Archer SN and Oster H,...,False,False,False,False,False,True,False,False,43,"adenosin, ddm-spain/var-ddm, diencephalon, mer..."
