In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
papers=pd.read_csv('../input/nips-papers-1987-2019-updated/papers.csv')
#authors=pd.read_csv('authors.csv')

**Dataset**

In [None]:
papers.head()

In [None]:

train=papers.dropna().reset_index()
len(train)

In [None]:
train.head()

In [None]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
nlp = spacy.load('en_core_web_lg')
nlp.Defaults.stop_words |= {'dataset','datasets','propose','Dataset','Datasets','matrix'}
nlp.Defaults.stop_words -= {'data','Data'}
    

In [None]:
documents=train['abstract']
len(documents)

In [None]:
train=papers.dropna().reset_index()

full_docs=train['full_text']
len(full_docs)


In [None]:
from gensim.parsing.preprocessing import (
    preprocess_string,
    remove_stopwords,
    strip_multiple_whitespaces,
    strip_numeric,
    strip_punctuation,
    strip_short,
    strip_tags,
)

# Cleaning steps handed to preprocess_string, listed in the order they should run.
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_tags,
    strip_punctuation,
    remove_stopwords,
    strip_numeric,
    strip_multiple_whitespaces,
    strip_short,
]

# One cleaned token list per paper, in the same order as `full_docs`.
full_docs_final = [preprocess_string(text, CUSTOM_FILTERS) for text in full_docs]

    


In [None]:
# Re-join each paper's cleaned tokens into a single space-separated string.
# (Replaces an index loop that used list.insert and a throw-away `b=" "` placeholder.)
a = [' '.join(tokens) for tokens in full_docs_final]

In [None]:
# The LDA pipeline below is fed the raw paper text in `full_docs`.
# A preceding `docs2=a` was overwritten immediately, so that dead assignment is dropped.
# NOTE(review): if the gensim-cleaned strings in `a` were meant to feed the model,
# change this to `docs2 = a` and re-check the hand-made topic labels further down.
docs2=full_docs

Tokenization and stop word removal

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# NOTE(review): `stop_words` is built here but never applied to the tokens below;
# confirm whether explicit stop-word removal was intended before wiring it in,
# because it would change the learned topics.
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

# Build a brand-new list of token lists instead of assigning into `docs2[idx]`:
# `docs2` is the same object as `full_docs`, so item assignment used to overwrite
# the raw text Series with token lists as a side effect.
docs2 = [
    [
        token
        for token in tokenizer.tokenize(text.lower())  # lowercase, then split into words
        # Drop pure numbers (words that merely contain digits are kept)
        # and tokens that are only one character long.
        if not token.isnumeric() and len(token) > 1
    ]
    for text in docs2
]


Building Corpus and Dictionary

In [None]:
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary2 = Dictionary(docs2)
dictionary2.save("filename")
dictionary2=Dictionary.load("filename")

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary2.filter_extremes(no_below=20, no_above=0.5)

In [None]:
corpus2 = [dictionary2.doc2bow(doc) for doc in docs2]

Training the LDA model

In [None]:
from gensim.models import LdaModel

# 10-topic LDA model over the bag-of-words corpus; random_state is pinned so the
# topic numbering used later in the notebook can be reproduced.
lda2 = LdaModel(
    corpus=corpus2,
    id2word=dictionary2,
    num_topics=10,
    passes=15,
    iterations=50,
    alpha=0.9,
    random_state=100,
)

lda2.print_topics()

Coherence Plot

In [None]:
from gensim.models import CoherenceModel

# u_mass coherence for each candidate number of topics (5..19).
# random_state is pinned (same seed as `lda2`) so the curve is reproducible;
# without it every run trains differently-initialised models.
topic_coherence = []
for nb_topics in range(5,20):
    lda = LdaModel(corpus2, num_topics = nb_topics, id2word = dictionary2, passes=10, random_state=100)
    cohm = CoherenceModel(model=lda, corpus=corpus2, dictionary=dictionary2, coherence='u_mass')
    coh = cohm.get_coherence()
    topic_coherence.append(coh)

Plotting the coherence scores and the obtained topics using WordCloud

In [None]:
import matplotlib.pyplot as plt

# Coherence score against the number of topics tried in the sweep (5..19).
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(5, 20), topic_coherence)
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score");

In [None]:
#import pyLDAvis
#import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda2, corpus2, dictionary2)
#vis


#import pyLDAvis.sklearn
#pyLDAvis.enable_notebook() # To enable the visualization on the notebook
#panel = pyLDAvis.sklearn.prepare(lda2, corpus2,docs2, mds='tsne') # Create the panel for the visualizadocs2


In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] 

cloud = WordCloud(stopwords=nlp.Defaults.stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=16,
                )



In [None]:
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(20,15))
axs = axs.flatten()

# Pair every topic of the model with one axis. zip() stops at the shorter of the
# two, so the loop follows the model's topic count instead of a hard-coded 10.
for t, ax in zip(range(lda2.num_topics), axs):
    word_weights = dict(lda2.show_topic(t, 30))  # top-30 words -> weight
    cloud.generate_from_frequencies(word_weights)
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title('Topic '+str(t+1))

# Hide any grid cell that did not receive a topic.
for ax in axs[lda2.num_topics:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

Obtaining Topic Distributions and Keywords from the topic model

In [None]:
train=papers.dropna().reset_index()
def format_topics_sentences(ldamodel=lda2, corpus=corpus2, texts=train):
    """Tabulate the dominant topic of every document in `corpus`.

    Parameters
    ----------
    ldamodel : object supporting ``ldamodel[corpus]`` and ``show_topic(topic_id)``
        The topic model. Indexing it with the corpus must yield, per document,
        a list of ``(topic_id, probability)`` pairs.
    corpus : iterable
        Documents in the representation `ldamodel` accepts.
    texts : anything accepted by ``pd.Series``
        Original texts, attached as the last column of the result.

    The defaults are bound to the notebook globals at definition time.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4 places)
        and ``Topic_Keywords``, followed by the `texts` column.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Collect plain records and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and growing a frame row by row is quadratic.
    keyword_cache = {}  # topic id -> "word, word, ..." so show_topic runs once per topic
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # No topic reported for this document; it contributes no row.
            continue
        # First pair with the highest probability (same pick as a stable descending sort).
        topic_num, prop_topic = max(row, key=lambda x: x[1])
        topic_num = int(topic_num)
        if topic_num not in keyword_cache:
            wp = ldamodel.show_topic(topic_num)
            keyword_cache[topic_num] = ", ".join([word for word, prop in wp])
        records.append((topic_num, round(float(prop_topic), 4), keyword_cache[topic_num]))

    # Passing `columns` explicitly keeps the schema even when there are no records.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

final_df=format_topics_sentences(ldamodel=lda2, corpus=corpus2, texts=train['full_text'])

In [None]:
final_df.head(10)

In [None]:
pd.options.display.max_colwidth = 100

sent_topics_outdf_grpd = final_df.groupby('Dominant_Topic')

# For every dominant topic keep the single document with the highest contribution.
# The per-topic rows are gathered first and concatenated once, rather than
# re-concatenating onto an initially empty frame inside the loop.
sent_topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet


**Gold standards extracted from** https://neurips.cc/Conferences/2020/PaperInformation/SubjectAreas

In [None]:
Algorithms= 'Active Learning, Bandit Algorithms, Boosting  Ensemble Methods, Classification, Clustering, Collaborative Filtering, Components Analysis, CCA, ICA, LDA, PCA, Density Estimation, Dynamical Systems, Hyperparameter Selection, Kernel Methods, Large Margin Methods, Metric Learning, Missing Data, Model Selection and Structure Learning, Multitask and Transfer Learning, Nonlinear Dimensionality Reduction  Manifold Learning, Online Learning, Ranking  Preference Learning, Regression, Reinforcement Learning, Relational Learning, Representation Learning, Semi-Supervised Learning, Similarity and Distance Learning, Sparse Coding  Dimensionality Expansion, Sparsity  Compressed Sensing, Spectral Methods, Sustainability, Stochastic Methods, Structured Prediction,  Unsupervised Learning'


Applications= 'Audio  Speech Processing, Computational Biology , Bioinformatics, Computational Social Science, Computer Vision, Denoising, Dialog Communication-Based Learning, Fairness Accountability , Transparency, Game Playing, Hardware  Systems, Image Segmentation, Information Retrieval, Matrix  Tensor Factorization, Motor Control, Music Modeling  Analysis, Natural Language Processing, Natural Scene Statistics, Network Analysis, Object Detection, Object Recognition, Privacy Anonymity  Security, Quantitative Finance  Econometrics, Recommender Systems, Robotics, Signal Processing, Source Separation, Speech Recognition, Systems Biology, Text Analysis, Time Series Analysis, Video, Motion  Tracking, Visual Features, Visual Perception, Visual Question Answering, Visual Scene Analysis  Interpretation,  Web Applications  Internet Data'


Data= 'Competitions Implementations Software Benchmarks Competitions  Challenges Data Sets  Data Repositories Software Toolkits'


Deep_Learning= 'Adversarial Networks, Attention Models, Biologically Plausible Deep Networks, Deep Autoencoders, Efficient Inference Methods, Efficient Training Methods, Embedding Approaches, Generative Models, Interaction-Based Deep Networks, Learning to Learn, Memory-Augmented Neural Networks, Neural Abstract Machines, One-Shot/Low-Shot Learning Approaches, Optimization  Deep Networks, Predictive Models, Program Induction, Recurrent Networks, Supervised Deep Networks, Virtual Environments,  Visualization Techniques  Deep Networks'


Neuroscience_Cognitive_Science= 'Auditory Perception  Modeling, Brain Imaging, Brain Mapping, Brain Segmentation, Brain--Computer Interfaces  Neural Prostheses, Cognitive Science, Connectomics, Human  Animal Learning, Language Cognitive Science, Memory, Neural Coding, Neuropsychology, Neuroscience, Perception, Plasticity  Adaptation, Problem Solving, Reasoning, Spike Train Generation,  Synaptic Modulation'
    

Probabilistic_Methods= 'Bayesian Nonparametrics, Bayesian Theory, Belief Propagation, Causal Inference, Distributed Inference, Gaussian Processes, Graphical Models, Hierarchical Models, Latent Variable Models, MCMC, Topic Models,  Variational Inference'


Optimization= 'Combinatorial Optimization, Convex Optimization, Non-Convex Optimization,  Submodular Optimization'



Reinforcement_Learning=  'Planning Decision Control, Exploration, Hierarchical RL, Markov Decision Processes, Model-Based RL, Multi-Agent RL, Navigation'


Theory= 'Competitive Analysis, Computational Complexity, Control Theory, Frequentist Statistics, Game Theory  Computational Economics, Hardness of Learning  Approximations, Information Theory, Large Deviations  Asymptotic Analysis, Learning Theory, Regularization, Spaces of Functions  Kernels,  Statistical Physics of Learning'


Social_Aspects=' AI Safety Fairness, Accountability,  Transparency Privacy, Anonymity,  Security'
 

In [None]:
# First row of every Dominant_Topic group; the result is indexed by the topic id.
pred=pd.DataFrame(final_df.groupby('Dominant_Topic').first())

# Display numbers 1..10 for the topics (hard-coded to ten topics).
Num=pd.DataFrame(np.array((range(1,11))))


# Side-by-side concat: rows are paired by index label, i.e. Num's default 0..9 index
# against pred's Dominant_Topic index.
# NOTE(review): this presumably relies on the topic ids being exactly 0..9 and all
# present in final_df — verify; a missing topic would leave a NaN keyword cell.
pred_new=pd.concat([Num,pred["Topic_Keywords"]],axis=1)

pred_new.columns=['Topic','Keywords']

pred_new

In [None]:
Keywords=pd.DataFrame([Algorithms,Applications,Data,Deep_Learning,Neuroscience_Cognitive_Science,
Probabilistic_Methods,
Optimization,
Reinforcement_Learning,
Theory,
Social_Aspects,])
Keywords

In [None]:
labels=pd.DataFrame(['Algorithms','Applications','Data','Deep_Learning','Neuroscience_Cognitive_Science',
'Probabilistic_Methods',
'Optimization',
'Reinforcement_Learning',
'Theory',
'Social_Aspects',])

In [None]:
gold_std=pd.concat([labels,Keywords],axis=1)
gold_std.columns=['Topics','Keywords']


In [None]:
gold_std

In [None]:
pred_new['Keywords'][5]
len(final_df)

Converting the topic distributions into a feature matrix for the classifier

In [None]:
# Topic distribution of every training document as a list of (topic_id, probability)
# pairs; minimum_probability=0.0 is passed so low-probability topics are not filtered out.
train_vecs = [
    lda2.get_document_topics(corpus2[doc_idx], minimum_probability=0.0)
    for doc_idx in range(len(train))
]
    
    

In [None]:
X=[]
X = np.array((train_vecs))

X.shape



In [None]:
X[:,:,0]

**Training a classifier based on the generated labels.**

Train and test split

In [None]:
from sklearn.model_selection import train_test_split



y=final_df.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(
     X[:,:,1],y, test_size=0.33,random_state=42)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

model=MultinomialNB()

model.fit(X_train,y_train)


**Predictions**

In [None]:
predictions = model.predict(X_test)

In [None]:
predictions
y_test

In [None]:

### from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred). The arguments used to be passed the
# other way round, which swaps precision with recall and reports support per
# predicted class instead of per true class.
print(classification_report(y_test, predictions))

In [None]:

X2=gold_std["Keywords"]
y2=gold_std["Topics"]

X2

In [None]:
pipeline=[]
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
pipeline.fit(X2,y2)


In [None]:
predictions2=[]
predictions2 = pipeline.predict(final_df['Topic_Keywords'])


predictions2

In [None]:
naive_bayes=pd.DataFrame(pd.concat([pd.DataFrame(predictions2),final_df],axis=1))

In [None]:

from sklearn.linear_model import LogisticRegression
# Text-classification pipeline: bag of words -> TF-IDF -> logistic regression.
pipeline2 = Pipeline([
    ('bow', CountVectorizer(stop_words='english')),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ logistic regression
])

In [None]:
pipeline2.fit(X2,y2)

In [None]:
predictions_logst_reg = pipeline2.predict(final_df["Topic_Keywords"])
logst_regr=pd.DataFrame(pd.concat([pd.DataFrame(predictions_logst_reg),final_df],axis=1))

In [None]:
from sklearn.svm import LinearSVC
# Text-classification pipeline: bag of words -> TF-IDF -> linear SVM,
# fitted on the gold-standard keyword strings (X2) and their labels (y2).
SVC_pipeline = Pipeline([
    ('bow', CountVectorizer(stop_words='english')),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier',LinearSVC()),  # train on TF-IDF vectors w/ a linear SVM
])
SVC_pipeline.fit(X2,y2)

In [None]:
pred_svm=SVC_pipeline.predict(final_df["Topic_Keywords"])

In [None]:
svm_df=pd.DataFrame(pd.concat([pd.DataFrame(pred_svm),final_df],axis=1))


In [None]:
svm_df

In [None]:
pd.DataFrame(svm_df['Topic_Keywords'].loc[(svm_df[0] == "Algorithms")]).groupby('Topic_Keywords').Topic_Keywords.count()

In [None]:
final_df.groupby("Dominant_Topic")

Mapping obtained labels and predicted labels with some analysis and domain knowledge

In [None]:
# Work on a copy so `final_df` keeps its numeric topic ids and the cells above
# stay re-runnable (previously `df` was just another name for `final_df`).
df = final_df.copy()

# Hand-assigned mapping from LDA topic id to gold-standard subject area.
TOPIC_LABELS = {
    0: "Applications",
    1: "Optimization",
    2: "Neuroscience_Cognitive_Science",
    3: "Algorithms",
    4: "Social_Aspects",
    5: "Probabilistic_Methods",
    6: "Reinforcement_Learning",
    7: "Deep_Learning",
    8: "Theory",
    9: "Data",
}

# One column assignment replaces ten chained `df[col].loc[mask] = value` writes;
# chained assignment may write to a temporary object and is not guaranteed to
# update `df`. Topic ids outside 0-9 would become NaN here.
df['Dominant_Topic'] = df['Dominant_Topic'].astype(int).map(TOPIC_LABELS)



In [None]:
df['Topic_Keywords'].loc[(df['Dominant_Topic']=='Algorithms')]

In [None]:
print(classification_report(df["Dominant_Topic"],predictions2,zero_division=0))

In [None]:
# Placeholder; rebound to the real pipeline just below.
MLP_pipeline=[]
from sklearn.neural_network import MLPClassifier
# Text-classification pipeline: bag of words -> TF-IDF -> multi-layer perceptron.
MLP_pipeline = Pipeline([
    ('bow', CountVectorizer(stop_words='english')),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier',MLPClassifier(max_iter=9,hidden_layer_sizes=(1200,1200,1200),random_state=100)),  # train on TF-IDF vectors w/ an MLP (three hidden layers of 1200 units, max_iter=9)
])


In [None]:
MLP_pipeline.fit(X2,y2)
pred_mlp=MLP_pipeline.predict(final_df["Topic_Keywords"])

In [None]:
print(classification_report(df["Dominant_Topic"],pred_mlp,zero_division=0))

From the above results we can see that th

In [None]:
train=papers.dropna().reset_index()

from functools import reduce
def tn(sentence):
  """Return the element-wise sum of the spaCy vocabulary vectors of the words in `sentence`.

  `sentence` is split on whitespace and each word is looked up in ``nlp.vocab``.
  An empty or whitespace-only sentence yields an all-zero vector of the
  vocabulary's vector width instead of the ``TypeError`` that ``reduce`` raises
  on an empty sequence.
  """
  vectors = [nlp.vocab[w].vector for w in sentence.split()]
  if not vectors:
    return np.zeros(nlp.vocab.vectors_length, dtype="float32")
  return reduce(lambda x,y: x+y, vectors)

This part was only experimental and is not part of the project. I wanted to explore this a bit, but it didn't make sense, so ignore it.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
train=papers.dropna().reset_index()

# NOTE: `X` and `model` are rebound here; earlier cells used these names for the
# LDA feature matrix and the Naive Bayes classifier.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(train['full_text'])
true_k = 10
# With n_init=1 the result depends entirely on a single random initialisation,
# so the seed is pinned to make the clusters reproducible between runs.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(X)

In [None]:
# Per cluster, feature indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on old versions that do not have get_feature_names_out() yet.
terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

In [None]:
# Flat list: the ten top-ranked terms of cluster 0, then of cluster 1, and so on.
# Each term keeps the leading space produced by the " %s" format.
clusters = [
    " %s" % terms[ind]
    for cluster_id in range(true_k)
    for ind in order_centroids[cluster_id, :10]
]

In [None]:
# One column per cluster, one row per term rank. The table gets its own name so the
# flat `clusters` list is left intact: rebinding `clusters` to the transposed frame
# made a second run of this cell transpose it again and mix terms across clusters.
cluster_terms = pd.DataFrame(np.reshape(clusters, (true_k, -1)).T)

# Two clouds per row; at least the original 6 rows, more if true_k needs them.
n_rows = max(6, -(-true_k // 2))

plt.figure( figsize=(10,20) )
for i in cluster_terms.columns:  # iterate over clusters, not over term ranks
    plt.subplot(n_rows,2,i+1)
    wordcloud2 = WordCloud().generate(' '.join(cluster_terms[i]))
    plt.imshow(wordcloud2)
    plt.axis("off")
    plt.title("Cluster #" + str(i+1))

plt.savefig("terms_all.png", bbox_inches='tight')

plt.show()