In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into one lazy stream of file paths and print each.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
np.random.seed(2018)
import matplotlib.pyplot as plt

# Reading the data

In [None]:
# Keep the dataset location in one place so it only has to be changed once
# if the input directory is moved or renamed.
DATA_DIR = "/kaggle/input/nips-papers-1987-2019-updated"

# Load the two CSV files of the dataset into DataFrames.
authors = pd.read_csv(f"{DATA_DIR}/authors.csv")
papers = pd.read_csv(f"{DATA_DIR}/papers.csv")

Have a look at both of the data files that we just read.

In [None]:
authors.head()

In [None]:
papers.head()

#### Details about the Data
Detailed information about the data with the number of columns, type of the column, and number of null entries in each column.

In [None]:
papers.info()

papers.isna().sum()

In [None]:
authors.info()

authors.isna().sum()

#### Handling the missing Values.
Since we are building a topic model on the abstract column, rows with a null abstract are of no use for training the model. Because a missing abstract cannot be meaningfully replaced by any other value, it is better to drop the rows that have no abstract.

In [None]:
papers_with_abstract = papers.dropna(subset = ["abstract"])

In [None]:
papers_with_abstract.info()

papers_with_abstract.isna().sum()

In [None]:
papers_with_abstract.head()

# Preprocess the Data
  Follow these simple steps to prepare the raw text so that the model gives better results.  
+ Bringing all the text to lower case.
+ Keeping only the words in the text by removing all punctuation and non-alphabetic characters.
+ Removing English stop words.
+ Using the Porter stemmer to reduce each word to its stem.


In [None]:
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()
def preprocess_text(text):
    """Turn a raw text string into a list of stemmed tokens.

    The text is lower-cased, every character other than an ASCII letter or a
    space is replaced by a space, words found in ``stop_words`` are dropped,
    and each remaining word is stemmed with ``porter_stemmer``.

    Returns the list of stems in their original order.
    """
    letters_only = re.sub("[^a-z A-Z]", ' ', text.lower())
    # After the substitution spaces are the only separators left, so a plain
    # split() yields exactly the non-empty words.
    return [
        porter_stemmer.stem(token)
        for token in letters_only.split()
        if token not in stop_words
    ]

In [None]:
processed_abstract = papers_with_abstract["abstract"].map(preprocess_text)

In [None]:
processed_abstract

# Training the Model
Using gensim to build a Latent Dirichlet Allocation (LDA) topic model and find the top words of the topic to which each document belongs.

In [None]:
# !pip install gensim
from gensim import corpora, models, similarities

Prepare the token dictionary out of abstract text, which is used later in creating the Bag of Words corpus for the model.

In [None]:
# Build the token dictionary from the tokenised abstracts.
token_dict = corpora.Dictionary(processed_abstract)
print(token_dict)  # dictionary before filtering
# Prune the dictionary in place with the no_below / no_above / keep_n limits
# given here; the second print shows the result so the two can be compared.
token_dict.filter_extremes(no_below=5, no_above=0.2, keep_n=110000)
print(token_dict)  # dictionary after filtering

In [None]:
Bow_corpus = [token_dict.doc2bow(doc) for doc in processed_abstract]
# print(Bow_corpus)

In [None]:
# sam = Bow_corpus[1]
# for i in range(len(sam)):
#     print(my_dict[sam[i][0]], sam[i][1])
# tfidf = models.TfidfModel(Bow_corpus)

#### LDA Model with 10 topics

In [None]:
lda_model = models.LdaMulticore( corpus= Bow_corpus,
                                num_topics=10,
                                id2word= token_dict,
                                random_state = 1,
                                passes= 2,
                                workers = 2)

In [None]:
doc_topics = [lda_model[c] for c in Bow_corpus]
len(doc_topics) == len(processed_abstract)

In [None]:
# Promote the tokenised abstracts to a DataFrame and add two placeholder
# columns (filled with NaN) for the dominant topic and its probability.
processed_abstract = pd.DataFrame(processed_abstract).assign(
    high_probable_topic=np.nan,
    topic_probability=np.nan,
)
processed_abstract.head()

Mapping each document to its most probable topic, along with that topic's probability.

In [None]:
from operator import itemgetter

# For every document pick the (topic_id, probability) pair with the largest
# probability.
dominant_topics = [max(doc, key=itemgetter(1)) for doc in doc_topics]

# Write both columns in one shot, addressed by name, instead of one scalar
# .iloc write per row and per column. NumPy arrays are assigned positionally,
# so the non-contiguous index of `processed_abstract` does not matter.
# dtype=float keeps the columns as float64, exactly as the NaN-initialised
# placeholders produced before.
processed_abstract["high_probable_topic"] = np.array(
    [topic_id for topic_id, _ in dominant_topics], dtype=float
)
processed_abstract["topic_probability"] = np.array(
    [probability for _, probability in dominant_topics], dtype=float
)

In [None]:
processed_abstract.head()

Combine the most probable topic and its probability with the papers data to get the final result.

In [None]:
processed_abstract.columns = ["p_abstract", "high_probable_topic", "topic_probability"]
result = pd.concat([papers_with_abstract, processed_abstract], axis = 1, join= "inner" )
result.head()

Have a look at the top words in each topic.

In [None]:
# Request every topic of the model explicitly. Without num_topics,
# show_topics() returns at most its default number of topics, which only
# happens to cover the whole model while the model has 10 topics; later cells
# index `topics` by topic id and merge on the topic id, so all topics are needed.
topics = lda_model.show_topics(num_topics=lda_model.num_topics, formatted=False)

# One row per topic: its id and the list of (word, probability) pairs.
topics_df = pd.DataFrame(topics)
topics_df.columns = ["topic", "top_words"]
# Keep only the words, dropping the per-word probabilities.
topics_df["top_words"] = topics_df["top_words"].apply(
    lambda word_probs: [word for word, _ in word_probs]
)
topics_df.head()

In [None]:
# Look up the top words of each paper's most probable topic, keep only the
# paper id and those words, and save the table to the working directory.
output_df = (
    result[["source_id", "high_probable_topic"]]
    .merge(
        topics_df,
        how="left",
        left_on="high_probable_topic",
        right_on="topic",
    )[["source_id", "top_words"]]
)
output_df.to_csv("/kaggle/working/output.csv", index=False)
output_df

In [None]:
from wordcloud import WordCloud
import matplotlib.colors as mcolors

def plot_wordcloud(topic):
    """Draw a word cloud for one topic.

    ``topic`` is anything ``dict()`` accepts as word -> weight pairs; the
    weights are passed to the word cloud as frequencies.
    """
    frequencies = dict(topic)
    word_cloud = WordCloud(
        prefer_horizontal=1.0,
        height=640,
        width=640,
        background_color="white",
    )
    word_cloud.generate_from_frequencies(frequencies, max_font_size=300)

    # Show the rendered cloud as an image without axes or margins.
    plt.figure(figsize=[12, 10])
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.show()
    
def plot_wordcloud_abs(abstract):
    """Draw a word cloud built from the raw text of one abstract.

    ``abstract`` is passed to ``WordCloud.generate_from_text``; the words in
    ``stop_words`` are given to the word cloud as its stop-word set.
    """
    word_cloud = WordCloud(
        prefer_horizontal=1.0,
        stopwords=stop_words,
        height=1080,
        width=1366,
        background_color="white",
    )
    word_cloud.generate_from_text(abstract)

    # Show the rendered cloud as an image without axes or margins.
    plt.figure(figsize=[15, 10])
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.show()

# No_of docs per topic distribution
def plot_topic_dist(data):
    """Plot a bar chart of the number of documents per topic.

    Parameters
    ----------
    data : dict
        Maps a topic label to the number of documents assigned to it. Bars are
        drawn in the dictionary's iteration order and labelled with its keys.
    """
    # Materialise the dict views as lists: a bare dict_keys / dict_values view
    # is not a sequence, and matplotlib does not reliably coerce it to one.
    labels = list(data.keys())
    counts = list(data.values())
    x_pos = np.arange(len(labels))

    plt.figure(figsize=[12, 8])
    plt.bar(x_pos, counts, color="blue")
    plt.title('Number of Documents per Topic')
    plt.xlabel('Topics')
    plt.ylabel('Number of Documents')

    # Label each bar with its topic.
    plt.xticks(x_pos, labels)

    plt.show()

# Visualizing the Results

##### Distribution of the topics over the Documents.

In [None]:
plot_topic_dist(result.high_probable_topic.value_counts().to_dict())

##### Word Cloud for the top words in the topic. 

In [None]:
# Word Cloud for the Topic 8
plot_wordcloud(topics[8][1])

#### Word Cloud to Highlight most used words in Abstract.

In [None]:
# Word Cloud for the abstract of the first paper.
# Select the column by name rather than by a hard-coded position so the lookup
# keeps working if the column order of `result` changes.
sam = result["abstract"].iloc[0]
plot_wordcloud_abs(sam)