In this notebook we use the NIPS papers dataset to identify the hottest machine learning topics.

In [None]:
import os
# Path to the NIPS papers CSV, relative to the notebook's working directory
# (the ../input/ layout looks like a Kaggle dataset mount — TODO confirm).
file_path = "../input/nips-papers/papers.csv"

In [None]:
import pandas as pd
# Read the papers CSV at file_path into a DataFrame.
data = pd.read_csv(file_path)
# Last expression of the cell: show the first 4 rows as a quick sanity check.
data.head(4)

In [None]:
# Drop columns that the cells below never read.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `data`, a second execution would otherwise raise KeyError for the
# columns that are already gone.
data = data.drop(columns=['id', 'event_type', 'pdf_name'], errors='ignore')
data.head(4)

Papers published per year

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Number of papers for each publication year.
counts = data.groupby('year').size()
# Give the chart a title and axis labels so it can be read on its own; the
# trailing ';' keeps the matplotlib object repr out of the cell output.
ax = counts.plot(kind='bar', title='Papers published per year')
ax.set_xlabel('year')
ax.set_ylabel('number of papers');

Data cleaning

In [None]:
import re
# Peek at the raw titles before cleaning.
print(data['title'].head())

# Strip the punctuation characters , . ! ? and lowercase every title.
# The pattern is a raw string so it contains no invalid string escapes
# (a backslash before '!' in a normal string literal triggers a warning on
# modern Python), and the vectorized .str accessors replace the per-row lambda.
data['title_cleaned'] = (
    data['title']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)

# Same rows after cleaning, for comparison with the output above.
print(data['title_cleaned'].head())

Word Cloud

In [None]:
from wordcloud import WordCloud
# Concatenate every cleaned title into one space-separated string.
long_string = ' '.join(data['title_cleaned'])

# Build a word cloud from that text using WordCloud's default settings.
wordcloud = WordCloud().generate(long_string)
# Last expression of the cell, so its value is what the notebook displays.
wordcloud.to_image()

Tokenization using scikit-learn's CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
def plot_10_most_common_words(count_data, count_vectorizer):
    """Draw a bar chart of the 10 most frequent vocabulary words.

    Parameters
    ----------
    count_data : sparse matrix
        Document-term counts. Assumed to be laid out as
        (n_documents, n_vocabulary), i.e. what ``count_vectorizer.fit_transform``
        returned, so that each column total is one word's corpus frequency.
    count_vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()``
        (scikit-learn >= 1.0) or the legacy ``get_feature_names()``.

    Returns
    -------
    None
        The chart is drawn with matplotlib and shown via ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for old installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # One sparse column sum instead of densifying the matrix row by row.
    # np.asarray(...).ravel() flattens both the 1xN np.matrix returned by
    # sparse matrices and the 1-D result returned by sparse arrays.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    # sorted() is stable, so words with equal counts keep vocabulary order.
    top_pairs = sorted(zip(words, total_counts),
                       key=lambda pair: pair[1], reverse=True)[:10]
    words = [word for word, _ in top_pairs]
    counts = [count for _, count in top_pairs]
    x_pos = np.arange(len(words))

    plt.bar(x_pos, counts, align='center')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.title('10 most common words')
    plt.show()

# Bag-of-words vectorizer that filters scikit-learn's built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words='english')
# Fit the vectorizer on the cleaned titles and keep the resulting count matrix;
# both names are reused by the LDA cell further down.
count_data = count_vectorizer.fit_transform(data['title_cleaned'])
# Chart the most frequent remaining words.
plot_10_most_common_words(count_data,count_vectorizer)

Using Latent Dirichlet Allocation (LDA) to identify the hottest topics

In [None]:
import warnings
# Ignore every DeprecationWarning from here on. Note this filter is
# process-wide, not scoped to this cell, and it does not affect other
# warning categories such as FutureWarning.
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``; each row is treated as
        one topic's per-word weights, indexed like the vectorizer vocabulary.
    count_vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()``
        (scikit-learn >= 1.0) or the legacy ``get_feature_names()``.
    n_top_words : int
        How many words to print per topic, highest weight first.

    Returns
    -------
    None
        Output is written to stdout: a blank line, a ``Topic #k:`` header,
        then the words separated by single spaces.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for old installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(words[i] for i in top_indices))
        
# Tweak the two parameters below (use int values below 15)
number_topics = 10
number_words = 10
# Fixed seed: LDA fitting is randomized, so without it the topics printed
# below change on every run and the notebook's output cannot be reproduced.
lda_random_state = 42

# Create and fit the LDA model on the title count matrix
lda = LDA(n_components=number_topics, random_state=lda_random_state)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)