In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import re
import wordcloud
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the read-only Kaggle input directory and echo the full
# path of every file it contains, one per line.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<img src='https://aws1.discourse-cdn.com/standard11/uploads/ainetwork/optimized/1X/b6b7a4a8e9603919a4dbace78c13e9ec92c3dc7d_2_689x492.png' />
<h1>Project Description</h1>
The NIPS conference (Neural Information Processing Systems) is one of the most prestigious yearly events in the machine learning community. At each NIPS conference, a large number of research papers are published. Over 50,000 PDF files were automatically downloaded and processed to obtain a dataset on various machine learning techniques. These NIPS papers are stored in datasets/papers.csv. The CSV file contains information on the different NIPS papers that were published from 1987 until 2017 (30 years!). These papers discuss a wide variety of topics in machine learning, from neural networks to optimization methods and many more.

# Load the dataset.

- Import the pandas library.
- Load the papers.csv file (on Kaggle it lives at ../input/nips-papers/papers.csv) and assign it to the papers variable.
- Print the first rows of the DataFrame with the head method to verify the file was loaded correctly.

In [None]:
# Read the NIPS papers table from the Kaggle input directory, then preview
# the first rows to confirm the file was parsed.
papers_csv_path = '../input/nips-papers/papers.csv'
papers = pd.read_csv(papers_csv_path)
papers.head()

In [None]:
# Print pandas' summary of the freshly loaded DataFrame (DataFrame.info).
papers.info()

# Preparing the data for analysis
- Remove the id, event_type and pdf_name columns.
- Print the first rows of the DataFrame with the head method.

In [None]:
# Drop the metadata columns that the text analysis does not use.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
papers = papers.drop(columns=['id', 'event_type', 'pdf_name'], errors='ignore')

In [None]:
# Preview the first rows again to check which columns remain.
papers.head()

# Plotting how machine learning has evolved over time
- Group the papers by year.
- Count the number of papers per group (i.e. per year).
- Visualise these counts per year in a bar plot.

In [None]:
# Count the papers published in each year and show the trend as a bar chart.
groups = papers.groupby(['year'])
counts = groups.size()

# Keep the Axes handle so the figure can be labelled; a chart without axis
# labels or a title cannot be read on its own when the notebook is skimmed.
ax = counts.plot(kind='bar', figsize=(10, 10))
ax.set_xlabel('Year')
ax.set_ylabel('Number of papers')
ax.set_title('NIPS papers published per year')
# Ending with plt.show() renders the figure without echoing the Axes repr.
plt.show()

# Preprocessing the text data
- Load the regular expression library (re).
- Remove punctuation from the titles with a regular expression.
- Convert the titles to lowercase using a "map" operation.
- Print the processed titles to verify the results.


In [None]:
# Show the raw titles so they can be compared with the cleaned ones below.
print(papers['title'].head())

# Remove punctuation (commas, periods, exclamation and question marks).
# The pattern is a raw string so that "\." is not treated as an invalid
# string escape by newer Python versions.
papers['title_processed'] = papers['title'].map(lambda sentence: re.sub(r'[,\.!?]', '', sentence))

# Convert the punctuation-free titles to lowercase.  This step must read from
# 'title_processed': starting again from 'title' would silently throw away
# the punctuation removal performed just above.
papers['title_processed'] = papers['title_processed'].map(lambda sentence: sentence.lower())

print(papers['title_processed'].head())

# A word cloud to visualize the preprocessed text data
Transform the data and create a word cloud.
- Load the wordcloud library.
- Convert all the processed titles to a single string.
- Create a WordCloud object.
- Generate a word cloud.

In [None]:
# Concatenate all processed titles into one string.  The titles are joined
# with a space; joining with '' would fuse the last word of each title with
# the first word of the next one and put made-up words into the cloud.
long_string = ' '.join(papers['title_processed'])

# Build the word cloud from the combined text and render it as an image.
wc = wordcloud.WordCloud(width=800, height=400)
wc.generate(long_string)
wc.to_image()

# Prepare the text for LDA analysis
The main text analysis method that we will use is latent Dirichlet allocation (LDA). LDA is able to perform topic detection on large document sets, determining what the main 'topics' are in a large unlabeled set of texts. A 'topic' is a collection of words that tend to co-occur often. The hypothesis is that LDA might be able to clarify what the different topics in the research titles are. These topics can then be used as a starting point for further analysis.

LDA does not work directly on text data. First, it is necessary to convert the documents into a simple vector representation. This representation will then be used by LDA to determine the topics. Each entry of a 'document vector' will correspond with the number of times a word occurred in the document. In short, we will convert a list of titles into a list of vectors, all with length equal to the vocabulary. For example, 'Analyzing machine learning trends with neural networks.' would be transformed into [1, 0, 1, ..., 1, 0].

We'll then plot the 10 most common words based on the outcome of this operation (the list of document vectors). As a check, these words should also occur in the word cloud.
- Create a CountVectorizer object with the stop_words='english' argument to remove meaningless words.
- Fit and transform the processed titles with the fit_transform method. Save the results in the count_data variable.
- Plot the most common words with the helper function (`plot_10_most_common_words`).


In [None]:
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    """Draw a bar chart of the ten most frequent words in a document-term matrix.

    Parameters
    ----------
    count_data : matrix of word counts
        One row per document and one column per vocabulary word, as returned
        by ``count_vectorizer.fit_transform``.
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary; column ``i`` of ``count_data`` belongs to the
        ``i``-th feature name.

    The chart is shown with ``plt.show()``; nothing is returned.
    """
    # get_feature_names() was removed in scikit-learn 1.2.  Use its
    # replacement when available and fall back to the old name otherwise.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    word_list = get_names()

    # Sum every column in a single call instead of densifying the matrix one
    # row at a time; np.asarray(...).ravel() flattens the 1 x n result.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    # Keep the ten (word, count) pairs with the highest totals.
    top_words = sorted(zip(word_list, total_counts), key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in top_words]
    counts = [w[1] for w in top_words]
    x_pos = np.arange(len(words))

    plt.bar(x_pos, counts, align='center')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.title('10 most common words')
    plt.show()

# Initialise the count vectorizer with scikit-learn's built-in English
# stop-word list so that filler words are left out of the vocabulary.
count_vectorizer = CountVectorizer(stop_words='english')

# Fit the vocabulary on the processed titles and transform them into word
# counts.  Both count_data and count_vectorizer are reused by the LDA cell
# further down.
count_data = count_vectorizer.fit_transform(papers['title_processed'])

# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

# Analysing trends with LDA
*number_topics* defines the total number of topics in the LDA model. <br>
*number_words* is only for debugging purposes. It is the number of words that will be printed for each topic. For each topic, the most important words for the topic are selected.

In [None]:
import warnings
# Ignore DeprecationWarning messages from here on; other warning categories
# are not affected by this filter.
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_``, with one row of word weights per topic.
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary; weight ``i`` of a topic belongs to the
        ``i``-th feature name.
    n_top_words : int
        Number of words to print for each topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2.  Use its
    # replacement when available and fall back to the old name otherwise.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = get_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(words[i] for i in top_indices))
        
# Tweak the two parameters below (use int values below 15)
number_topics = 10
number_words = 10

# Create and fit the LDA model.  LDA starts from a random initialisation, so
# the seed is fixed to make the discovered topics identical on every
# "Restart & Run All".
lda = LDA(n_components=number_topics, random_state=0)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

# The future of machine learning
Machine learning has become increasingly popular over the past years. The number of NIPS conference papers has risen exponentially, and people are continuously looking for ways to incorporate machine learning into their products and services.

Although this analysis focused on analyzing machine learning trends in research, a lot of these techniques are rapidly being adopted in industry. Following the latest machine learning trends is a critical skill for a data scientist, and it is recommended to continuously keep learning by going through blogs, tutorials, and courses.