In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install a pinned spaCy release (3.1.1) and download the small English pipeline
# (`en_core_web_sm`) that is loaded with `spacy.load` at the end of the notebook.
!pip install spacy==3.1.1
!python -m spacy download en_core_web_sm

import spacy
# Display the imported version so the reader can confirm which release is in use.
spacy.__version__

# File cleaning

First, let's open the list of papers and authors.

In [None]:
# Load papers.csv from the Kaggle NeurIPS dataset into a DataFrame.
papers = pd.read_csv("/kaggle/input/nips-papers-1987-2019-updated/papers.csv")
# Preview the first rows to see which columns are available.
papers.head()

In [None]:
# Load authors.csv from the same dataset into a DataFrame.
authors = pd.read_csv("/kaggle/input/nips-papers-1987-2019-updated/authors.csv")
# Preview the first rows to see which columns are available.
authors.head()

So far, some values are missing from both CSVs. The author dataset is missing a number of institutions, and not all abstracts are present in the paper dataset.

In [None]:
# Check the NaNs
def show_na_proportion(df, name="df"):
    """Print the percentage of missing values per column and report which columns have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str, default "df"
        Label inserted in the printed header line.

    Returns
    -------
    pandas.Index
        Labels of the columns that contain at least one missing value.
    """
    # Count the missing values once and reuse the result for both outputs.
    na_counts = df.isna().sum()
    na_percent = (na_counts / df.shape[0]).mul(100).round(2)

    print(f"Percentage of NaNs for {name}:")
    print(na_percent)
    return df.columns[na_counts > 0]

# Print the NaN percentages of both raw frames and keep the labels of the
# columns that contain at least one missing value.
na_col_papers = show_na_proportion(papers, name="papers")
na_col_authors = show_na_proportion(authors, name="authors")

The full text is present for almost every paper. However, 35% of abstracts are missing. This shouldn't impact the analysis as these are present inside the full text. We can thus omit this column.

However, more than 40% of institutions are missing. We can assume the institution either isn't reported, or the paper isn't affiliated to any institution at all.

In [None]:
# Distinct values of the institution column (a missing value counts as one distinct entry).
author_institution_unique_vals = authors["institution"].unique()
n_unique_institutions = author_institution_unique_vals.size

print(f"Number of unique institution values: {n_unique_institutions}")
print("for values:", author_institution_unique_vals)

Let's deal with those missing values...

In [None]:
from sklearn.impute import SimpleImputer

# Replace every missing value with an explicit sentinel string.
# `DataFrame.fillna` leaves columns without missing values untouched, so their
# dtypes (and the column labels / index) are preserved. Routing the mixed-type
# frames through SimpleImputer instead returns a bare array that has to be
# re-wrapped by hand and, for mixed columns, ends up as object dtype everywhere.
NO_DATA = "NO DATA"

# The abstract column is dropped before filling; only the remaining columns are kept.
no_na_papers = papers.drop(columns='abstract').fillna(NO_DATA)
no_na_authors = authors.fillna(NO_DATA)

# Sanity check: both reports should now show 0% missing values.
show_na_proportion(no_na_papers, name="papers (no NaN)")
show_na_proportion(no_na_authors, name="authors (no NaN)")

# Author analysis

One might look at the relationships between:
- The papers and the institution
- The authors and the institution
- The number of papers per author

This analysis might help us identify the big shots of Deep Learning research.

To avoid any confusions between the names, we'll create a column titled `full_name`, where the first and last names are combined.

In [None]:
# Build "<first_name> <last_name>" with a single vectorised string concatenation
# instead of calling a Python lambda once per row (`apply(..., axis=1)`).
no_na_authors["full_name"] = no_na_authors["first_name"] + " " + no_na_authors["last_name"]
no_na_authors.head()

Now we can see the top 10 of the authors who contributed the most in NeurIPS articles:

In [None]:
import matplotlib.pyplot as plt

# The ten most frequent full names, i.e. the authors with the most rows in the authors table.
top_10_contributors = no_na_authors["full_name"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(top_10_contributors.index, top_10_contributors.values)
ax.set_xlabel("Authors")
ax.set_ylabel("N° of contributions")

# Tilt the author names so they do not overlap.
tick_labels = ax.get_xticklabels()
plt.setp(tick_labels, rotation=45, ha="right", rotation_mode="anchor")
plt.show()

As we can see, Michael Jordan is by far the author who contributed the most. A quick look at the data shows that UC Berkeley is among the institutions listed on the publications he appears in.

The same goes for the rest of the top 10: they all work at prestigious schools and corporations like Cambridge, Montreal or Google.

In [None]:
# Institutions listed on any row belonging to one of the top-10 authors.
# A single boolean mask replaces ten separate full-frame scans, and `unique()`
# keeps first-appearance order, so the printed list is the same on every run
# (a `set` of strings is ordered differently from one Python process to the next).
is_top_10_author = no_na_authors.full_name.isin(top_10_contributors.index)
top_10_auth_inst = no_na_authors.loc[is_top_10_author, "institution"].unique().tolist()

print("List of institutions from the top 10:", top_10_auth_inst)

In [None]:
# Sentinel used by the imputation cell for rows without an institution.
NO_INSTITUTION = "NO DATA"

institution_counts = no_na_authors.institution.value_counts()

# Left panel: rows without an institution versus ALL rows with one.
# (Summing only the ten largest institutions would under-report the
# "Referenced" bar, and relying on the sentinel being ranked first is fragile.)
n_not_referenced = institution_counts.get(NO_INSTITUTION, 0)
n_referenced = institution_counts.sum() - n_not_referenced

# Right panel: the ten most frequent real institutions.
top_10_institutions = institution_counts.drop(NO_INSTITUTION, errors="ignore").head(10)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10,10))

ax1.bar(["Not referenced", "Referenced"], [n_not_referenced, n_referenced])
ax1.set_xlabel("Institutions")
ax1.set_ylabel("N° of authors")

ax2.bar(top_10_institutions.index, top_10_institutions.values)
ax2.set_xlabel("Institutions")
ax2.set_ylabel("N° of authors")
plt.setp(ax2.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()

The top 10 of institutions seems to corroborate that fact.

# NeurIPS Papers

Each paper so far has a title, a year of publication and a text content. The first thing we can notice so far is how fast interest for Deep Learning research evolves between yearly congresses:

In [None]:
# Number of papers for each publication year.
papers_per_year = no_na_papers.year.value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(papers_per_year.index, papers_per_year.values)
ax.set_title("Number of publications per year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of publications")
# `plt.show()` renders the figure; the original `plt.plot()` drew nothing and
# only left an empty list as the cell output.
plt.show()

The graph reveals that interest in deep learning grows slowly at first, then rises rapidly in the second half of the 2010s, most likely because ML and data science generated a wave of hype thanks to the biggest accomplishments of neural networks.

Out of interest, we can pick the most recent article Michael Jordan wrote or contributed to:

In [None]:
# Rows of the authors table that belong to Michael Jordan.
jordan_rows = no_na_authors[no_na_authors.full_name == "Michael Jordan"]

# Attach the paper columns through `source_id`, then keep the row with the
# largest `year` value (despite the variable name, this is the latest entry).
papers_by_source_id = no_na_papers.set_index('source_id')
first_article = (
    jordan_rows
    .join(papers_by_source_id, how='inner', on='source_id')
    .sort_values('year', ascending=False)
    .head(1)
)
first_article

There are several articles from 2019. Let's pick this one and see what the title and full text look like. The text is quite long, so let's limit the output to 5,000 characters (the first 2,500 and the last 2,500).

In [None]:
# Pull the title and body out of the single selected row.
title, text = first_article.title.values[0], first_article.full_text.values[0]

print(title)
# Show only the first and the last 2500 characters, separated by an ellipsis line.
print("\n".join([text[:2500], "...", text[-2500:]]))

According to the description, the dataset was scraped from the HTML version, which explains why the text has more spaces than needed. On the other hand, it contains a wealth of information:

* The title
* Details about the authors
* The abstract
* The different sections of an article, from the introduction to the conclusion
* A list of references

In a scientific article, the title and the abstract are the most important parts for discovering a topic. An abstract must contain a brief summary of the article describing the problem, the method used, the dataset, and a summary of the final results. Meanwhile, a title is a way to promote the article using **keywords**.

In [None]:
# Lower-case every word of the title, skip the word "of" and strip colons.
kw_sample = []
for word in title.split():
    lowered = word.lower()
    if lowered != "of":
        kw_sample.append(lowered.replace(":", ""))
kw_sample

# Sorting keywords

Using just a word counter, we can encapsulate in a dataframe the number of relevant words per title. First we're going to determine the word quantity among these documents.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over the paper titles (single words, default tokenisation).
vectorizer = CountVectorizer()
title_vectorized = vectorizer.fit_transform(no_na_papers.title)

# `get_feature_names` is absent from recent scikit-learn releases, which only
# provide `get_feature_names_out`; older releases only have the former.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

feature_count = pd.DataFrame({
    "feat_name": feature_names,
    # Column sums taken directly on the sparse matrix: same totals as
    # `.toarray().sum(axis=0)` without materialising a dense documents x terms array.
    "feat_count": np.asarray(title_vectorized.sum(axis=0)).ravel(),
})

feature_count.sort_values(by='feat_count', ascending=False).head(20)

In [None]:
from wordcloud import WordCloud
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 10))

# Only limit the plot to the nth entry
n_entries = 75

vals_to_display = feature_count.sort_values(by='feat_count', ascending=False).head(n_entries)

ax2.bar(x=vals_to_display.feat_name.values, height=vals_to_display.feat_count.values)
ax2.set_xlabel("Word")
ax2.set_ylabel("Occurrences in titles")

# Size each word by its real number of occurrences in the titles.
# Feeding the joined vocabulary to `generate` would count every word exactly
# once, so the cloud would not reflect the frequencies shown in the bar chart.
word_frequencies = dict(zip(feature_count.feat_name, feature_count.feat_count))
wc = WordCloud(background_color="white")
wc.generate_from_frequencies(word_frequencies)

ax1.imshow(wc, interpolation="bilinear")
ax1.axis("off")

plt.setp(ax2.get_xticklabels(), rotation=45, ha="right",rotation_mode="anchor")
plt.show()

As we can see, the most common terms are either stop words or terms specific to Deep Learning. In the top 10, we can see, aside from stop words, that words like "deep", "neural" and "networks" come up often in the titles. Given the subject matter, one can deduce that these combine into the following terms:

* Neural Networks
* Deep Neural Networks
* Neural Networks model
* etc...

As such, it's more appropriate to use a 2 or 3-gram count vectorizer.

In [None]:
# Words ignored when building title n-grams.
title_sw = ["for", "of", "and", "with", "in", "the", "to", "on", "from", "via", "by", "an"]

# One frame per n-gram size (1, 2 and 3), concatenated at the end.
ngram_frames = []

for i_gram in range(1,4):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(i_gram, i_gram), stop_words=title_sw)
    title_vectorized = vectorizer.fit_transform(no_na_papers.title)

    # Works on both old (`get_feature_names`) and recent
    # (`get_feature_names_out`) scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    ngram_frames.append(pd.DataFrame({
        "feat_name": feature_names,
        # Sparse column sums: avoids building a dense documents x n-grams
        # array, which becomes very large for 2- and 3-grams.
        "feat_count": np.asarray(title_vectorized.sum(axis=0)).ravel(),
        "feat_n_gram": i_gram,
    }))

feature_count = pd.concat(ngram_frames, ignore_index=True)
feature_count.sort_values(by='feat_count', ascending=False).head(20)

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))
tab_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 
              'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']

# Only limit the plot to the nth entry
n_entries = 25

vals_to_display = feature_count.sort_values(by='feat_count', ascending=False).head(n_entries)

# One colour per n-gram size: 1-grams use the first entry, 2-grams the second, ...
bar_colors = [tab_colors[n_gram - 1] for n_gram in vals_to_display.feat_n_gram]
ax1.bar(
    x=vals_to_display.feat_name.values,
    height=vals_to_display.feat_count.values,
    color=bar_colors,
)

# Share of distinct terms contributed by each n-gram size.
n_gram_shares = feature_count.feat_n_gram.value_counts(ascending=True)
n_gram_shares.plot.pie(ax=ax2)

plt.setp(ax1.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

plt.show()

The code so far shows that 3-grams (more specific terms) make up the largest share of distinct terms, while single-word terms occur most frequently. The trick with scientific articles is estimating the right n-gram size, as terms in research can get very specific, especially when it comes to presenting a variation of an already established method. Some titles use acronyms, but these are rare.

In [None]:
# Print the 50 most frequent terms for each n-gram size.
# After the loop, `top_50` still holds the 3-gram ranking (last iteration).
for i_gram in range(1,4):
    top_50 = feature_count[feature_count.feat_n_gram == i_gram].sort_values(by='feat_count', ascending=False).head(50)
    # The header previously announced "Top 20" although 50 terms are listed.
    print(f"Top 50 of the most common {i_gram}-gram words:\n{', '.join(top_50.feat_name.tolist())}\n")

From the top 50 words, we can deduce that a 1-gram search groups together common scientific terms like "kernel", "variational" or "reinforcement", without specifying exactly the kind of topic we would find, while increasing the n-gram size narrows down the topic. 

We can see that selecting "deep reinforcement learning" would lead us to articles talking about Q-learning and its variants, while "reinforcement" alone could not only mean "reinforcement learning", but also imply that a DL model can be reinforced.

We can always check the diversity of 3-gram words per year, which would show a constant evolution of the Deep Learning field:

In [None]:
# 3-gram vocabulary over the titles; later cells look terms up in this vectorizer.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(3, 3), stop_words=title_sw)

X = vectorizer.fit_transform(no_na_papers.title)

# Work on an explicit copy so that adding a column below can neither touch
# `no_na_papers` nor trigger a SettingWithCopyWarning.
diversity_papers = no_na_papers[["title", "year"]].copy()

# For every title (row i of X), the list of 3-gram column indices it contains.
# Despite its name, `n_themes` holds index lists, not counts.
diversity_papers["n_themes"] = [
    X[row].nonzero()[1].tolist() for row in range(X.shape[0])
]

diversity_papers.head()


In [None]:
# Number of distinct 3-grams seen in the titles of each year.
# Aggregating the `n_themes` column directly avoids `groupby().sum()`, which
# concatenates every list (and every title string) by repeated addition.
yr_div_evolution = diversity_papers.groupby('year')['n_themes'].agg(
    lambda theme_lists: len({idx for themes in theme_lists for idx in themes})
)
fig, ax = plt.subplots(figsize=(12, 6))

yr_div_evolution.plot(ax=ax)
ax.set_title("Evolution of theme diversity through the years")
ax.set_xlabel("Year")
ax.set_ylabel("Distinct 3-grams in titles")
plt.show()

As expected, the further research in Deep Learning progresses, the more refined the knowledge of the field becomes. Notice the exponential growth from 2015 onwards, when AI became a hype trend.

We can also observe the increase and decrease in themes according to the year, when desired words started to appear. Let's try with "graph neural networks":

In [None]:
def trace_word_popularity(word):
    """Plot, per year, how many titles contain the given 3-word term.

    Relies on the notebook-level `vectorizer` (fitted on 3-grams) and on
    `diversity_papers`, whose `n_themes` column lists the 3-gram indices
    found in each title.

    Parameters
    ----------
    word : str
        Lower-case term made of exactly three space-separated words.

    Raises
    ------
    Exception
        If `word` is not three words long, or if it is not part of the
        fitted vocabulary.
    """
    # Check the format first: the vocabulary only holds 3-grams, so a malformed
    # term would otherwise always be reported as "not found" and the more
    # helpful message below could never be shown.
    if len(word.split()) != 3:
        raise Exception("Must be a 3 word long token separated by spaces.")

    # Direct dictionary lookup instead of comparing against every feature name.
    word_idx = vectorizer.vocabulary_.get(word)
    if word_idx is None:
        raise Exception("Couldn't find the word you were looking for.")

    # Each title lists a given index at most once, so counting the titles that
    # contain it equals counting its occurrences in the year's merged list.
    count_df = diversity_papers.groupby('year')['n_themes'].agg(
        lambda theme_lists: sum(word_idx in themes for themes in theme_lists)
    )

    fig, ax = plt.subplots(figsize=(12, 6))

    count_df.plot(ax=ax)
    ax.set_title(f"Evolution of '{word}' through the years")
    plt.show()
    


# Example: yearly title counts for one 3-word term.
test_word = "graph neural networks"
trace_word_popularity(test_word)

Nothing before 2016, which makes sense since GNNs are a recent type of deep neural network.

In [None]:
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap, ListedColormap

def make_segmented_cmap(cmap_name, n_segments):
    """Return `n_segments` colours sampled from a named Matplotlib colormap.

    Parameters
    ----------
    cmap_name : str
        Name of a registered Matplotlib colormap.
    n_segments : int
        Number of colours requested.

    Returns
    -------
    Sequence of colours that can be indexed from 0 to `n_segments - 1`
    (the colormap evaluated at each integer for a segmented colormap, or
    its `colors` attribute otherwise).
    """
    # `matplotlib.cm.get_cmap` no longer exists in recent Matplotlib releases;
    # `pyplot.get_cmap` accepts the same (name, lut) arguments.
    get_cmap = getattr(cm, "get_cmap", None) or plt.get_cmap
    cmap = get_cmap(cmap_name, n_segments)

    if isinstance(cmap, LinearSegmentedColormap):
        # Evaluate the colormap at each integer position.
        cmap = cmap(range(n_segments))
    else:
        cmap = cmap.colors
    return cmap

def words_trend_3_gram(words, cmap_name='autumn'):
    """Plot, on one figure, the yearly number of titles containing each 3-word term.

    Relies on the notebook-level `vectorizer` (fitted on 3-grams) and on
    `diversity_papers`, whose `n_themes` column lists the 3-gram indices
    found in each title.

    Parameters
    ----------
    words : sequence of str
        Lower-case terms, each made of exactly three space-separated words.
    cmap_name : str, default 'autumn'
        Matplotlib colormap used to give every term its own colour.

    Raises
    ------
    Exception
        If a term is not three words long, or is not part of the fitted
        vocabulary.
    """
    # Validate every term before drawing anything. The format check comes
    # first: the vocabulary only holds 3-grams, so a malformed term would
    # otherwise always be reported as "not found".
    word_indices = []
    for word in words:
        if len(word.split()) != 3:
            raise Exception("Must be a 3 word long token separated by spaces.")
        word_idx = vectorizer.vocabulary_.get(word)
        if word_idx is None:
            raise Exception("Couldn't find the word you were looking for.")
        word_indices.append(word_idx)

    # Loop-invariant work done once: the grouping and the colour palette.
    themes_by_year = diversity_papers.groupby('year')['n_themes']
    cmap = make_segmented_cmap(cmap_name, len(words))

    fig, ax = plt.subplots(figsize=(12, 6))
    for idx, (word, word_idx) in enumerate(zip(words, word_indices)):
        # Each title lists a given index at most once, so counting titles
        # equals counting occurrences.
        count_df = themes_by_year.agg(
            lambda theme_lists, target=word_idx: sum(target in themes for themes in theme_lists)
        )
        count_df.plot(ax=ax, color=cmap[idx], label=word)

    ax.set_title(f"Evolution of {len(words)} terms through the years")
    ax.legend(bbox_to_anchor=(1,1), loc="upper left")
    plt.show()

# Testing on the previous `top_50`: the variable is left over from the loop that
# printed the rankings, so it holds the ranking of the last n-gram size processed
# there. Only its first 15 terms are plotted.
words_trend_3_gram(top_50.feat_name.tolist()[:15], cmap_name='copper')

From the top 15, it's hard to compare how each term fares against the others. No theme seems to be in decline. However, the most popular term, "deep neural networks", is on the rise, mostly because it's the main theme of NeurIPS papers...

# Analyzing paper contents

Last but not least, we'll try and decipher the contents of the full papers within the CSV file. The first thing I'm interested in is seeing the character range between all these papers.

In [None]:
# Cleaning
# Character count of every full text.
paper_length = no_na_papers.full_text.apply(len)

# Summary statistics computed once and reused for the printed report.
length_stats = paper_length.describe()
print(f"Mean number of characters: {length_stats.loc['mean']:.2f}")
print(f"The number of characters goes from {length_stats.loc['min']:.0f}",
      f"to {length_stats.loc['max']:.0f}")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,6))

# Horizontal boxplots: red squares mark the outliers on the left panel,
# an empty marker string hides them on the right panel.
length_values = paper_length.values
ax1.boxplot(length_values, 0, 'rs', 0)
ax1.set_title("Boxplot of full text length")

ax2.boxplot(length_values, 0, '', 0)
ax2.set_title("Boxplot of full text length\n(No outliers)")

plt.show()

This is very strange indeed! The shortest text is 1 character long, while the longest ones exceed 100k characters. If you remove the outliers as indicated by the boxplot, the average article is around 25-35k characters, which is consistent with the average Arxiv article.

## Filter outliers

The shortest articles are less than 10k characters. Let's analyze the top 10 of the shortest texts:

In [None]:
# Print the ten shortest full texts with their label and length.
# `Series.items()` replaces `iteritems()`, which no longer exists in pandas 2.
for i, row in paper_length.sort_values(ascending=True).head(10).items():
    print(f"Text n°{i}, {row} characters")
    # `i` is an index label, so it is looked up with `.loc` rather than `.iloc`.
    print(no_na_papers.full_text.loc[i])

As we can see, the data is either missing or is outright saying gibberish. Probably as a result of the metadata missing or the request failing for a mysterious reason (eg. the hashes aren't registered). I won't display more for space reasons , but if you show a full text at random, you can see some of the shortest documents in the top 20 are indeed real documents that are just slightly shorter than the average article.

Another thing that I've noticed, especially if you open large articles, is the amount of whitespace left in the text, which results in a disproportionate number of characters. Try opening the largest article and see that it has a lot of whitespace in some of its parts. My guess is that the scraper couldn't handle $\LaTeX$ expressions and tables. Figures could also be a cause. 

In some cases, it's most likely due to the paper's styling. Some papers put extra tabs and spaces to delimit author names and affiliations, for example. Another explanation is extra space being left for the columns, although Arxiv papers hardly ever do this.

In [None]:
# Whitespace ratio
import re

def whitespace_ratio(text: str, verbose: bool = False) -> float:
    r"""
    Return the fraction of characters in `text` that are whitespace.

    Whitespace is whatever the regular-expression token `\s` matches.

    Parameters
    ----------
    text : str
        Text to analyse.
    verbose : bool, default False
        When True, also print the whitespace count, the total length and
        the ratio as a percentage.

    Returns
    -------
    float
        Whitespace characters divided by total characters; 0.0 for an
        empty text (instead of raising ZeroDivisionError).
    """
    len_text = len(text)

    # `\s` matches exactly one character, so the number of matches is the
    # number of whitespace characters.
    count_whitespaces = sum(1 for _ in re.finditer(r'\s', text))

    # Guard the division: an empty text has no whitespace share to report.
    ratio = count_whitespaces / len_text if len_text else 0.0

    if verbose:
        print(f"Total whitespaces: {count_whitespaces} / {len_text} ({ratio:.2%})")

    return ratio

# Full text of the longest paper. `idxmax()` returns the index LABEL of the
# maximum length, so it is looked up with `.loc` (label-based) rather than
# `.iloc` (position-based); no full sort is needed to find one maximum.
longest_paper = no_na_papers.full_text.loc[paper_length.idxmax()]

_ = whitespace_ratio(longest_paper, verbose=True)

As we can see, the largest paper so far is just blank space. Only 1.5% of the document is actually exploitable text, that is, if you don't count the equations.

~~*Note: the following cell might take a while to run due to the iterative process. Estimates made with `tqdm` suggest 1h40 of computation. The operation uses multiprocessing to get the job done. If you want to reproduce the same step with your CPU, you can use `os.cpu_count()` on your machine and change `n_cores`.*~~

**ERRATUM:** The miscalculation might be due to an old variable being stuck. You can use apply but I'll let the multiprocess method just to show how you can clean up large walls of text in case you need more power.

In [None]:
from typing import Callable
from multiprocessing import Pool
from functools import partial

# Function to parallelize the series
# Many thanks to Rahul Agarwal for the tip
# https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1
def parallelize_series(s: pd.Series, func: Callable, n_cores: int = 4) -> pd.Series:
    """
    Apply `func` to contiguous chunks of `s` in a pool of worker processes.

    Parameters
    ----------
    s : pd.Series
        Series to process.
    func : Callable
        Picklable callable that receives one chunk (a Series) and returns a
        Series; lambdas cannot be sent to worker processes.
    n_cores : int, default 4
        Number of chunks and of worker processes.

    Returns
    -------
    pd.Series
        The per-chunk results concatenated in the original order.
    """
    # Split positions rather than the Series itself: the chunk sizes are the
    # ones `np.array_split` would produce, but slicing is done by pandas, so
    # each chunk is guaranteed to be a Series carrying its original index.
    position_chunks = np.array_split(np.arange(len(s)), n_cores)
    s_split = [s.iloc[positions] for positions in position_chunks]

    pool = Pool(n_cores)
    try:
        result = pd.concat(pool.map(func, s_split))
    finally:
        # Always release the worker processes, even when `func` raises.
        pool.close()
        pool.join()
    return result

# Using this step because `Pool.map` doesn't picklize lambdas
def transform_series(s: pd.Series, f: Callable) -> pd.Series:
    """
    Apply `f` to every element of `s` through `Series.apply`.

    Being a named module-level function, it can be wrapped with
    `functools.partial` and handed to `Pool.map`.
    """
    transformed = s.apply(f)
    return transformed

# Compute the whitespace ratio of every full text, one chunk per CPU core, and time the call.
# `partial` binds `whitespace_ratio` as the per-element function of `transform_series`.
%time paper_ws_ratio = parallelize_series(no_na_papers.full_text, partial(transform_series, f=whitespace_ratio), n_cores = os.cpu_count())

# Then we sort the values to show the results (the 20 highest ratios)
paper_ws_ratio.sort_values().tail(20)

Let's see the correlation between the number of whitespaces and the number of characters. I expect a positive correlation as the longest papers are full of white spaces.

In [None]:
# Correlation matrix between paper length and whitespace ratio.
interm_result = pd.concat([paper_length.rename('length'), paper_ws_ratio.rename('ws_ratio')], axis=1).corr()

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 14))

# Top-left: length against whitespace ratio on log-log axes.
ax1.scatter(paper_length.values, paper_ws_ratio.values, alpha=0.3)
# Red diagonal drawn in axes coordinates (corner to corner of the panel).
ax1.plot([0, 1], [0, 1], transform=ax1.transAxes, color='red')
ax1.set_xscale('log')
ax1.set_yscale('log')
ax1.set_title("Plot paper length against whitespace ratio")
ax1.set_xlabel("Paper length (characters)")
ax1.set_ylabel("Whitespace ratio")

# Top-right: annotated correlation matrix.
ax2.imshow(interm_result, cmap='copper')
ax2.set_xticks(np.arange(interm_result.shape[1]))
ax2.set_yticks(np.arange(interm_result.shape[1]))
ax2.set_xticklabels(interm_result.columns.values)
ax2.set_yticklabels(interm_result.columns.values)
for i in range(interm_result.shape[1]):
    for j in range(interm_result.shape[1]):
        # The returned artist is not kept: binding it to `text` would overwrite
        # the notebook-level `text` variable set in an earlier cell.
        ax2.text(j, i, interm_result.iloc[i, j].round(4), fontsize=14,
                 ha="center", va="center", color="w")
ax2.set_title("Correlation matrix")

# Bottom row: horizontal boxplots of both variables, outliers as red squares.
ax3.boxplot(paper_length.values, 0, 'rs', 0)
ax3.set_title("Distribution of paper length")

ax4.boxplot(paper_ws_ratio.values, 0, 'rs', 0)
ax4.set_title("Distribution of whitespace ratio")

plt.show()

The correlation coefficient doesn't show a linear tendency between the whitespace ratio and the paper length. Instead, it looks like the scatter plot reveals two clusters in the distribution where the link between paper length and whitespace ratio is above the norm. There's also a small cluster under the red line where the whitespace ratio is negligible, indicating a compact paper. 

The boxplots indicate the whitespace ratio is around 18% and spreads between 15 and 20%, which coincides with the diameter of the blob we see in the top-left graph.

In [None]:
from sklearn.cluster import KMeans

# Two-column frame: paper length and whitespace ratio.
paper_ws = pd.concat([paper_length.rename('length'), paper_ws_ratio.rename('ws_ratio')], axis=1)

# Cluster in log space, matching the log-log axes of the scatter plot.
log_features = paper_ws.apply(np.log)
kmeans = KMeans(n_clusters=2, random_state=1).fit(log_features)

# Label assignment depends on the fitted model; later cells treat cluster 1
# as the "compact text" group.
paper_ws['clusters'] = kmeans.labels_
kmeans_cmap  = ['tab:blue', 'tab:orange']

# Centroids mapped back from log space to the original units.
centers = np.exp(kmeans.cluster_centers_)

fig, ax = plt.subplots(figsize=(10, 10))

for c in range(2):
    members = paper_ws[paper_ws.clusters == c]
    ax.scatter(members.length, members.ws_ratio,
               color=kmeans_cmap[c], alpha=0.25, label=f"Label {c+1}")

ax.scatter(centers[:, 0], centers[:, 1], marker='x', color='red')

# Red diagonal drawn in axes coordinates (corner to corner of the panel).
ax.plot([0, 1], [0, 1], transform=ax.transAxes, color='red')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title("Plot paper length against whitespace ratio")
ax.set_xlabel("Paper length (characters)")
ax.set_ylabel("Whitespace ratio")

ax.legend()
plt.show()

In [None]:
# First five full texts assigned to cluster 1.
# The cluster frame carries index labels, so rows are selected with `.loc`
# (label-based) rather than `.iloc` (position-based).
compact_text = no_na_papers.loc[paper_ws[paper_ws.clusters == 1].index, "full_text"].head(5)

# `Series.items()` replaces `iteritems()`, which no longer exists in pandas 2.
for _, row in compact_text.items():
    print(row[:1000], '\n')

Would you look at that! It seems not only some texts have exclusively white text, but others are at the opposite end of the spectrum: too compact. 

Luckily, the following cell estimates they're in the minority:

In [None]:
# Size of each cluster, reported as a count and as a share of all papers.
n_clusters, n_cluster_counts = np.unique(paper_ws.clusters, return_counts=True)
total_papers = no_na_papers.shape[0]

cluster_sizes = dict(zip(n_clusters, n_cluster_counts))
for cluster, c_count in cluster_sizes.items():
    share = c_count / total_papers
    print(f"Number of elements from cluster {cluster+1}: {c_count} ({share:.2%})")

## Clean whitespaces

Only 0.6% of these texts are the ones where the spaces are missing. Excluding them, along with the first few entries where the full text is unavailable, won't damage the analysis. But what about the other category, where there are too many spaces?

We have to find a way to replace the spaces without damaging the readability. Here's my strategy:

* Replace tabs and spaces by a single space
* Keep the newline characters `\n` but always regroup them so we don't have a spaced out text

Sounds like a reasonable strategy.

In [None]:
# Format spec that centres a section title inside a 120-character rule of '='.
banner_spec = "=^120"
preview_chars = 2000

print("Raw paper")
print(longest_paper[:preview_chars])

# Step 1: collapse runs of spaces, tabs, form feeds and carriage returns into one space.
print("\n" + format('Replace whitespaces by spaces', banner_spec) + "\n")
lp_no_spaces = re.sub(r"[ \t\f\r]+", " ", longest_paper)
print(lp_no_spaces[:preview_chars])

# Step 2: collapse consecutive newlines into a single newline.
print("\n" + format('Keep only one newline', banner_spec) + "\n")
lp_clean = re.sub(r"\n+", "\n", lp_no_spaces)
print(lp_clean[:preview_chars])

# Report how much shorter the text became.
raw_len, clean_len = len(longest_paper), len(lp_clean)
print("\n" + format('', banner_spec) + f"\n{raw_len} at the beginning, {clean_len} at the end!")
print(f"{(raw_len - clean_len)/raw_len:.2%} removed!")

There we go! The text looks much more clean so far. Let's try that cleaning method on the whole dataset.

In [None]:
def clean_whitespace(text: str) -> str:
    """
    Collapse redundant whitespace in `text`.

    Runs of spaces, tabs, form feeds and carriage returns become a single
    space; runs of newlines then become a single newline.
    """
    substitutions = (
        (r"[ \t\f\r]+", " "),   # horizontal whitespace -> one space
        (r"\n+", "\n"),         # repeated newlines -> one newline
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Apply the whitespace cleaning to every full text.
full_text_clean = no_na_papers.full_text.apply(clean_whitespace)

# Lengths computed once and shared by the panels below.
clean_length_values = full_text_clean.apply(len).values
raw_length_values = paper_length.values

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16,6))

# (axis, data, outlier marker, title): 'rs' draws outliers as red squares,
# an empty marker hides them.
boxplot_panels = [
    (ax1, clean_length_values, 'rs', "Boxplot of full text length (clean)"),
    (ax2, raw_length_values, 'rs', "Boxplot of full text length"),
    (ax3, clean_length_values, '', "Boxplot of full text length (clean)\nNo outliers"),
    (ax4, raw_length_values, '', "Boxplot of full text length\nNo outliers"),
]
for panel_ax, lengths, outlier_marker, panel_title in boxplot_panels:
    panel_ax.boxplot(lengths, 0, outlier_marker, 0)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

The distribution hasn't really shifted but at least the worst outliers were readjusted.

## Detecting gibberish

So far we managed to clean the spaces and detect compact text. Now what about badly parsed text? Despite every article being read and understood with Python's `print()` function, there can be instances of badly parsed text, like the 10th shortest text including only numbers and punctuation.

One strategy would be to take any non-whitespace character, then evaluate the proportion between numbers and punctuation, all over the clean text.

In [None]:
from typing import Tuple
import string

# Column-wise accumulator for the per-paper character counts; the loop further
# down in this cell appends one value per paper to each list.
data = {
    "total": [],     # number of non-whitespace characters
    "n_digits": [],  # number of digit characters
    "n_punct": []    # number of punctuation characters
}

# Patterns compiled once instead of on every call.
_NON_SPACE_RE = re.compile(r'\S')
_DIGIT_RE = re.compile(r'\d')
# `re.escape` is required: inserted raw, the backslash contained in
# `string.punctuation` escapes the following `]` inside the character class,
# so the backslash itself would never be counted as punctuation.
_PUNCT_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def char_proportion(text: str) -> Tuple[int, int, int]:
    """
    Count the character classes used to spot badly parsed text.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    Tuple[int, int, int]
        - Number of non-space characters
        - Number of digits
        - Number of punctuation characters (those in `string.punctuation`)
    """
    # Digits and punctuation are never whitespace, so counting them over the
    # whole text equals counting them inside the non-space runs.
    count_chrs = sum(1 for _ in _NON_SPACE_RE.finditer(text))
    count_digit = sum(1 for _ in _DIGIT_RE.finditer(text))
    count_punct = sum(1 for _ in _PUNCT_RE.finditer(text))

    return (count_chrs, count_digit, count_punct)


# Collect the three counts of every cleaned text into the `data` accumulator.
# `Series.items()` replaces `iteritems()`, which no longer exists in pandas 2.
for _, row in full_text_clean.apply(char_proportion).items():
    tot, n_dig, n_pun = row
    data["total"].append(tot)
    data["n_digits"].append(n_dig)
    data["n_punct"].append(n_pun)

chr_prop = pd.DataFrame(data)
# Everything that is neither a digit nor punctuation.
chr_prop['n_other'] = chr_prop.total - (chr_prop.n_digits + chr_prop.n_punct)


fig, ax = plt.subplots(figsize=(10, 10))
chr_prop[['n_digits', 'n_punct', 'n_other']].sum().plot.pie(ax=ax)
ax.set_ylabel("Proportion of digits and punctuation vs other characters")
# `plt.show()` renders the figure; the original `plt.plot()` drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 6))

# (axis, character counts, title) for each panel; every panel shows the
# counts as a percentage of the non-space characters.
histogram_panels = [
    (ax1, chr_prop.n_digits, "Histogram of digit-character ratio"),
    (ax2, chr_prop.n_punct, "Histogram of punctuation-character ratio"),
    (ax3, chr_prop.n_punct + chr_prop.n_digits, "Histogram of letter vs not-letter ratio"),
]

for panel_ax, char_counts, panel_title in histogram_panels:
    percentages = (char_counts / chr_prop.total).mul(100).values
    panel_ax.hist(percentages, bins=200)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Value (%)")
    panel_ax.set_ylabel("Intensity")
    panel_ax.set_xlim(0, 100)

plt.show()

Looks like the proportion is relatively reasonable (the distribution is centered on 8-10% for each), which means most of the dataset is relatively clean. Selecting the texts where digits and punctuation combined exceed 25% only returns 26 entries. Let's see what they look like.

In [None]:
# Share of digits plus punctuation among the non-space characters of each text.
chr_no_chr_prop = ((chr_prop.n_punct + chr_prop.n_digits) / chr_prop.total)

# Show the beginning and the end of the first five texts above the 25% threshold.
# `chr_prop` was built from plain lists, so its index is positional and `.iloc`
# is the matching accessor. `Series.items()` replaces `iteritems()`, which no
# longer exists in pandas 2.
for _, txt in full_text_clean.iloc[chr_no_chr_prop[chr_no_chr_prop > 0.25].index].head(5).items():
    print(txt[:20], '\n...\n', txt[-20:], '\n')

As we can see, some of this text is unintelligible. Using everything we applied so far, we can build our final dataset of cleaned full texts by:

* Removing the shortest entries (total length under 1,500 characters)
* Removing the compact text (no space at all)
* Removing the unintelligible entries (non-letter/letter ratio over 25%)

In [None]:
# Entries to discard, one index per criterion:
shortest_papers_idx = paper_length[paper_length < 1500].index       # raw text under 1500 characters
compact_text_idx = paper_ws[paper_ws.clusters == 1].index           # papers assigned to cluster 1
gibberish_idx = chr_no_chr_prop[chr_no_chr_prop > 0.25].index       # over 25% digits + punctuation

# Union of the three criteria, without duplicates.
labels_to_drop = set().union(
    shortest_papers_idx.to_list(),
    compact_text_idx.to_list(),
    gibberish_idx.to_list(),
)
idx_to_drop = pd.Index(list(labels_to_drop))

full_text_final = full_text_clean.drop(idx_to_drop, axis=0)
print(f"{idx_to_drop.size} of {full_text_clean.shape[0]} entries removed")

It looks like we only filtered a small amount of papers, which is good for an eventual study.

# spaCy visualization

Last but not least, let's try to visualize how well the spaCy library fares over one of those texts. 

In [None]:
from spacy import displacy

# Load the small English pipeline downloaded at the top of the notebook.
nlp = spacy.load("en_core_web_sm")

# Fixed seed so that "Restart & Run All" always renders the same paper
# (an unseeded `sample` picks a different text on every run).
doc = nlp(full_text_final.sample(n=1, random_state=42).item())

# Highlight named entities in the first 1500 tokens of the document.
displacy.render(doc[:1500] , style="ent", jupyter=True)

So far, spaCy's English module manages to find persons and orgs, but loses itself in-between equation residuals and number references. So far, rearranging equations from raw text is easier said than done, mostly because we don't know how they were originally written, and because we don't have the $\TeX$ source code, which could've helped us generate a SVG and put aside as a figure.