# PDF text extraction tool
Author: Roald Teunissen

## Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import json

# Scikit learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# DuckDB for fast quering
import duckdb

# Word embedding
import gensim
from gensim.models import Word2Vec #Word2Vec is mostly used for huge datasets

# Natural Language Toolkit
import nltk
from nltk.tokenize import word_tokenize

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

## Properties

In [2]:
DATA_DIR = os.path.join(os.getcwd(), '../../data/')
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')

PARQUET_NAME = 'paper_data.parquet'

conn = duckdb.connect()

# Load data

In [3]:
data = pd.read_parquet(os.path.join(RAW_DATA_DIR, PARQUET_NAME), engine='fastparquet')
data = data.rename(columns = {'topic': 'subject'})

In [4]:
df_train, df_test = train_test_split(data, test_size=0.2, shuffle=True)

# Exploratory data analysis

In [5]:
print(data.shape)
print(type(data))

(12482, 3)
<class 'pandas.core.frame.DataFrame'>


In [6]:
data.describe()

Unnamed: 0,id,subject,content
count,12482.0,12482,12482
unique,11555.0,8,11563
top,2301.07104,physics,LARGE DEVIATIONS FOR CLASSIFICATION PERFORMANC...
freq,4.0,1805,4


Graphs

In [7]:
subject_index = df_train['subject'].unique()
value_counts = df_train['subject'].value_counts().reindex(index = subject_index)

source = pd.DataFrame({
    'subject_index': subject_index,
    'value_counts': value_counts
})

bar_plot = alt.Chart(source).mark_bar().encode(
    alt.X('subject_index', title = None, axis = alt.AxisConfig(labelAngle = 20)),
    alt.Y('value_counts', title = None, axis = alt.Axis( values = [*range(0,2000, 100)])),
    alt.Color('subject_index', title = 'Subjects', legend = None),
)

text = bar_plot.mark_text(dy = -10, fontSize = 15).encode(
    text = alt.Text('value_counts'),
)

final_bar = (bar_plot + text).properties(
    width = 500,
    height = 400,
    title = 'Number of papers per subject'
).configure_title(
    fontSize=20
)
final_bar

As we can see, _economics_ has the lowest value with `852` and _computer science_ with `1415`.<br>
From this we could either decide to pick 852 items from each subject, but I think it is a better idea to look at the number of words we are going to use for our model, since it relies on words rather than then number of papers.

In [8]:
# Check missing values
df_train.isna().sum()

# There are no missing values, as expected as this data is scraped and extracted from pdf files.

id         0
subject    0
content    0
dtype: int64

In [9]:
df_train['word_count'] = df_train['content'].apply(lambda x: len(str(x).split()))
df_train['char_count'] = df_train['content'].apply(lambda x: len(str(x)))
df_train['unique_word_count'] = df_train['content'].apply(lambda x: len(set(str(x).split())))

In [10]:
# Using DuckDB for fast querying
# DuckDB uses a columnar-vectorized query execution engine, which makes it exceptionally fast for querying complex and aggregations
# (notice that it is finished with querying over thousands of entries in 0.1s)
# I suggest to check out this website if you want to learn more: https://duckdb.org/why_duckdb

query = """
    SELECT
        subject,
        ROUND(SUM(char_count)) as 'char count',
        ROUND(SUM(word_count)) as 'word count',
        ROUND(SUM(unique_word_count)) as 'unique word count',
        ROUND(MEAN(char_count)) as 'char count mean',
        ROUND(MEAN(word_count)) as 'word count mean',
        ROUND(MEAN(unique_word_count)) as 'unique word count mean'
    FROM df_train
    GROUP BY subject
"""
word_overview = conn.execute(query).df()

In [11]:
word_overview

Unnamed: 0,subject,char count,word count,unique word count,char count mean,word count mean,unique word count mean
0,eess,59317154.0,8950997.0,2936831.0,49022.0,7398.0,2427.0
1,mathematics,87662768.0,12339533.0,4081124.0,61909.0,8714.0,2882.0
2,statistics,93781272.0,13841477.0,4280118.0,69211.0,10215.0,3159.0
3,physics,83526609.0,12672723.0,4055735.0,57210.0,8680.0,2778.0
4,q_finance,72865178.0,10805002.0,3401384.0,66181.0,9814.0,3089.0
5,q_biology,74388991.0,11127341.0,3494584.0,61025.0,9128.0,2867.0
6,computer_science,82224530.0,12341016.0,3928884.0,58985.0,8853.0,2818.0
7,economics,68323217.0,10147091.0,2884291.0,82317.0,12225.0,3475.0


### Barplots

In [12]:
def overview_plot(scope:list, graph_title:str, sub_titles:list, graph_width:int = 250, graph_height:int = 300):
    char_count_bar = alt.Chart(word_overview[scope]).mark_bar().encode(
        alt.X('subject', title = None, axis = alt.Axis(labelAngle=30)),
        alt.Y(scope[1], title = None),
        alt.Color('subject')
    ).properties(
        width = graph_width,
        height = graph_height,
        title = alt.TitleParams(
            sub_titles[0],
            fontWeight = 'lighter'
        )
    )
    word_count_bar = alt.Chart(word_overview[scope]).mark_bar().encode(
        alt.X('subject', title = None, axis = alt.Axis(labelAngle=30)),
        alt.Y(scope[2], title = None),
        alt.Color('subject')
    ).properties(
        width = graph_width,
        height = graph_height,
        title = alt.TitleParams(
            sub_titles[1],
            fontWeight = 'lighter'
        )
    )
    unique_word_count_bar = alt.Chart(word_overview[scope]).mark_bar().encode(
        alt.X('subject', title = None, axis = alt.Axis(labelAngle=30)),
        alt.Y(scope[3], title = None),
        alt.Color('subject')
    ).properties(
        width = graph_width,
        height = graph_height,
        title = alt.TitleParams(
            sub_titles[2], 
            fontWeight = 'lighter'
        )
    )

    return (char_count_bar | word_count_bar | unique_word_count_bar).properties(
        title = alt.TitleParams(
            graph_title, 
            anchor = 'middle', 
            fontWeight = 'bold',
            fontSize = 20,
            offset = 10,
            lineHeight = 10
        ),  padding = {'top': 10, 'left': 10, 'right': 10})

In [13]:
overview_plot(['subject', 'char count', 'word count', 'unique word count'], 'Total per subject', ['character', 'word', 'unique word'])

In [14]:
overview_plot(['subject', 'char count mean', 'word count mean', 'unique word count mean'], 'Average per subject (representing the average paper)', ['character', 'word', 'unique word'])

These graphs show a whole different picture I first had in mind. The previous graph showed that _economics_ had the least amount of papers, which we had to scale down to, but it seems that _economics_ has far out the most amount of **characters**,**word count** and **unique word count**. This implies that _economic_ papers tend to be more comprehensive.

# Models

### Word2Vec algorithm

In [15]:
df_train['clean_text_tok'] = [nltk.word_tokenize(i) for i in df_train['content']] # Tokenize content (might take 15mins)

model = Word2Vec(df_train['clean_text_tok'], min_count = 1)

w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))  # Combine word and its vector

# For converting sentence to vectors/numbers from word vectors result by Word2Vec
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

### Train test split

In [20]:
X_train, X_val, y_train, y_val = train_test_split(df_train["content"],
                                                  df_train["subject"],
                                                  test_size = 0.2,
                                                  shuffle = True)

In [21]:
# Convert value
y_val = y_val.to_numpy()

In [22]:
# Tokenize for word2vec
X_train_tok = [nltk.word_tokenize(i) for i in X_train]
X_val_tok = [nltk.word_tokenize(i) for i in X_val]

### TF-IDF preparation

In short:<br>
_Short for **term frequency–inverse document frequency**, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus._

In [23]:
# Initialize vectorizer to prepare the data for our model
tfidf_vectorizer = TfidfVectorizer(use_idf = True)

In [24]:
# Vectorize train and validation data
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)

Notice that we only fit the training data with our vectorizer's `fit_transform()` function.<br>
This will set the word-indexes and weights to match the training data.

### Word2vec preperation

In short: <br>
_As the name implies, **word2vec** represents each distinct word with a particular list of numbers called a vector. The vectors are chosen carefully such that they capture the semantic and syntactic qualities of words; as such, a simple mathematical function (cosine similarity) can indicate the level of semantic similarity between the words represented by those vectors._

In [25]:
# Initialize word embedding model
modelw = MeanEmbeddingVectorizer(w2v)

X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_val_tok)

## Building ML models for text classification

In [26]:
encoder = OrdinalEncoder() 

Confusion matrix for multiple dimensions. This seems to be hard to realize so a solution may be implemented later on.

In [37]:
# color = ['Blues', 'Oranges', 'Reds', 'PuBu', 'Greens', 'YlOrBr', 'Purples', 'RdPu']

# def plot_confusion_matrix(y_test, y_pred, subjects):
#     f, axes = plt.subplots(2, 4, figsize=(25, 10))
    
#     axes = axes.ravel()
#     for i, subject in enumerate(subjects):        
#         disp = ConfusionMatrixDisplay(confusion_matrix(y_test[:, i],
#                                                     y_pred[:, i]),
#                                     display_labels=[0, i])
#         disp.plot(ax=axes[i], values_format='.4g', cmap = color[i])
#         disp.ax_.set_title(subject)
#         if i<10:
#             disp.ax_.set_xlabel('')
#         if i%5!=0:
#             disp.ax_.set_ylabel('')
#         disp.im_.colorbar.remove()

#     plt.subplots_adjust(wspace=0.10, hspace=0.1)
#     f.colorbar(disp.im_, ax=axes)
#     plt.show()

In [27]:
# Reiterate what our unique values are
[[unique_val] for unique_val in df_train["subject"].unique()]

[['eess'],
 ['mathematics'],
 ['statistics'],
 ['physics'],
 ['q_finance'],
 ['q_biology'],
 ['computer_science'],
 ['economics']]

### Logistic Regression with TF-IDF

In [28]:
# Define and fit model
lr_tfidf = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

# Make a prediction on the validation data
lr_tfidf_y_predict = lr_tfidf.predict(X_val_vectors_tfidf) 
lr_tfidf_y_prob = lr_tfidf.predict_proba(X_val_vectors_tfidf)[:,1] 

# Encode from topics [a,b,c,d] to numbers [1., 2., 3., 4.]
lr_tfidf_y_predict = encoder.fit_transform(lr_tfidf_y_predict.reshape(-1,1))
lr_tfidf_y_val = encoder.fit_transform(y_val.reshape(-1,1))

Analyse the performance our model

In [29]:
print(classification_report(lr_tfidf_y_val, lr_tfidf_y_predict))

              precision    recall  f1-score   support

         0.0       0.53      0.48      0.50       285
         1.0       0.81      0.68      0.74       178
         2.0       0.66      0.69      0.68       242
         3.0       0.68      0.75      0.72       273
         4.0       0.81      0.83      0.82       288
         5.0       0.77      0.76      0.76       245
         6.0       0.85      0.79      0.82       215
         7.0       0.58      0.65      0.61       271

    accuracy                           0.70      1997
   macro avg       0.71      0.70      0.71      1997
weighted avg       0.70      0.70      0.70      1997



With multiple dimensions, plotting a ROC curve might be hard to do. An implementation may come later on.

In [None]:
# fpr, tpr, thresholds = roc_curve(y_val, lr_tfidf_y_prob)

# # Create ROC curve
# plt.plot(fpr,tpr)
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()

### Naive Bayes with tf-idf

In [30]:
# Define and fit model
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)

# Make a prediction on the validation data
nb_tfidf_y_predict = nb_tfidf.predict(X_val_vectors_tfidf)
nb_tfidf_y_prob = nb_tfidf.predict_proba(X_val_vectors_tfidf)[:,1]

# Encode from topics [a,b,c,d] to numbers [1., 2., 3., 4.]
nb_tfidf_y_predict = encoder.fit_transform(nb_tfidf_y_predict.reshape(-1,1))
nb_tfidf_y_val = encoder.fit_transform(y_val.reshape(-1,1)) # 111111

Analyse the performance our model

In [31]:
print(classification_report(nb_tfidf_y_val, nb_tfidf_y_predict))

              precision    recall  f1-score   support

         0.0       0.41      0.72      0.52       285
         1.0       0.75      0.02      0.03       178
         2.0       0.80      0.30      0.43       242
         3.0       0.61      0.77      0.68       273
         4.0       0.77      0.83      0.80       288
         5.0       0.85      0.47      0.61       245
         6.0       0.83      0.54      0.65       215
         7.0       0.42      0.71      0.53       271

    accuracy                           0.58      1997
   macro avg       0.68      0.55      0.53      1997
weighted avg       0.67      0.58      0.55      1997



In [None]:
# plot_confusion_matrix(y_val, y_predict)
# plot_confusion_matrix(nb_tfidf_y_val, nb_tfidf_y_predict, df_train["subject"].unique())

### Logistic Regression with w2v

In [32]:
# Define and fit model
lr_w2v = LogisticRegression(solver = 'liblinear', C = 10, penalty = 'l2')
lr_w2v.fit(X_train_vectors_w2v, y_train)  #model

# Make a prediction on the validation data
lr_w2v_y_predict = lr_w2v.predict(X_val_vectors_w2v)
lr_w2v_y_prob = lr_w2v.predict_proba(X_val_vectors_w2v)[:,1]

# Encode from topics [a,b,c,d] to numbers [1., 2., 3., 4.]
lr_w2v_y_predict = encoder.fit_transform(lr_w2v_y_predict.reshape(-1,1))
lr_w2v_y_val = encoder.fit_transform(y_val.reshape(-1,1))

Analyse the performance our model

In [33]:
print(classification_report(lr_w2v_y_val, lr_w2v_y_predict))

              precision    recall  f1-score   support

         0.0       0.58      0.52      0.55       285
         1.0       0.74      0.66      0.69       178
         2.0       0.63      0.68      0.65       242
         3.0       0.71      0.81      0.76       273
         4.0       0.84      0.81      0.82       288
         5.0       0.72      0.72      0.72       245
         6.0       0.78      0.76      0.77       215
         7.0       0.60      0.62      0.61       271

    accuracy                           0.70      1997
   macro avg       0.70      0.70      0.70      1997
weighted avg       0.70      0.70      0.70      1997



In [None]:
# lr_w2v_heatmap = plot_confusion_matrix(y_val, y_predict)
# plot_confusion_matrix(y_val, y_predict, df_train["subject"].unique())

With multiple dimensions, plotting a ROC curve might be hard to do. An implementation may come later on.

In [None]:
# # Create ROC curve
# plt.plot(fpr,tpr)
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()

## Testing on unlabelled data

In [37]:
# Converting X_test to vector
X_test = df_test['content'] 
X_test_vec = tfidf_vectorizer.transform(X_test)
X_test_vec_w2v = modelw.transform(X_test)

#### Choose a model:
Run one of the cells to make a prediction with a specific model

In [55]:
# Logistic Regression TF-IDF
df_test['predicted_topic'] = lr_tfidf.predict(X_test_vec)
df_test['predict_prob'] = lr_tfidf.predict_proba(X_test_vec)[:,1]

# plot_confusion_matrix(df_test['topic'], df_test['predicted_topic']) # may come later
df_test.drop(['content', 'predict_prob'], axis=1).head(20)

Unnamed: 0_level_0,id,subject,predicted_topic
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,2301.04063,mathematics,mathematics
9,2212.0889,statistics,statistics
7,2301.0089,statistics,statistics
7,2301.02912,q_finance,mathematics
10,2209.10545,q_biology,q_biology
2,2301.00159,q_biology,q_biology
8,2212.04377,q_biology,q_biology
10,2208.13675,q_biology,q_biology
9,2301.02491,mathematics,mathematics
3,2205.14517,economics,economics


In [56]:
# Naive Bayes TF-IDF
df_test['predicted_topic'] = nb_tfidf.predict(X_test_vec)
df_test['predict_prob'] = nb_tfidf.predict_proba(X_test_vec)[:,1]

# plot_confusion_matrix(df_test['topic'], df_test['predicted_topic']) # may come later
df_test.drop(['content', 'predict_prob'], axis=1).head(20)

Unnamed: 0_level_0,id,subject,predicted_topic
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,2301.04063,mathematics,mathematics
9,2212.0889,statistics,statistics
7,2301.0089,statistics,statistics
7,2301.02912,q_finance,mathematics
10,2209.10545,q_biology,q_biology
2,2301.00159,q_biology,statistics
8,2212.04377,q_biology,computer_science
10,2208.13675,q_biology,computer_science
9,2301.02491,mathematics,mathematics
3,2205.14517,economics,computer_science


In [57]:
# Logistic Regression word2vec
df_test['predicted_topic'] = lr_w2v.predict(X_test_vec_w2v)
df_test['predict_prob'] = lr_w2v.predict_proba(X_test_vec_w2v)[:,1]

# plot_confusion_matrix(df_test['topic'], df_test['predicted_topic']) # may come later
df_test.drop(['content', 'predict_prob'], axis=1).head(20)

Unnamed: 0_level_0,id,subject,predicted_topic
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,2301.04063,mathematics,computer_science
9,2212.0889,statistics,computer_science
7,2301.0089,statistics,computer_science
7,2301.02912,q_finance,computer_science
10,2209.10545,q_biology,computer_science
2,2301.00159,q_biology,computer_science
8,2212.04377,q_biology,computer_science
10,2208.13675,q_biology,computer_science
9,2301.02491,mathematics,computer_science
3,2205.14517,economics,computer_science
