Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Reading filtered data with 103 classes

In [2]:
# Load the filtered dataset; the 'title' and 'categories' columns are used below.
df = pd.read_csv("data/103_classes_filtered.csv")

In [3]:
# Model input: the raw title strings.
X = df['title']
# Targets: each entry is a *string* holding a list literal of category tags
# (e.g. "['cs.NE', 'cs.AI']"); it is parsed into a real list further down.
y = df['categories']

In [4]:
# Number of titles available.
X.shape

(381732,)

In [5]:
# Peek at one raw label: a string containing a list literal, not yet a list.
y[0]

"['cs.NE', 'cs.AI']"

Preprocessing tags, by binarizing using MultiLabelBinarizer

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval

# Every entry of the 'categories' column is a string such as
# "['cs.NE', 'cs.AI']"; literal_eval parses it into a list of tags without
# executing arbitrary code.
#
# Parse from df['categories'] rather than from `y` itself: rebuilding `y` from
# its own previous value made this cell fail on a second run, because `y` then
# already held lists and literal_eval rejects non-string, non-AST input.
y = [literal_eval(tags) for tags in df['categories']]

# Fit the binarizer on the parsed tag lists and build the multi-hot label
# matrix (one row per title, one column per entry of mlb.classes_).
mlb = MultiLabelBinarizer()
y_binarized = mlb.fit_transform(y)
mlb.classes_

array(['astro-ph.IM', 'cond-mat.dis-nn', 'cond-mat.mtrl-sci',
       'cond-mat.stat-mech', 'cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG',
       'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC', 'cs.DL',
       'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT',
       'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM',
       'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF',
       'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY',
       'econ.EM', 'econ.GN', 'econ.TH', 'eess.AS', 'eess.IV', 'eess.SP',
       'eess.SY', 'hep-ex', 'math-ph', 'math.AC', 'math.AG', 'math.AP',
       'math.AT', 'math.CA', 'math.CO', 'math.CT', 'math.DG', 'math.DS',
       'math.FA', 'math.GR', 'math.IT', 'math.LO', 'math.MG', 'math.MP',
       'math.NA', 'math.NT', 'math.OC', 'math.PR', 'math.RA', 'math.RT',
       'math.ST', 'nlin.AO', 'nlin.CD', 'nlin.CG', 'physics.ao-ph',
       'physics.app-ph', 'physics.bio-ph', 'physics.chem-ph',
       'physics.comp

Title preprocessing (tokenization and stopword removal with NLTK), ahead of computing fastText sentence embeddings

In [7]:
from nltk.corpus import stopwords
import re
import nltk
# Fetch the NLTK stopword corpus; nothing is re-downloaded when the local
# copy is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/siddhipotdar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Tokenize titles and remove English stopwords.

stopwords = nltk.corpus.stopwords.words('english')
tokenizer = nltk.WordPunctTokenizer()

# preprocess() is called once per title, and a membership test against the
# list above scans it linearly for every token. A frozenset gives constant-time
# lookups with identical results; `stopwords` itself is kept as a list.
_STOPWORD_SET = frozenset(stopwords)


def preprocess(text: str) -> str:
    """Lower-case and tokenize `text`, then drop English stopwords.

    Args:
        text: The raw (or partially cleaned) title.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        every token is a stopword or the input has no tokens.
    """
    tokens = tokenizer.tokenize(text.lower().strip())
    return ' '.join(token for token in tokens if token not in _STOPWORD_SET)



In [9]:
# Remove every character that is not an ASCII letter or whitespace, then
# tokenize and drop stopwords via preprocess().
#
# The previous call was re.sub(pattern, '', item, re.I). The fourth positional
# parameter of re.sub is `count`, not `flags`, so re.I (numerically 2) limited
# the substitution to the first two matches per title and left all remaining
# digits and punctuation in the text. The character class already spells out
# both a-z and A-Z, so no case-insensitivity flag is required.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

# Kept as a plain list; it is converted to a NumPy array in the next cell.
X_token = [preprocess(non_letter_pattern.sub('', item)) for item in X]
    

In [10]:
# Convert the list of cleaned titles into a NumPy array.
X_token = np.array(X_token)

Uncomment the cell below to install fastText, if not already installed

In [11]:
# !wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
# !unzip v0.9.2.zip
# !cd fastText-0.9.2
# !pip install ./fastText-0.9.2/.


Computing fastText sentence embeddings

In [12]:
import fasttext
import fasttext.util

# One-time download of the pretrained English vectors, left disabled; enable it
# if the .bin file below is missing.
# NOTE(review): download_model presumably writes into the current working
# directory, so the file may need moving to the path used below — confirm.
#fasttext.util.download_model('en', if_exists='ignore')
# Load the pretrained fastText model from a path relative to the notebook's
# working directory. The averaging code further down assumes 300-dimensional
# word vectors.
model = fasttext.load_model('./fastText-0.9.2/cc.en.300.bin')
    



In [13]:
# Cache one fastText vector per distinct token, so that a word repeated across
# many titles is looked up in the model only once.

# dict.fromkeys drops duplicates while keeping first-seen order.
vocabulary = dict.fromkeys(
    token for title in X_token for token in title.split()
)
embedding_map = {token: model.get_word_vector(token) for token in vocabulary}

In [14]:
def get_fasttext_embeddings(x_data, y_data, embeddings=None, dim=300):
    """Average per-word vectors into one fixed-size embedding per sentence.

    Args:
        x_data: Iterable of strings whose tokens are separated by whitespace.
        y_data: Labels aligned with `x_data`; returned unchanged so that the
            caller receives features and targets together.
        embeddings: Mapping from word to a vector of length `dim`. When
            omitted, the module-level `embedding_map` is used.
        dim: Length of every word vector (and of each sentence embedding).

    Returns:
        A tuple `(sentence_embeddings, y_data)` where `sentence_embeddings`
        is a NumPy array with one row per sentence. A sentence without any
        tokens is represented by the all-zero vector.

    Raises:
        KeyError: If a token of `x_data` is missing from `embeddings`.
    """
    if embeddings is None:
        # Resolved at call time so the function keeps working with the
        # notebook-level cache built in an earlier cell.
        embeddings = embedding_map

    sentence_embedding = []
    for sentence in x_data:
        words = sentence.split()
        total = np.zeros((dim,))
        for word in words:
            total += embeddings[word]
        # An empty sentence keeps the zero vector rather than dividing by zero.
        sentence_embedding.append(total / len(words) if words else total)
    return np.array(sentence_embedding), y_data


In [15]:
# Embed every cleaned title; y_label is y_binarized handed back unchanged.
X_embeddings, y_label = get_fasttext_embeddings(X_token, y_binarized)

In [16]:
# Sanity check: there should be exactly one embedding per title.
len(X_embeddings)

381732

In [17]:
# Labels must stay aligned one-to-one with the embeddings.
len(y_label)

381732

In [18]:
# Preview the multi-hot label matrix (NumPy elides the middle rows and columns).
y_label

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Saving obtained sentence embeddings

In [19]:
# Persist the features, the binarized labels and the tag vocabulary so that
# later notebooks can load them without recomputing the embeddings.
# NOTE(review): mlb.classes_ may be an object-dtype array, in which case
# np.save pickles it and np.load needs allow_pickle=True — confirm.
for target, payload in (
    ('./data/X_fasttext_embeddings', X_embeddings),
    ('./data/y_binarized', y_label),
    ('./data/y_tags', mlb.classes_),
):
    np.save(target, payload)