In [91]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy.sparse
import seaborn as sns
import sklearn
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
# Fetch every NLTK resource this notebook uses so it runs on a fresh machine:
# "omw-1.4" was already fetched here, "stopwords" is read when STOP_WORDS is
# built, and "wordnet" is the corpus WordNetLemmatizer looks words up in.
# nltk.download skips packages that are already up to date.
for nltk_resource in ("omw-1.4", "wordnet", "stopwords"):
    nltk.download(nltk_resource)

# Name of the input CSV, read from the data/ directory when the data is loaded.
FILE_NAME = "covid19_articles_20201231.csv"


[nltk_data] Downloading package omw-1.4 to /Users/sherman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [44]:
# Read the article CSV, then split it column-wise: every column except the
# last goes to X, and the last column becomes the target y.
csv_path = "data/" + FILE_NAME
df = pd.read_csv(csv_path)
X, y = df.iloc[:, :-1], df.iloc[:, -1]


In [45]:
## REMOVE THIS LINE FOR PROD ##
## USED FOR TESTING TO MAKE THINGS FASTER ##
# Keep only a stratified 5% sample (the "test" side of the split) so the rest
# of the notebook runs quickly. random_state pins the sample so that re-running
# the notebook reproduces the same rows and downstream shapes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, stratify=y, random_state=42
)
# .copy() detaches the sample from its parent frame; a new column is assigned
# to X later, which would otherwise raise SettingWithCopyWarning.
X = X_test[["content"]].copy()
y = y_test
# Drop the references to the split pieces; only X and y are used from here on.
X_train, X_test, y_train, y_test = None, None, None, None
## END ##


In [46]:
def clean_text(text, stemm=False, lemm=False, stopwords=None):
    """Normalize raw text into a space-separated string of cleaned tokens.

    The text is lower-cased and stripped, every character other than ``a-z``
    and whitespace is removed, and the result is split on whitespace. The
    tokens are then optionally filtered against ``stopwords``, stemmed and/or
    lemmatized, in that order.

    Args:
        text: The string to clean.
        stemm: If true, run ``PorterStemmer`` over every token.
        lemm: If true, run ``WordNetLemmatizer`` over every token.
        stopwords: Optional collection of tokens to drop. ``None`` or an
            empty collection disables the filtering.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Always tokenize. Splitting used to happen only when stopwords were
    # given, so without them the stem/lemmatize loops and the final join
    # iterated over single characters ("hello" -> "h e l l o").
    tokens = re.sub(r"[^a-z\s]", "", text.lower().strip()).split()

    if stopwords:
        tokens = [token for token in tokens if token not in stopwords]

    if stemm:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    if lemm:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return " ".join(tokens)


# NLTK's English stop-word list, held as a set; clean_text tests each token
# for membership in it.
english_stop_words = nltk.corpus.stopwords.words("english")
STOP_WORDS = set(english_stop_words)


In [49]:
# Build the "clean" column: each article's content passed through clean_text
# with lemmatization on and the English stop words removed. Series.apply
# forwards the extra keyword arguments to clean_text.
X["clean"] = X["content"].apply(clean_text, lemm=True, stopwords=STOP_WORDS)


# TFIDF Vectors

In [73]:
# Fit a TF-IDF model over unigrams and bigrams of the cleaned text, with the
# vocabulary capped at 10,000 features. TfidfVectorizer is imported explicitly:
# a bare `import sklearn` does not load the `feature_extraction` submodule, so
# `sklearn.feature_extraction...` only resolved when something else had
# already imported it.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

tfidf_corpus = X["clean"]
tfidf_vectors = tfidf_vectorizer.fit_transform(tfidf_corpus)
# Mapping from each term to its column index in tfidf_vectors.
tfidf_vocab = tfidf_vectorizer.vocabulary_

print(tfidf_vectors.shape)

(18453, 10000)


In [74]:
## reduce dimensionality by using Chi-Square test
## generate most significant words
# For every category, run a one-vs-rest chi-square test of each TF-IDF feature
# against the label and keep the features whose score (1 - p) clears the limit.
tfidf_X_names = tfidf_vectorizer.get_feature_names_out()
p_value_limit = 0.95  # threshold on 1 - p, i.e. keeps features with p < 0.05

# Collect one frame per category and concatenate once, instead of growing a
# DataFrame inside the loop. A descriptive name is used rather than `_`,
# which IPython reserves for the previous cell's output.
chi2_frames = []
for category in np.unique(y):
    # chi2 returns (statistics, p-values); only the p-values are used.
    chi_square, p = chi2(tfidf_vectors, y == category)
    chi2_frames.append(
        pd.DataFrame({"feature": tfidf_X_names, "score": 1 - p, "y": category})
    )

chi2_scores = pd.concat(chi2_frames)
# Filter, then sort by category and descending score; one pass over the
# combined frame replaces the per-iteration re-sort and re-filter.
chi2_scores = chi2_scores[chi2_scores["score"] > p_value_limit].sort_values(
    ["y", "score"], ascending=[True, False]
)

# A feature can be significant for several categories; keep each name once.
tfidf_X_names = chi2_scores["feature"].unique().tolist()


In [90]:
## regenerate vectors
# Re-fit TF-IDF restricted to the chi-square-selected vocabulary.
# ngram_range must match the (1, 2) range the vocabulary was selected from:
# under the default (1, 1) the analyzer never emits bigrams, so every bigram
# entry in the vocabulary would become an all-zero column.
tfidf_vectorizer = TfidfVectorizer(vocabulary=tfidf_X_names, ngram_range=(1, 2))

tfidf_corpus = X["clean"]
tfidf_vectors = tfidf_vectorizer.fit_transform(tfidf_corpus)
# Mapping from each selected term to its column index in tfidf_vectors.
tfidf_vocab = tfidf_vectorizer.vocabulary_

print(tfidf_vectors.shape)


(18453, 2442)


In [93]:
## save it for future use
# One path for both the save and the load. save_npz appends ".npz" when it is
# missing, so spelling the suffix out keeps the two calls in agreement.
TFIDF_MATRIX_PATH = "processed_data/tfidf_sparse_matrix.npz"
# Create the output directory if needed; save_npz does not create it.
os.makedirs(os.path.dirname(TFIDF_MATRIX_PATH), exist_ok=True)
scipy.sparse.save_npz(TFIDF_MATRIX_PATH, tfidf_vectors)
# Reload from disk to confirm the round trip; the cell displays the shape.
tfidf_vectors = scipy.sparse.load_npz(TFIDF_MATRIX_PATH)
tfidf_vectors.shape
