<h1>Article Recommendation Engine</h1>

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
import random
# from goose import Goose # For Python 2
from goose3 import Goose # For Python 3

DATA_DIR = "data"

In [2]:
# Get generator of articles from a list of urls
def get_articles_from_list_of_urls(list_of_urls):
    for url in list_of_urls:
        g = Goose()
        article = g.extract(url=url)
        yield article.cleaned_text

In [3]:
filepath = DATA_DIR + "/list_of_urls.txt"

with open(filepath, 'r') as f:
    list_of_urls = f.read().split('\n')

# Get a random subset of the urls
list_of_urls = random.choices(list_of_urls, k=20)

# Randomly assign labels
percent_true = 0.33
num_true = int((percent_true * len(list_of_urls)))
num_false = len(list_of_urls) - num_true
labels = [1] * num_true + [0] * num_false
random.shuffle(labels)

In [4]:
# Perform a train-test split
urls_train, urls_test, y_train, y_test = train_test_split(list_of_urls, labels, test_size=0.33, random_state=42)

<h2>Method 1: TF-IDF</h2>

In [5]:
articles_train = get_articles_from_list_of_urls(urls_train)
articles_test = get_articles_from_list_of_urls(urls_test)

In [6]:
tfidf_vectorizer = TfidfVectorizer(binary=True, norm=None, use_idf=False)
tfidf_lr = SGDClassifier(loss="log", penalty="l2")

In [7]:
X_train = tfidf_vectorizer.fit_transform(articles_train)
len(tfidf_vectorizer.get_feature_names())

1944

In [8]:
%%time
tfidf_lr.fit(X_train, y_train)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.04 ms




SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
X_test = tfidf_vectorizer.transform(articles_test)
tfidf_pred_probs = tfidf_lr.predict_proba(X_test)
tfidf_auc = roc_auc_score(y_test, tfidf_pred_probs[:, 1])

  np.exp(prob, prob)


In [10]:
print("tfidf_auc:", tfidf_auc)

tfidf_auc: 0.4166666666666667


<h2>Method 2: Feature Hashing</h2>

In [11]:
articles_train = get_articles_from_list_of_urls(urls_train)
articles_test = get_articles_from_list_of_urls(urls_test)

In [12]:
hashing_vectorizer = HashingVectorizer(n_features=100, binary=True, norm=None)
hashing_lr = SGDClassifier(loss="log", penalty="l2")

In [13]:
X_train = hashing_vectorizer.transform(articles_train)

In [14]:
%%time
hashing_lr.fit(X_train, y_train)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.17 ms




SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
X_test = hashing_vectorizer.transform(articles_test)
hashing_pred_probs = hashing_lr.predict_proba(X_test)
hashing_auc = roc_auc_score(y_test, hashing_pred_probs[:, 1])

In [16]:
print("hashing_auc:", hashing_auc)

hashing_auc: 0.8333333333333334
