In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
from nltk.tokenize import word_tokenize

In [2]:
df = pd.concat([pd.read_parquet(f"data/chunks/chunk_{j}.parquet") for j in (range(1, 21))])

In [3]:
df['meta_keywords'] = df['meta_keywords'].fillna(" ")  
df['meta_keywords'] = df['meta_keywords'].apply(word_tokenize)

In [4]:
# Turning lists back into strings
df['content'] = df['content'].apply(lambda x: ' '.join(x))
df['title'] = df['title'].apply(lambda x: ' '.join(x))
df['meta_keywords'] = df['meta_keywords'].apply(lambda x: ' '.join(x))

### Simple model

Just the content column:

In [None]:
# Creating the vocabulary matrix
vectorizer = CountVectorizer(max_features=10000)
x = vectorizer.fit_transform(df['content'])

In [None]:
# Splitting 80/10/10
x_train, x_test, y_train, y_test = train_test_split(x, df['group'], test_size=0.2, random_state=1337)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1337)

In [None]:
classifier = LogisticRegression(max_iter=5000)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

(tn, fp, fn, tp) = confusion_matrix(y_val, y_pred).ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')


KeyboardInterrupt: 

With additional features:

In [None]:
content = vectorizer.fit_transform(df['content'])
title = vectorizer.fit_transform(df['title'])
meta_keywords = vectorizer.fit_transform(df['meta_keywords'])


In [None]:
x_features = hstack([title, content, meta_keywords])

x_train, x_test, y_train, y_test = train_test_split(x_features, df['group'], test_size=0.2, random_state=1337)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1337)

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

(tn, fp, fn, tp) = confusion_matrix(y_val, y_pred).ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')


              precision    recall  f1-score   support

           0       0.88      0.79      0.84     57439
           1       0.73      0.84      0.78     37283

    accuracy                           0.81     94722
   macro avg       0.80      0.82      0.81     94722
weighted avg       0.82      0.81      0.81     94722

True negatives: 45628
False positives: 11811
False negatives: 6113
True positives: 31170


Including the length of the articles and the number of distinct words in the article improves the f1 score by 1%.

### Advanced Model

In [None]:
classifier = MultinomialNB()
classifier.fit(x_train, y_train)

print(classifier.n_features_in_)

y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

(tn, fp, fn, tp) = confusion_matrix(y_val, y_pred).ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')

20000
              precision    recall  f1-score   support

           0       0.87      0.79      0.83     57439
           1       0.71      0.82      0.76     37283

    accuracy                           0.80     94722
   macro avg       0.79      0.80      0.80     94722
weighted avg       0.81      0.80      0.80     94722

True negatives: 45148
False positives: 12291
False negatives: 6635
True positives: 30648
