In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
from nltk.tokenize import word_tokenize

In [2]:
# Read the 20 parquet chunks (chunk_1 … chunk_20) and stack them row-wise.
# pd.concat is called without ignore_index, so every chunk keeps its stored index labels.
chunk_paths = [f"data/chunks/chunk_{j}.parquet" for j in range(1, 21)]
df = pd.concat(map(pd.read_parquet, chunk_paths))

In [3]:
# Pull just the `domain` column out of the CSV; no other column is loaded.
domain_df = pd.read_csv(
    '995,000_rows.csv',
    usecols=['domain'],
)
def f1(x):
    """Return the part of ``x`` that precedes its first ``'.'``.

    ``x`` is passed through ``str()`` first, so non-string values do not
    raise (a float NaN, for example, becomes ``'nan'``). When the text
    contains no ``'.'`` the whole string is returned.
    """
    head, _, _ = str(x).partition('.')
    return head
# Store the leading dot-separated label of every domain on `df`.
# The column assignment aligns `domain_df` and `df` on their index labels.
df['domain'] = domain_df['domain'].apply(f1)
# The helper frame is no longer needed once the column has been copied over.
del domain_df


In [4]:
# Turning lists back into strings
def _join_tokens(tokens):
    """Join a sequence of tokens with single spaces; return strings unchanged.

    The string pass-through keeps this cell safe to re-run: calling
    ``' '.join`` on text that has already been joined would insert a space
    between every character.
    """
    return tokens if isinstance(tokens, str) else ' '.join(tokens)


for column in ('content', 'title', 'meta_keywords', 'authors'):
    df[column] = df[column].apply(_join_tokens)

### Simple model

Just the content column:

In [5]:
# Build the document-term count matrix for the `content` column.
# max_features caps the vocabulary at the 10,000 most frequent terms.
# NOTE(review): the vocabulary is fitted on every row, i.e. before the
# train/validation/test split below — confirm this is acceptable.
vectorizer = CountVectorizer(max_features=10_000)
x = vectorizer.fit_transform(df['content'])

In [6]:
# 80/10/10 split: hold out 20 % of the rows, then cut the hold-out in half
# to obtain the test and validation sets. Both calls use the same fixed seed.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, df['group'], test_size=0.2, random_state=1337
)
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=1337
)
del x_holdout, y_holdout

In [7]:
# Baseline: logistic regression on the content counts, scored on the validation split.
classifier = LogisticRegression(max_iter=5000).fit(x_train, y_train)

y_pred = classifier.predict(x_val)
report = classification_report(y_val, y_pred)
print(report)

# With two classes the 2x2 confusion matrix flattens row by row into
# (true negatives, false positives, false negatives, true positives).
matrix = confusion_matrix(y_val, y_pred)
tn, fp, fn, tp = matrix.ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')


              precision    recall  f1-score   support

           0       0.84      0.91      0.87     57439
           1       0.83      0.74      0.78     37283

    accuracy                           0.84     94722
   macro avg       0.84      0.82      0.83     94722
weighted avg       0.84      0.84      0.84     94722

True negatives: 51984
False positives: 5455
False negatives: 9797
True positives: 27486


With additional features:

In [8]:
# One count matrix per text column, produced in the order
# content -> title -> meta_keywords -> authors.
# The single shared `vectorizer` is refitted for each column, so once this
# cell has run it only holds the vocabulary fitted on `authors`.
content, title, meta_keywords, authors = (
    vectorizer.fit_transform(df[column])
    for column in ('content', 'title', 'meta_keywords', 'authors')
)

In [9]:
# Place the per-column count matrices side by side in one feature matrix.
x_features = hstack([title, content, meta_keywords, authors])

# Same 80/10/10 split and seed as for the content-only model.
x_train, x_test, y_train, y_test = train_test_split(x_features, df['group'], test_size=0.2, random_state=1337)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1337)

# Build the model in this cell instead of refitting whatever `classifier`
# currently refers to: a later cell rebinds that name to MultinomialNB, so
# re-running this cell out of order would otherwise score the wrong model.
classifier = LogisticRegression(max_iter=5000)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

# With two classes the flattened confusion matrix is (tn, fp, fn, tp).
(tn, fp, fn, tp) = confusion_matrix(y_val, y_pred).ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')


              precision    recall  f1-score   support

           0       0.96      0.96      0.96     57439
           1       0.94      0.94      0.94     37283

    accuracy                           0.95     94722
   macro avg       0.95      0.95      0.95     94722
weighted avg       0.95      0.95      0.95     94722

True negatives: 55033
False positives: 2406
False negatives: 2267
True positives: 35016


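With domains (cheating! The near-perfect scores below suggest that the domain almost entirely determines the label, so this is label leakage rather than a usable feature):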

In [10]:
# Count matrix for the shortened domain names (refits the shared vectorizer).
domain = vectorizer.fit_transform(df['domain'])

# Text features plus the domain columns.
x_features = hstack([title, content, meta_keywords, authors, domain])

# Same 80/10/10 split and seed as for the previous models.
x_train, x_test, y_train, y_test = train_test_split(x_features, df['group'], test_size=0.2, random_state=1337)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1337)

# Build the model in this cell instead of refitting whatever `classifier`
# currently refers to: a later cell rebinds that name to MultinomialNB, so
# re-running this cell out of order would otherwise score the wrong model.
classifier = LogisticRegression(max_iter=5000)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)

print(classification_report(y_val, y_pred))
# With two classes the flattened confusion matrix is (tn, fp, fn, tp).
(tn, fp, fn, tp) = confusion_matrix(y_val, y_pred).ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     57439
           1       1.00      1.00      1.00     37283

    accuracy                           1.00     94722
   macro avg       1.00      1.00      1.00     94722
weighted avg       1.00      1.00      1.00     94722

True negatives: 57417
False positives: 22
False negatives: 23
True positives: 37260


### Advanced Model

In [11]:
# Inverse-frequency weights for the two classes of the current training split.
# value_counts() on its own orders the result by frequency, so positional
# unpacking would swap the two classes whenever the larger label is the
# majority. sort_index() orders the counts by label value instead, which is
# the order in which `class_prior` is matched to the classes further down.
# NOTE(review): assumes the smaller label (0) is the "reliable" class, which
# matches the ~60 % share reported for it — confirm against the labelling step.
class_counts = y_train.value_counts().sort_index()
reliable_count, unreliable_count = class_counts
total_count = reliable_count + unreliable_count
reliable_weight = total_count / reliable_count
unreliable_weight = total_count / unreliable_count

The training set is roughly 60% reliable articles.

In [17]:
# Content-only feature matrix for the Naive Bayes model.
x_features = hstack([content])

# 80/10/10 split with the same seed as the earlier models.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x_features, df['group'], test_size=0.2, random_state=1337
)
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=1337
)
del x_holdout, y_holdout

# Multinomial Naive Bayes with the inverse-frequency weights passed as class
# priors and a small smoothing constant.
class_weights = [reliable_weight, unreliable_weight]
classifier = MultinomialNB(alpha=0.01, class_prior=class_weights).fit(x_train, y_train)

y_pred = classifier.predict(x_val)
report = classification_report(y_val, y_pred)
print(report)

# With two classes the flattened confusion matrix is (tn, fp, fn, tp).
matrix = confusion_matrix(y_val, y_pred)
tn, fp, fn, tp = matrix.ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')

              precision    recall  f1-score   support

           0       0.86      0.76      0.81     57439
           1       0.69      0.81      0.74     37283

    accuracy                           0.78     94722
   macro avg       0.78      0.79      0.78     94722
weighted avg       0.79      0.78      0.78     94722

True negatives: 43920
False positives: 13519
False negatives: 7156
True positives: 30127


In [None]:
# Title, content, keyword and author counts side by side for the Naive Bayes model.
text_blocks = [title, content, meta_keywords, authors]
x_features = hstack(text_blocks)

# 80/10/10 split with the same seed as the earlier models.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x_features, df['group'], test_size=0.2, random_state=1337
)
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=1337
)
del x_holdout, y_holdout

# Multinomial Naive Bayes with the inverse-frequency weights passed as class
# priors and a small smoothing constant.
class_weights = [reliable_weight, unreliable_weight]
classifier = MultinomialNB(alpha=0.01, class_prior=class_weights).fit(x_train, y_train)

y_pred = classifier.predict(x_val)
report = classification_report(y_val, y_pred)
print(report)

# With two classes the flattened confusion matrix is (tn, fp, fn, tp).
matrix = confusion_matrix(y_val, y_pred)
tn, fp, fn, tp = matrix.ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')

              precision    recall  f1-score   support

           0       0.93      0.86      0.90     57439
           1       0.81      0.90      0.85     37283

    accuracy                           0.88     94722
   macro avg       0.87      0.88      0.87     94722
weighted avg       0.88      0.88      0.88     94722

True negatives: 49592
False positives: 7847
False negatives: 3660
True positives: 33623
