In [9]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

In [None]:
df = pd.concat([pd.read_parquet(f"data/chunks/chunk_{j}.parquet") for j in (range(1, 20))])

In [4]:
# Turning lists back into strings
df['content'] = df['content'].apply(lambda x: ' '.join(x))

### Simple model

In [5]:
# Creating the vocabulary matrix
vectorizer = CountVectorizer(max_features=10000)
x = vectorizer.fit_transform(df['content'])

In [6]:
# Splitting 80/10/10
x_train, x_test, y_train, y_test = train_test_split(x, df['group'], test_size=0.2, random_state=1337)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1337)

In [None]:
classifier = LogisticRegression(max_iter=1000)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.90      0.87     54736
           1       0.82      0.74      0.78     35487

    accuracy                           0.83     90223
   macro avg       0.83      0.82      0.82     90223
weighted avg       0.83      0.83      0.83     90223

True negatives: 49098
False positives: 5638
False negatives: 9314
True positives: 26173


In [None]:
(tn, fp, fn, tp) = confusion_matrix(y_val, y_pred).ravel()
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print(f'True positives: {tp}')

### Advanced model

Nedenfor har jeg prøvet med flere features. Som jeg nævnte er det næsten samme resultat, og det er gået op for mig hvorfor. 
De ekstra features jeg har prøvet, altså antallet af de forskellige tokens, er i forvejen indeholdt i matrixen, så der er ingen egentlig ekstra information at træne på. D'oh! 

In [11]:
text_features = vectorizer.fit_transform(df['content'])
numerical_features = df[['num_count', 'url_count', 'email_count', 'date_count']]

scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)

x_features = hstack([text_features, scaled_numerical_features])

x_train, x_test, y_train, y_test = train_test_split(x_features, df['group'], test_size=0.2, random_state=1337)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1337)

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87     54736
           1       0.82      0.74      0.78     35487

    accuracy                           0.83     90223
   macro avg       0.83      0.82      0.82     90223
weighted avg       0.83      0.83      0.83     90223



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Including the length of the articles and the number of distinct words in the article improves the f1 score by 1%.

In [12]:
text_features = vectorizer.fit_transform(df['content'])
numerical_features = df[['length', 'length_distinct_words']]

scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)

x_features = hstack([text_features, scaled_numerical_features])

x_train, x_test, y_train, y_test = train_test_split(x_features, df['group'], test_size=0.2, random_state=1337)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=1337)

classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87     54736
           1       0.82      0.76      0.79     35487

    accuracy                           0.84     90223
   macro avg       0.83      0.82      0.83     90223
weighted avg       0.84      0.84      0.84     90223

