# Logistische Regression - GonzaloA

In [1]:
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifier and evaluation metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Parameters
CSV_PATH = "../../data/bigFakeNews/dataFiltered.csv"
CHUNK_SIZE = 100_000  # or tune to your memory limit
N_FEATURES = 2**20    # ~1M hashed features; tune higher/lower as needed

# Initialize
vectorizer = HashingVectorizer(
    n_features=N_FEATURES,
    alternate_sign=False,
    ngram_range=(1,2),
    stop_words='english'
)
clf = SGDClassifier(loss='log_loss', max_iter=1, tol=None, warm_start=True)


In [3]:
# First pass: partial fit on labeled chunks
classes = [0, 1]  # replace with your actual label set (e.g. ['Real', 'Fake'] → [0,1])
for chunk in pd.read_csv(CSV_PATH, sep=',', chunksize=CHUNK_SIZE):
    texts = (chunk['title'] + "\n" + chunk['text']).tolist()
    X_chunk = vectorizer.transform(texts)
    y_chunk = chunk['label'].values
    clf.partial_fit(X_chunk, y_chunk, classes=classes)

# (Optional) Second pass: evaluate on a held‑out chunk
test_chunk = next(pd.read_csv(CSV_PATH, sep=',', chunksize=CHUNK_SIZE))
X_test = vectorizer.transform((test_chunk['title'] + "\n" + test_chunk['text']).tolist())
y_test = test_chunk['label'].values
preds = clf.predict(X_test)
print("Chunk Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

Chunk Accuracy: 0.93711
              precision    recall  f1-score   support

           0       0.97      0.93      0.95     64929
           1       0.88      0.95      0.91     35071

    accuracy                           0.94    100000
   macro avg       0.93      0.94      0.93    100000
weighted avg       0.94      0.94      0.94    100000



In [4]:
import joblib

# Save classifier and vectorizer separately
joblib.dump(clf, 'sgd_news_classifier.pkl')
joblib.dump(vectorizer, 'hashing_vectorizer.pkl')

['hashing_vectorizer.pkl']