# Local Outlier Factor (LOF) Hyperparameter Tuning
## Local Outlier Factor

### TODO: Explain LOF

# Grid Search

In [4]:
# 1. Imports
import pandas as pd
import numpy as np

# Train/validation split
from sklearn.model_selection import train_test_split

# Text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# One-class classifier and evaluation metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pipeline utilities
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import LocalOutlierFactor
import os

from scipy.stats import randint, uniform
from sklearn.experimental import enable_halving_search_cv  # Needed to enable
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# 1) Load the WELFake dataset and reduce it to the columns used below.
path = os.path.join(
    "../../../data",
    "Saurabh Shahane - Fake_News_Classification",
    "WELFake_Dataset.csv",
)

# Work on a reproducible 10% sample with lower-case column names.
df = (
    pd.read_csv(path)
    .sample(frac=0.1, random_state=42)
    .rename(columns={'Title': 'title', 'Text': 'text', 'Label': 'label'})
)

# Keep only rows that have both a title and a body text.
df = df[df['title'].notna() & df['text'].notna()]

# Flip labels: 0 → 1, 1 → 0
df['label'] = 1 - df['label'].astype(int)
df = df[['title', 'text', 'label']]

# Model input is the title and the body joined by a newline; target is the label.
X = df['title'] + '\n' + df['text']
y = df['label']

# 2) Stratified 70/15/15 split.
# First hold out 30% of the samples, keeping the label proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
# Then cut the held-out part in half: validation and test (15% of the data each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# 3) Build the pipeline: TF-IDF features followed by a Local Outlier Factor model
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    # novelty=True is what allows LOF to predict on data it was not fitted on
    ('lof', LocalOutlierFactor(novelty=True)),
])

# 4) Evaluation function
def evaluate_lof_params(params):
    """Fit TF-IDF + LOF (novelty mode) on the training split and score it on validation.

    Reads the module-level ``X_train``, ``X_val`` and ``y_val``.

    Args:
        params: Mapping with the keys ``'tfidf__max_features'``,
            ``'lof__n_neighbors'`` and ``'lof__contamination'``.
            ``'tfidf__ngram_range'`` is optional and defaults to ``(1, 1)``.

    Returns:
        Accuracy of the predicted labels for ``X_val`` measured against ``y_val``.
    """
    vectorizer = TfidfVectorizer(
        max_features=params['tfidf__max_features'],
        ngram_range=params.get('tfidf__ngram_range', (1, 1)),
        stop_words='english',
    )
    detector = LocalOutlierFactor(
        n_neighbors=params['lof__n_neighbors'],
        contamination=params['lof__contamination'],
        novelty=True,
    )

    # NOTE(review): an earlier comment here said "fit on real-only training",
    # but the complete X_train (both classes) is used for fitting. If the
    # detector is meant to learn from one class only, X_train has to be
    # filtered by y_train first -- confirm the intent.
    train_features = vectorizer.fit_transform(X_train)

    # LOF accepts sparse input for fit and predict, so the TF-IDF matrices are
    # passed as-is; a dense copy of a documents x max_features matrix would
    # only cost memory.
    detector.fit(train_features)

    val_features = vectorizer.transform(X_val)
    raw = detector.predict(val_features)  # +1 = inlier -> Real, -1 = outlier -> Fake
    y_pred = np.where(raw == 1, 1, 0)

    return accuracy_score(y_val, y_pred)

In [None]:
# Search space: one list of candidate values per hyperparameter.
param_grid = {
    # TF-IDF vocabulary size cap
    'tfidf__max_features': [20_000, 30_000],
    # neighbourhood size used by LOF
    'lof__n_neighbors': [20, 40],
    # LOF contamination setting
    'lof__contamination': [0.4, 0.5],
}

In [None]:
# Exhaustive search over param_grid: evaluate every combination and keep the
# first one that reaches the highest validation accuracy.
best_score = -1
best_params = None
total = np.prod([len(v) for v in param_grid.values()])
i = 0  # stays 0 when the grid is empty

# Same visiting order as three nested loops: max_features outermost,
# contamination innermost.
candidates = [
    (mf, nn, cont)
    for mf in param_grid['tfidf__max_features']
    for nn in param_grid['lof__n_neighbors']
    for cont in param_grid['lof__contamination']
]

for i, (mf, nn, cont) in enumerate(candidates, start=1):
    p = {
        'tfidf__max_features': mf,
        'tfidf__ngram_range': (1,1),
        'lof__n_neighbors': nn,
        'lof__contamination': cont
    }
    score = evaluate_lof_params(p)
    print(f"[{i}/{total}] {p} → Val acc: {score:.4f}")
    # Strict comparison: ties keep the earlier combination.
    if score > best_score:
        best_score = score
        best_params = dict(p)

print("\n✅ Best validation accuracy:", best_score)
print("📋 Best parameters:", best_params)