In [None]:

import joblib

# Load models
knn_loaded = joblib.load("/content/drive/MyDrive/Colab Notebooks/Stored/KNN/knn_tfidf_model.pkl")
knn_count_loaded = joblib.load("/content/drive/MyDrive/Colab Notebooks/Stored/KNN/knn_count_model.pkl")

# Load vectorizers
tfidf_loaded = joblib.load("/content/drive/MyDrive/Colab Notebooks/Stored/KNN/tfidf_vectorizer.pkl")
count_vec_loaded = joblib.load("/content/drive/MyDrive/Colab Notebooks/Stored/KNN/count_vectorizer.pkl")

# Example prediction with TF-IDF KNN
new_text = ["""The World Health Organization declared COVID-19 a global pandemic on March 11, 2020, after the coronavirus spread rapidly across multiple continents. Governments around the world responded by implementing lockdowns, social distancing, and mask mandates to slow transmission.

By late 2020, pharmaceutical companies including Pfizer-BioNTech and Moderna developed mRNA vaccines, which were granted emergency use authorization in several countries. Large-scale vaccination campaigns began in December 2020, marking a major milestone in the global fight against the pandemic."""]
X_new = tfidf_loaded.transform(new_text)
prediction = knn_loaded.predict(X_new)

print("Prediction:", prediction)  # 0 = Fake, 1 = Real


Prediction: [0]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:




# Load CSV
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")


# Drop rows where 'content' or 'label' is missing
df = df.dropna(subset=["content", "label"])

df.to_csv("ankasa.csv", index=False)



import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract content and labels
texts = df['content'].astype(str)  # ensure text format
labels = df['label']

# Initialize vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # limit features if needed
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Fit and transform data
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

print("TF-IDF Shape:", X_tfidf.shape)
print("Count Vectorizer Shape:", X_count.shape)

# Example: check vocabulary sizes
print("TF-IDF Vocabulary Size:", len(tfidf_vectorizer.vocabulary_))
print("Count Vocabulary Size:", len(count_vectorizer.vocabulary_))



TF-IDF Shape: (9948, 5000)
Count Vectorizer Shape: (9948, 5000)
TF-IDF Vocabulary Size: 5000
Count Vocabulary Size: 5000


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# Grid Search for K (TF-IDF)
# ----------------------
param_grid = {'n_neighbors': [5, 10, 20, 50, 100]}
grid_tfidf = GridSearchCV(KNeighborsClassifier(n_jobs=-1), param_grid, cv=3, scoring='accuracy')
grid_tfidf.fit(X_train_tfidf, y_train)
print("Best K (TF-IDF):", grid_tfidf.best_params_)
print("Best CV Accuracy (TF-IDF):", grid_tfidf.best_score_)

# ----------------------
# Final KNN (TF-IDF, K=10)
# ----------------------
knn_tfidf = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
knn_tfidf.fit(X_train_tfidf, y_train)

train_acc_tfidf = accuracy_score(y_train, knn_tfidf.predict(X_train_tfidf))
test_acc_tfidf = accuracy_score(y_test, knn_tfidf.predict(X_test_tfidf))

print("\nKNN with TF-IDF")
print("Training Accuracy:", round(train_acc_tfidf * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_tfidf * 100, 2), "%")

# ----------------------
# Final KNN (Count Vectorizer, K=10)
# ----------------------
knn_count = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
knn_count.fit(X_train_count, y_train)

train_acc_count = accuracy_score(y_train, knn_count.predict(X_train_count))
test_acc_count = accuracy_score(y_test, knn_count.predict(X_test_count))

print("\nKNN with Count Vectorizer")
print("Training Accuracy:", round(train_acc_count * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_count * 100, 2), "%")


Best K (TF-IDF): {'n_neighbors': 5}
Best CV Accuracy (TF-IDF): 0.8051018255891762

KNN with TF-IDF
Training Accuracy: 82.16 %
Testing Accuracy: 80.4 %

KNN with Count Vectorizer
Training Accuracy: 76.35 %
Testing Accuracy: 75.63 %


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# SMOTE + KNN Pipeline (TF-IDF)
# ----------------------
smote = SMOTE(random_state=42)

knn_tfidf = KNeighborsClassifier(n_neighbors=10, weights='distance', n_jobs=-1)
pipeline_tfidf = Pipeline([
    ('smote', smote),
    ('knn', knn_tfidf)
])

pipeline_tfidf.fit(X_train_tfidf, y_train)

y_pred_train_tfidf = pipeline_tfidf.predict(X_train_tfidf)
y_pred_test_tfidf = pipeline_tfidf.predict(X_test_tfidf)

print("\nKNN with TF-IDF + SMOTE + distance weighting")
print("Training Accuracy:", round(accuracy_score(y_train, y_pred_train_tfidf) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_test_tfidf) * 100, 2), "%")
print("Testing F1-Score:", round(f1_score(y_test, y_pred_test_tfidf) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_test_tfidf))

# ----------------------
# SMOTE + KNN Pipeline (Count Vectorizer)
# ----------------------
knn_count = KNeighborsClassifier(n_neighbors=10, weights='distance', n_jobs=-1)
pipeline_count = Pipeline([
    ('smote', smote),
    ('knn', knn_count)
])

pipeline_count.fit(X_train_count, y_train)

y_pred_train_count = pipeline_count.predict(X_train_count)
y_pred_test_count = pipeline_count.predict(X_test_count)

print("\nKNN with Count Vectorizer + SMOTE + distance weighting")
print("Training Accuracy:", round(accuracy_score(y_train, y_pred_train_count) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_test_count) * 100, 2), "%")
print("Testing F1-Score:", round(f1_score(y_test, y_pred_test_count) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_test_count))



KNN with TF-IDF + SMOTE + distance weighting
Training Accuracy: 99.96 %
Testing Accuracy: 84.72 %
Testing F1-Score: 76.79 %

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89      1482
           1       0.63      0.99      0.77       508

    accuracy                           0.85      1990
   macro avg       0.81      0.89      0.83      1990
weighted avg       0.90      0.85      0.86      1990


KNN with Count Vectorizer + SMOTE + distance weighting
Training Accuracy: 99.72 %
Testing Accuracy: 85.13 %
Testing F1-Score: 77.16 %

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.81      0.89      1482
           1       0.63      0.98      0.77       508

    accuracy                           0.85      1990
   macro avg       0.81      0.89      0.83      1990
weighted avg       0.90      0.85      0.86      1990



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# ----------------------
# Load dataset
# ----------------------
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")
texts = df['content'].astype(str)
labels = df['label']

# ----------------------
# Vectorizers
# ----------------------
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# ----------------------
# Train-test split (80/20)
# ----------------------
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# GridSearchCV Parameters
# Only use metrics compatible with sparse input
# ----------------------
param_grid = {
    'knn__n_neighbors': [5, 10, 20, 50, 100],
    'knn__metric': ['euclidean', 'manhattan']
}

# ----------------------
# SMOTE + KNN Pipeline Function
# ----------------------
def run_knn_pipeline(X_train, X_test, y_train, y_test, vectorizer_name="TF-IDF"):
    smote = SMOTE(random_state=42)
    knn = KNeighborsClassifier(weights='distance', n_jobs=-1)

    pipeline = Pipeline([
        ('smote', smote),
        ('knn', knn)
    ])

    grid = GridSearchCV(
        pipeline,
        param_grid,
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=2
    )

    grid.fit(X_train, y_train)

    # Evaluation
    y_pred = grid.predict(X_test)
    print(f"\nBest Parameters ({vectorizer_name}): {grid.best_params_}")
    print(f"Best CV F1-Score ({vectorizer_name}): {grid.best_score_:.2f}")
    print(f"{vectorizer_name} Test Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"{vectorizer_name} Test F1-Score: {f1_score(y_test, y_pred):.2f}")
    print(f"\nClassification Report ({vectorizer_name}):\n{classification_report(y_test, y_pred)}")

# ----------------------
# Run TF-IDF KNN
# ----------------------
run_knn_pipeline(X_train_tfidf, X_test_tfidf, y_train, y_test, vectorizer_name="TF-IDF")

# ----------------------
# Run Count Vectorizer KNN
# ----------------------
run_knn_pipeline(X_train_count, X_test_count, y_train, y_test, vectorizer_name="Count")


Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best Parameters (TF-IDF): {'knn__metric': 'euclidean', 'knn__n_neighbors': 5}
Best CV F1-Score (TF-IDF): 0.66
TF-IDF Test Accuracy: 0.86
TF-IDF Test F1-Score: 0.79

Classification Report (TF-IDF):
              precision    recall  f1-score   support

           0       0.99      0.82      0.90      1482
           1       0.65      0.99      0.79       508

    accuracy                           0.86      1990
   macro avg       0.82      0.90      0.84      1990
weighted avg       0.91      0.86      0.87      1990

Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best Parameters (Count): {'knn__metric': 'manhattan', 'knn__n_neighbors': 5}
Best CV F1-Score (Count): 0.62
Count Test Accuracy: 0.86
Count Test F1-Score: 0.79

Classification Report (Count):
              precision    recall  f1-score   support

           0       0.99      0.82      0.90      1482
           1       0.66      0.98      0.79       5

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# ----------------------
# Load dataset
# ----------------------
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")
texts = df['content'].astype(str)
labels = df['label']

# ----------------------
# TF-IDF Vectorization
# ----------------------
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# ----------------------
# Train-test split (80/20)
# ----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# SMOTE + Random Forest Pipeline
# ----------------------
smote = SMOTE(random_state=42)
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

pipeline = Pipeline([
    ('smote', smote),
    ('rf', rf)
])

# ----------------------
# GridSearchCV Hyperparameters
# ----------------------
param_grid = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=2
)

# ----------------------
# Fit the model
# ----------------------
grid.fit(X_train, y_train)

# ----------------------
# Evaluation
# ----------------------
y_pred = grid.predict(X_test)
print("Best Parameters (Random Forest):", grid.best_params_)
print("Best CV F1-Score:", round(grid.best_score_ * 100, 2), "%")
print("Test Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("Test F1-Score:", round(f1_score(y_test, y_pred) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters (Random Forest): {'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 500}
Best CV F1-Score: 74.24 %
Test Accuracy: 91.46 %
Test F1-Score: 81.8 %

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      1482
           1       0.90      0.75      0.82       508

    accuracy                           0.91      1990
   macro avg       0.91      0.86      0.88      1990
weighted avg       0.91      0.91      0.91      1990



In [6]:
import os
import joblib

# ----------------------
# Define checkpoint folder
# ----------------------
CHECKPOINT_DIR = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/RF"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# ----------------------
# Save the trained model
# ----------------------
model_path = os.path.join(CHECKPOINT_DIR, "rf_smote_tfidf.pkl")
joblib.dump(grid.best_estimator_, model_path)

print(f"Model saved at: {model_path}")


Model saved at: /content/drive/MyDrive/Colab Notebooks/Checkpoints/RF/rf_smote_tfidf.pkl


In [9]:
import joblib
from sklearn.pipeline import Pipeline

# After training RF with SMOTE
# Assume `grid.best_estimator_` is your SMOTE + RF pipeline

# Extract the trained Random Forest
trained_rf = grid.best_estimator_['rf']  # King, this is the RF classifier after SMOTE training

# Save TF-IDF vectorizer separately
tfidf_vectorizer = grid.best_estimator_['smote'].fit_resample  # Actually keep your original TF-IDF
# But safer: just save the vectorizer you already used
tfidf_vectorizer_path = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/RF/tfidf_vectorizer.pkl"
joblib.dump(tfidf_vectorizer, tfidf_vectorizer_path)

# Create a clean inference pipeline: TF-IDF + trained RF
inference_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('rf', trained_rf)
])

# Save the inference pipeline
inference_model_path = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/RF/rf_inference.pkl"
joblib.dump(inference_pipeline, inference_model_path)

print("Inference pipeline saved at:", inference_model_path)


Inference pipeline saved at: /content/drive/MyDrive/Colab Notebooks/Checkpoints/RF/rf_inference.pkl


In [11]:
import joblib
from sklearn.pipeline import Pipeline

# Paths
CHECKPOINT_DIR = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/RF"
tfidf_vectorizer_path = f"{CHECKPOINT_DIR}/tfidf_vectorizer.pkl"  # Your fitted TF-IDF
inference_model_path = f"{CHECKPOINT_DIR}/rf_inference.pkl"

# Load the TF-IDF vectorizer
tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)

# Extract the trained RF classifier from your SMOTE pipeline
trained_rf = grid.best_estimator_['rf']  # The Random Forest classifier

# Build a proper inference pipeline
inference_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('rf', trained_rf)
])

# Save the inference pipeline
joblib.dump(inference_pipeline, inference_model_path)
print("Inference pipeline saved at:", inference_model_path)

# ----------------------
# Predict
# ----------------------
model = joblib.load(inference_model_path)
new_text = ["Coronavirus is a hoax"]
prediction = model.predict(new_text)
prediction_proba = model.predict_proba(new_text)

print("Predicted label:", prediction[0])
print("Prediction probabilities:", prediction_proba[0])


Inference pipeline saved at: /content/drive/MyDrive/Colab Notebooks/Checkpoints/RF/rf_inference.pkl


AttributeError: 'function' object has no attribute 'transform'