In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train/test split
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# -------------------------------
# Step 1: Grid Search for TF-IDF
# -------------------------------
param_grid = {
    'C': [0.1, 1, 5, 10],
    'gamma': [0.1, 0.01, 0.007],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
grid_tfidf = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_tfidf.fit(X_train_tfidf, y_train)

print("Best parameters (TF-IDF):", grid_tfidf.best_params_)

# Best was RBF with C=10, gamma=0.1 but overfit
# Reduce gamma to 0.01 for better generalization
svm_tfidf = SVC(kernel='rbf', C=10, gamma=0.01)
svm_tfidf.fit(X_train_tfidf, y_train)

train_acc_tfidf = accuracy_score(y_train, svm_tfidf.predict(X_train_tfidf))
test_acc_tfidf = accuracy_score(y_test, svm_tfidf.predict(X_test_tfidf))

print("\nSVM with TF-IDF (final tuned)")
print("Training Accuracy:", round(train_acc_tfidf * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_tfidf * 100, 2), "%")

# -------------------------------
# Step 2: Grid Search for Count Vectorizer
# -------------------------------
grid_count = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_count.fit(X_train_count, y_train)

print("\nBest parameters (Count):", grid_count.best_params_)

# Best also gave overfitting → reduce gamma and C
svm_count = SVC(kernel='rbf', C=5, gamma=0.007)
svm_count.fit(X_train_count, y_train)

train_acc_count = accuracy_score(y_train, svm_count.predict(X_train_count))
test_acc_count = accuracy_score(y_test, svm_count.predict(X_test_count))

print("\nSVM with Count Vectorizer (final tuned)")
print("Training Accuracy:", round(train_acc_count * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_count * 100, 2), "%")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters (TF-IDF): {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}

SVM with TF-IDF (final tuned)
Training Accuracy: 82.04 %
Testing Accuracy: 80.0 %
Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best parameters (Count): {'C': 5, 'gamma': 0.1, 'kernel': 'rbf'}

SVM with Count Vectorizer (final tuned)
Training Accuracy: 90.7 %
Testing Accuracy: 86.08 %


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# -------------------------------
# Load dataset
# -------------------------------
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")
texts = df['content'].astype(str)
labels = df['label']

# -------------------------------
# Vectorizers
# -------------------------------
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# -------------------------------
# Train/test split
# -------------------------------
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# -------------------------------
# Grid search parameters
# -------------------------------
param_grid = {
    'C': [0.1, 1, 5, 10],
    'gamma': [0.1, 0.01, 0.007],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'class_weight': ['balanced']  # automatically handle class imbalance
}

# -------------------------------
# Step 1: Grid Search for TF-IDF
# -------------------------------
grid_tfidf = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_tfidf.fit(X_train_tfidf, y_train)
print("Best parameters (TF-IDF):", grid_tfidf.best_params_)

# Train final model using best params
best_params_tfidf = grid_tfidf.best_params_
svm_tfidf = SVC(
    kernel=best_params_tfidf['kernel'],
    C=best_params_tfidf['C'],
    gamma=best_params_tfidf['gamma'],
    class_weight='balanced'
)
svm_tfidf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)
print("\nSVM with TF-IDF (final tuned)")
print("Training Accuracy:", round(accuracy_score(y_train, svm_tfidf.predict(X_train_tfidf)) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_tfidf) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_tfidf))

# -------------------------------
# Step 2: Grid Search for Count Vectorizer
# -------------------------------
grid_count = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_count.fit(X_train_count, y_train)
print("\nBest parameters (Count Vectorizer):", grid_count.best_params_)

best_params_count = grid_count.best_params_
svm_count = SVC(
    kernel=best_params_count['kernel'],
    C=best_params_count['C'],
    gamma=best_params_count['gamma'],
    class_weight='balanced'
)
svm_count.fit(X_train_count, y_train)

# Evaluate
y_pred_count = svm_count.predict(X_test_count)
print("\nSVM with Count Vectorizer (final tuned)")
print("Training Accuracy:", round(accuracy_score(y_train, svm_count.predict(X_train_count)) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_count) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_count))


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters (TF-IDF): {'C': 10, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'linear'}

SVM with TF-IDF (final tuned)
Training Accuracy: 99.37 %
Testing Accuracy: 91.01 %

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      1482
           1       0.85      0.79      0.82       508

    accuracy                           0.91      1990
   macro avg       0.89      0.87      0.88      1990
weighted avg       0.91      0.91      0.91      1990

Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best parameters (Count Vectorizer): {'C': 5, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'rbf'}

SVM with Count Vectorizer (final tuned)
Training Accuracy: 99.86 %
Testing Accuracy: 91.61 %

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      1482
         

In [3]:
import os
import joblib  # for saving sklearn models

# Define checkpoint dir
CHECKPOINT_DIR = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/SVM"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Save both models
tfidf_model_path = os.path.join(CHECKPOINT_DIR, "svm_tfidf.pkl")
count_model_path = os.path.join(CHECKPOINT_DIR, "svm_count.pkl")
tfidf_vectorizer_path = os.path.join(CHECKPOINT_DIR, "tfidf_vectorizer.pkl")
count_vectorizer_path = os.path.join(CHECKPOINT_DIR, "count_vectorizer.pkl")

joblib.dump(svm_tfidf, tfidf_model_path)
joblib.dump(svm_count, count_model_path)
joblib.dump(tfidf_vectorizer, tfidf_vectorizer_path)
joblib.dump(count_vectorizer, count_vectorizer_path)

print(f"SVM models and vectorizers saved to {CHECKPOINT_DIR}")


NameError: name 'svm_tfidf' is not defined

In [8]:
# Reload models
svm_tfidf = joblib.load(tfidf_model_path)
svm_count = joblib.load(count_model_path)
tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
count_vectorizer = joblib.load(count_vectorizer_path)

# Example prediction
sample_text = ["people italy died coronavirus 24hour period march thousand"]
X_tfidf_sample = tfidf_vectorizer.transform(sample_text)
prediction = svm_tfidf.predict(X_tfidf_sample)[0]

print("Prediction:", "Fake" if prediction == 0 else "Real")


Prediction: Fake


In [9]:
import os
import joblib

# Create the folder if it doesn't exist
model_dir = '/content/drive/MyDrive/Colab Notebooks/Checkpoints/SVM'
os.makedirs(model_dir, exist_ok=True)  # exist_ok=True prevents error if folder already exists

# Save TF-IDF SVM
joblib.dump(svm_tfidf, os.path.join(model_dir, 'svm_improved_tfidf.pkl'))

# Save Count Vectorizer SVM
joblib.dump(svm_count, os.path.join(model_dir, 'svm_improved_count.pkl'))

# Save the vectorizers too
joblib.dump(tfidf_vectorizer, os.path.join(model_dir, 'tfidf_improved_vectorizer.pkl'))
joblib.dump(count_vectorizer, os.path.join(model_dir, 'count_improved_vectorizer.pkl'))


['/content/drive/MyDrive/Colab Notebooks/Checkpoints/SVM/count_improved_vectorizer.pkl']