In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,3))  # tri-gram
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# Logistic Regression with TF-IDF
# ----------------------
logreg_tfidf = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=1000)
logreg_tfidf.fit(X_train_tfidf, y_train)

train_acc_tfidf = accuracy_score(y_train, logreg_tfidf.predict(X_train_tfidf))
test_acc_tfidf = accuracy_score(y_test, logreg_tfidf.predict(X_test_tfidf))

print("Logistic Regression with TF-IDF")
print("Training Accuracy:", round(train_acc_tfidf * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_tfidf * 100, 2), "%")

# ----------------------
# Logistic Regression with Count Vectorizer
# ----------------------
logreg_count = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', max_iter=1000)
logreg_count.fit(X_train_count, y_train)

train_acc_count = accuracy_score(y_train, logreg_count.predict(X_train_count))
test_acc_count = accuracy_score(y_test, logreg_count.predict(X_test_count))

print("\nLogistic Regression with Count Vectorizer")
print("Training Accuracy:", round(train_acc_count * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_count * 100, 2), "%")


Logistic Regression with TF-IDF
Training Accuracy: 88.88 %
Testing Accuracy: 84.27 %

Logistic Regression with Count Vectorizer
Training Accuracy: 96.32 %
Testing Accuracy: 89.6 %


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,3))  # tri-gram
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# Logistic Regression with TF-IDF (weighted)
# ----------------------
logreg_tfidf = LogisticRegression(
    C=1.0, penalty='l2', solver='liblinear', max_iter=1000, class_weight='balanced'
)
logreg_tfidf.fit(X_train_tfidf, y_train)

y_train_pred_tfidf = logreg_tfidf.predict(X_train_tfidf)
y_test_pred_tfidf = logreg_tfidf.predict(X_test_tfidf)

print("Logistic Regression with TF-IDF (class_weight='balanced')")
print("Training Accuracy:", round(accuracy_score(y_train, y_train_pred_tfidf) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_test_pred_tfidf) * 100, 2), "%")
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_test_pred_tfidf))

# ----------------------
# Logistic Regression with Count Vectorizer (weighted)
# ----------------------
logreg_count = LogisticRegression(
    C=1.0, penalty='l2', solver='liblinear', max_iter=1000, class_weight='balanced'
)
logreg_count.fit(X_train_count, y_train)

y_train_pred_count = logreg_count.predict(X_train_count)
y_test_pred_count = logreg_count.predict(X_test_count)

print("\nLogistic Regression with Count Vectorizer (class_weight='balanced')")
print("Training Accuracy:", round(accuracy_score(y_train, y_train_pred_count) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_test_pred_count) * 100, 2), "%")
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_test_pred_count))


Logistic Regression with TF-IDF (class_weight='balanced')
Training Accuracy: 91.82 %
Testing Accuracy: 85.73 %

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.94      0.87      0.90      1482
           1       0.68      0.83      0.75       508

    accuracy                           0.86      1990
   macro avg       0.81      0.85      0.82      1990
weighted avg       0.87      0.86      0.86      1990


Logistic Regression with Count Vectorizer (class_weight='balanced')
Training Accuracy: 96.95 %
Testing Accuracy: 89.25 %

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1482
           1       0.78      0.81      0.79       508

    accuracy                           0.89      1990
   macro avg       0.86      0.87      0.86      1990
weighted avg       0.89      0.89      0.89      1990



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,3))  # tri-gram
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# Oversample minority class
# ----------------------
ros = RandomOverSampler(random_state=42)
X_train_tfidf_res, y_train_tfidf_res = ros.fit_resample(X_train_tfidf, y_train)
X_train_count_res, y_train_count_res = ros.fit_resample(X_train_count, y_train)

# ----------------------
# Logistic Regression with TF-IDF (oversampled)
# ----------------------
logreg_tfidf = LogisticRegression(
    C=1.0, penalty='l2', solver='liblinear', max_iter=1000
)
logreg_tfidf.fit(X_train_tfidf_res, y_train_tfidf_res)

y_train_pred_tfidf = logreg_tfidf.predict(X_train_tfidf_res)
y_test_pred_tfidf = logreg_tfidf.predict(X_test_tfidf)

print("Logistic Regression with TF-IDF (Oversampled)")
print("Training Accuracy:", round(accuracy_score(y_train_tfidf_res, y_train_pred_tfidf) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_test_pred_tfidf) * 100, 2), "%")
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_test_pred_tfidf))

# ----------------------
# Logistic Regression with Count Vectorizer (oversampled)
# ----------------------
logreg_count = LogisticRegression(
    C=1.0, penalty='l2', solver='liblinear', max_iter=1000
)
logreg_count.fit(X_train_count_res, y_train_count_res)

y_train_pred_count = logreg_count.predict(X_train_count_res)
y_test_pred_count = logreg_count.predict(X_test_count)

print("\nLogistic Regression with Count Vectorizer (Oversampled)")
print("Training Accuracy:", round(accuracy_score(y_train_count_res, y_train_pred_count) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_test_pred_count) * 100, 2), "%")
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_test_pred_count))


Logistic Regression with TF-IDF (Oversampled)
Training Accuracy: 93.57 %
Testing Accuracy: 86.43 %

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      1482
           1       0.70      0.82      0.76       508

    accuracy                           0.86      1990
   macro avg       0.82      0.85      0.83      1990
weighted avg       0.88      0.86      0.87      1990


Logistic Regression with Count Vectorizer (Oversampled)
Training Accuracy: 97.66 %
Testing Accuracy: 90.2 %

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      1482
           1       0.81      0.80      0.81       508

    accuracy                           0.90      1990
   macro avg       0.87      0.87      0.87      1990
weighted avg       0.90      0.90      0.90      1990



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Count Vectorizer
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_count, X_test_count, y_train, y_test = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# Oversample minority class
# ----------------------
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_count, y_train)

# ----------------------
# Logistic Regression (class_weight + oversampled)
# ----------------------
logreg = LogisticRegression(
    C=1.0, penalty='l2', solver='liblinear', max_iter=1000, class_weight='balanced'
)
logreg.fit(X_train_res, y_train_res)

# Predictions
y_train_pred = logreg.predict(X_train_res)
y_test_pred = logreg.predict(X_test_count)

# ----------------------
# Evaluation
# ----------------------
print("Logistic Regression with Count Vectorizer + Oversampling + Class Weights")
print("Training Accuracy:", round(accuracy_score(y_train_res, y_train_pred) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_test_pred) * 100, 2), "%")
print("\nClassification Report (Test Data):")
print(classification_report(y_test, y_test_pred))


Logistic Regression with Count Vectorizer + Oversampling + Class Weights
Training Accuracy: 97.66 %
Testing Accuracy: 90.2 %

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      1482
           1       0.81      0.80      0.81       508

    accuracy                           0.90      1990
   macro avg       0.87      0.87      0.87      1990
weighted avg       0.90      0.90      0.90      1990



In [5]:
import joblib
import os

# Define folder path
checkpoint_folder = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/LogReg_Count"

# Create folder if it doesn't exist
os.makedirs(checkpoint_folder, exist_ok=True)

# Save the trained Logistic Regression model
joblib.dump(logreg, os.path.join(checkpoint_folder, "logreg_model.pkl"))

# Save the Count Vectorizer
joblib.dump(count_vectorizer, os.path.join(checkpoint_folder, "count_vectorizer.pkl"))

print(f"Model and vectorizer saved in {checkpoint_folder}")


Model and vectorizer saved in /content/drive/MyDrive/Colab Notebooks/Checkpoints/LogReg_Count
