In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string
import os
import zipfile
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from gensim.models import Word2Vec

In [2]:
# Ensure necessary NLTK resources are downloaded.
# The recorded output shows each call only logs "already up-to-date" when the
# package is present, so re-running this cell is safe.
nltk.download('punkt')      # tokenizer data (word_tokenize is used further down)
nltk.download('stopwords')  # read via stopwords.words("english")
nltk.download('wordnet')    # backing data for WordNetLemmatizer
nltk.download('punkt_tab')  # NOTE(review): presumably needed by newer NLTK tokenizers - confirm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load the small English spaCy pipeline, installing it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raised OSError: treat it as "model package not installed".
    import subprocess
    import sys

    # Run the download with the kernel's own interpreter (sys.executable) so the
    # model lands in the environment this notebook is running in; a bare "python"
    # may resolve to a different installation. check=True surfaces a failed
    # download here instead of as a second, less obvious OSError below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

In [4]:
# Shared helpers for preprocessing: the English stopword set (a set gives O(1)
# membership checks) and a single reusable WordNet lemmatizer instance.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [5]:
# Function to clean raw text data
def clean_text(text):
    """Normalize a raw string into lowercase words separated by single spaces.

    Steps, in order: lowercase, collapse whitespace, drop ``[...]`` spans,
    drop URLs, drop ``<...>`` tags, drop every character that is neither a
    word character nor whitespace, drop digits, then collapse whitespace
    again and trim both ends.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None``
            or a float NaN from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Collapse whitespace first so newlines become spaces and the non-greedy
    # bracket/tag patterns below can match across original line breaks.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # The removals above leave gaps (double, leading, trailing spaces);
    # normalize once more so downstream length counts are not inflated.
    return re.sub(r'\s+', ' ', text).strip()

In [6]:
# Function for tokenization, stopword removal, and lemmatization
def preprocess_text(text):
    """Tokenize ``text``, drop stopwords, lemmatize, and re-join with spaces.

    Tokens come from ``word_tokenize``; any token found in the module-level
    ``stop_words`` set is discarded and the rest are passed through the
    module-level ``lemmatizer``.

    Args:
        text: String to process (the notebook passes ``clean_text`` output).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return " ".join(kept_lemmas)

In [7]:
# Function for sentiment analysis
def sentiment_score(text):
    """Return the TextBlob polarity of ``text``.

    Args:
        text: String to score.

    Returns:
        The ``polarity`` field of ``TextBlob(text).sentiment``.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [8]:
# Function for word embedding using Word2Vec
def train_word2vec(corpus, vector_size=100, window=5, min_count=1, workers=4):
    """Train a gensim Word2Vec model on an iterable of documents.

    Each document is tokenized with ``word_tokenize`` before training.

    Args:
        corpus: Iterable of document strings.
        vector_size: Forwarded to ``Word2Vec(vector_size=...)``.
        window: Forwarded to ``Word2Vec(window=...)``.
        min_count: Forwarded to ``Word2Vec(min_count=...)``.
        workers: Forwarded to ``Word2Vec(workers=...)``.

    The defaults reproduce the values that were previously hard-coded, so
    existing ``train_word2vec(corpus)`` calls behave as before.

    Returns:
        The trained ``Word2Vec`` model.
    """
    tokenized_corpus = [word_tokenize(doc) for doc in corpus]
    return Word2Vec(
        sentences=tokenized_corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

In [9]:
# Raw GitHub URLs of the real-time news CSV exports
github_csv_urls = [
    "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/More_newsdata_io_articles.csv",
    "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/nes_api_data.csv",
    "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/news_data%20(1).csv",
    "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/news_data%20(2).csv",
    "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/newsdata_io_articles.csv"
]

# Fetch each CSV in turn; a file that fails to load is reported and skipped
# so the remaining files are still collected.
all_dataframes = []
for url in github_csv_urls:
    try:
        df_temp = pd.read_csv(url)  # comma-delimited by default
        all_dataframes.append(df_temp)
        print(f"Successfully read {url}, shape: {df_temp.shape}")
    except Exception as e:
        print(f"Error loading {url}: {e}")

if not all_dataframes:
    print("No DataFrames were loaded.")
else:
    # Stack the frames row-wise; columns missing from a given source become NaN
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"Combined DataFrame shape: {combined_df.shape}")
    print("Columns in combined DataFrame:", combined_df.columns.tolist())

    # Persist the union so later cells can reload it from disk
    combined_csv_filename = "combined_data.csv"
    combined_df.to_csv(combined_csv_filename, index=False)
    print(f"All CSVs combined into {combined_csv_filename}!")


Successfully read https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/More_newsdata_io_articles.csv, shape: (10, 25)
Successfully read https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/nes_api_data.csv, shape: (94, 8)
Successfully read https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/news_data%20(1).csv, shape: (19, 6)
Successfully read https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/news_data%20(2).csv, shape: (44, 6)
Successfully read https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Real%20Time%20Dataset/newsdata_io_articles.csv, shape: (10, 25)
Combined DataFrame shape: (177, 31)
Columns in combined DataFrame: ['article_id', 'title', 'link', 'keywords', 'creator', 'video_url', 'descr

In [10]:
# https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Fake News Real_time Data/politifact_fact_checks.csv
# https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Fake News Real_time Data/politifact_fake_news.csv

In [11]:
import pandas as pd

# Raw GitHub URLs of the PolitiFact-derived CSVs used as the "fake" class
fake_dataset_urls = [
    "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Fake%20News%20Real_time%20Data/politifact_fact_checks.csv",
    "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Fake%20News%20Real_time%20Data/politifact_fake_news.csv"
]

# Fetch each CSV in turn; a file that fails to load is reported and skipped
all_fake_dfs = []
for url in fake_dataset_urls:
    try:
        df_temp = pd.read_csv(url)  # default comma delimiter
        all_fake_dfs.append(df_temp)
        print(f"Successfully read {url}, shape: {df_temp.shape}")
    except Exception as e:
        print(f"Error loading {url}: {e}")

if not all_fake_dfs:
    print("No fake DataFrames were loaded.")
else:
    # Stack the frames row-wise into a single DataFrame
    combined_fake_df = pd.concat(all_fake_dfs, ignore_index=True)
    print(f"Combined Fake DataFrame shape: {combined_fake_df.shape}")
    print("Columns in combined Fake DataFrame:", combined_fake_df.columns.tolist())

    # Persist the result so the merge step can reload it from disk
    combined_fake_filename = "combined_fake_data.csv"
    combined_fake_df.to_csv(combined_fake_filename, index=False)
    print(f"All fake CSVs combined into {combined_fake_filename}!")


Successfully read https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Fake%20News%20Real_time%20Data/politifact_fact_checks.csv, shape: (90, 4)
Successfully read https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Fake%20News%20Real_time%20Data/politifact_fake_news.csv, shape: (90, 3)
Combined Fake DataFrame shape: (180, 4)
Columns in combined Fake DataFrame: ['headline', 'link', 'speaker_date', 'rating']
All fake CSVs combined into combined_fake_data.csv!


In [12]:
import pandas as pd

# Local CSV file names for fake and real news datasets
fake_news_file = "combined_fake_data.csv"
real_news_file = "combined_data.csv"

try:
    # Read the CSV files
    fake_df = pd.read_csv(fake_news_file)
    real_df = pd.read_csv(real_news_file)

    # Inspect columns for debugging
    print("Fake news columns:", fake_df.columns.tolist())
    print("Real news columns:", real_df.columns.tolist())

    # ---- Fake news -------------------------------------------------------
    # No natural key in this source, so fall back to the row number.
    if "id" not in fake_df.columns:
        fake_df["id"] = fake_df.index
    # Rename columns to the shared schema
    fake_df = fake_df.rename(columns={"headline": "title", "link": "news_url"})
    # NOTE(review): every row is labelled fake (1) regardless of the `rating`
    # column that one of the source files carries - confirm that is intended.
    fake_df["label"] = 1
    fake_df = fake_df[["id", "news_url", "title", "label"]]

    # ---- Real news -------------------------------------------------------
    # 'article_id' becomes 'id', and 'link' becomes 'news_url'
    real_df = real_df.rename(columns={"article_id": "id", "link": "news_url"})

    # Some source files store the article address under `url` instead of
    # `link`; without this fallback those rows end up with an empty news_url.
    if "url" in real_df.columns:
        if "news_url" in real_df.columns:
            real_df["news_url"] = real_df["news_url"].fillna(real_df["url"])
        else:
            real_df["news_url"] = real_df["url"]

    # Only some sources provide an article id. Fill the gaps with the row
    # number (the same fallback used when the column is absent altogether).
    # These ids are unique within the real-news frame only.
    if "id" not in real_df.columns:
        real_df["id"] = real_df.index
    else:
        row_numbers = pd.Series(real_df.index, index=real_df.index)
        real_df["id"] = real_df["id"].astype(object).where(
            real_df["id"].notna(), row_numbers
        )

    # Assign real news label
    real_df["label"] = 0
    real_df = real_df[["id", "news_url", "title", "label"]]

    # Merge the two datasets
    combined_df = pd.concat([fake_df, real_df], ignore_index=True)

    print("FakeNewsNet dataset loaded successfully!")
    print("Combined dataset shape:", combined_df.shape)
    print(combined_df.head())

    # Save the combined DataFrame to a CSV file
    combined_csv_filename = "FakeNewsNet_combined.csv"
    combined_df.to_csv(combined_csv_filename, index=False)
    print(f"Merged dataset saved to {combined_csv_filename}!")

except Exception as e:
    print(f"Error loading FakeNewsNet dataset: {e}")
    # Later cells read FakeNewsNet_combined.csv; stopping here prevents them
    # from silently running against a missing or stale file.
    raise


Fake news columns: ['headline', 'link', 'speaker_date', 'rating']
Real news columns: ['article_id', 'title', 'link', 'keywords', 'creator', 'video_url', 'description', 'content', 'pubDate', 'pubDateTZ', 'image_url', 'source_id', 'source_priority', 'source_name', 'source_url', 'source_icon', 'language', 'country', 'category', 'ai_tag', 'sentiment', 'sentiment_stats', 'ai_region', 'ai_org', 'duplicate', 'source', 'author', 'url', 'urlToImage', 'publishedAt', 'published_at']
FakeNewsNet dataset loaded successfully!
Combined dataset shape: (357, 4)
  id                                           news_url  \
0  0  https://www.politifact.com/personalities/insta...   
1  1  https://www.politifact.com/personalities/viral...   
2  2  https://www.politifact.com/personalities/faceb...   
3  3  https://www.politifact.com/personalities/elon-...   
4  4  https://www.politifact.com/personalities/faceb...   

                                               title  label  
0  “Delta plane that crashed in 

In [13]:
# Path of the merged fake/real dataset written by the merge step
all_data = "FakeNewsNet_combined.csv"

In [14]:
# Path of the merged fake/real dataset
all_data = "FakeNewsNet_combined.csv"

# Load it into the working DataFrame and preview the first rows
# (head() shows 5 rows by default)
df = pd.read_csv(all_data)
df.head()


Unnamed: 0,id,news_url,title,label
0,0,https://www.politifact.com/personalities/insta...,“Delta plane that crashed in Toronto operated ...,1
1,1,https://www.politifact.com/personalities/viral...,"Says Luigi Mangione posted on X, “He who saves...",1
2,2,https://www.politifact.com/personalities/faceb...,“The media is showing how Obama’s birth certif...,1
3,3,https://www.politifact.com/personalities/elon-...,President Joe Biden’s immigration policy was “...,1
4,4,https://www.politifact.com/personalities/faceb...,El congresista Adriano Espaillat muestra “La n...,1


In [15]:
# Summary statistics; in the recorded output only `label` is reported, with a
# mean near 0.5, i.e. the two classes are roughly balanced.
df.describe()

Unnamed: 0,label
count,357.0
mean,0.504202
std,0.500684
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [16]:
# df.dropna()

In [17]:
# Derive the text columns step by step: raw title -> cleaned -> lemmatized,
# then score polarity on the lemmatized text.
# NOTE(review): preprocess_text removes stopwords before scoring, which
# presumably includes negations such as "not" - confirm this is acceptable
# for the sentiment feature.
df["clean_text"] = df["title"].map(clean_text)
df["processed_text"] = df["clean_text"].map(preprocess_text)
df["sentiment"] = df["processed_text"].map(sentiment_score)

print("Data processing completed successfully!")
preview_columns = ["title", "clean_text", "processed_text", "sentiment"]
print(df[preview_columns].head())

Data processing completed successfully!
                                               title  \
0  “Delta plane that crashed in Toronto operated ...   
1  Says Luigi Mangione posted on X, “He who saves...   
2  “The media is showing how Obama’s birth certif...   
3  President Joe Biden’s immigration policy was “...   
4  El congresista Adriano Espaillat muestra “La n...   

                                          clean_text  \
0  delta plane that crashed in toronto operated b...   
1  says luigi mangione posted on x he who saves h...   
2  the media is showing how obamas birth certific...   
3  president joe bidens immigration policy was a ...   
4  el congresista adriano espaillat muestra la nu...   

                                      processed_text  sentiment  
0  delta plane crashed toronto operated endeavor ...        0.0  
1  say luigi mangione posted x save country viola...        0.0  
2  medium showing obamas birth certificate change...       -0.1  
3  president joe biden

FEATURE ENGINEERING

In [18]:
# Word count: number of whitespace-separated tokens in the cleaned text
df['word_count'] = df['clean_text'].apply(str.split).apply(len)

# Character count: length of the cleaned text, spaces included
df['char_count'] = df['clean_text'].apply(len)


In [19]:
# Average word length
def _mean_word_length(text):
    """Return the mean number of characters per whitespace-separated word.

    Returns 0.0 for text that contains no words, so no division by zero can
    occur.
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)


# Computed from the words themselves: the earlier char_count / (word_count + 1)
# form counted the separating spaces as characters and divided by one word too
# many, so it did not equal the average word length.
df['avg_word_length'] = df['clean_text'].apply(_mean_word_length)


In [20]:
# Stopword count: how many tokens of the cleaned text are English stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

df['stopword_count'] = df['clean_text'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_words)
)


In [21]:
# Subjectivity
def _subjectivity(text):
    """Return the TextBlob subjectivity score of ``text``."""
    return TextBlob(text).sentiment.subjectivity


df['subjectivity'] = df['processed_text'].apply(_subjectivity)


In [22]:
# Named entity count
def count_named_entities(text):
    """Return the number of named entities spaCy finds in ``text``.

    Args:
        text: Text to analyze. Non-string or empty values (e.g. a missing
            title) count as zero entities.

    Returns:
        ``len(doc.ents)`` for the parsed text, or 0 for missing input.
    """
    if not isinstance(text, str) or not text:
        return 0
    return len(nlp(text).ents)


# Run NER on the original headline rather than `processed_text`: the processed
# column is lowercased, stripped of punctuation and stopwords, and lemmatized,
# which removes the capitalization and sentence context the model uses.
df['entity_count'] = df['title'].apply(count_named_entities)


In [23]:
# POS features
def pos_counts(text):
    """Count nouns, verbs and adjectives in ``text`` using spaCy POS tags.

    Args:
        text: Text to analyze. Non-string or empty values (e.g. a missing
            title) yield all-zero counts.

    Returns:
        Tuple ``(noun_count, verb_count, adj_count)`` counting tokens whose
        ``pos_`` is ``'NOUN'``, ``'VERB'`` and ``'ADJ'`` respectively.
    """
    if not isinstance(text, str) or not text:
        return 0, 0, 0

    noun_count = verb_count = adj_count = 0
    # Single pass over the parsed tokens instead of one pass per tag
    for token in nlp(text):
        tag = token.pos_
        if tag == 'NOUN':
            noun_count += 1
        elif tag == 'VERB':
            verb_count += 1
        elif tag == 'ADJ':
            adj_count += 1
    return noun_count, verb_count, adj_count


# Tag the original headline rather than `processed_text`: the tagger is given
# natural sentences instead of lowercased, stopword-stripped lemmas.
# Building one DataFrame from the list of tuples avoids constructing a
# pd.Series per row.
_pos_columns = ['noun_count', 'verb_count', 'adj_count']
df[_pos_columns] = pd.DataFrame(
    df['title'].apply(pos_counts).tolist(),
    index=df.index,
    columns=_pos_columns,
)


In [24]:
# Count of uppercase words
# Must be computed on the raw title: `clean_text` has already been lowercased
# by clean_text(), so counting there always produced 0. Missing titles
# (non-string values) count as 0.
df['uppercase_count'] = df['title'].apply(
    lambda x: sum(1 for word in x.split() if word.isupper()) if isinstance(x, str) else 0
)


In [25]:
# Exclamation mark count
# Must be computed on the raw title: clean_text() strips all punctuation, so
# counting '!' in `clean_text` always produced 0. Missing titles (non-string
# values) count as 0.
df['exclamation_count'] = df['title'].apply(
    lambda x: x.count('!') if isinstance(x, str) else 0
)


In [26]:
# Persist the frame with all engineered feature columns (row index omitted)
processed_features_path = "real_time_data_processed_with_features.csv"
df.to_csv(processed_features_path, index=False)
print("Feature engineering completed and dataset saved!")


Feature engineering completed and dataset saved!


In [27]:
# Offer the processed CSV as a browser download when running on Google Colab.
try:
    from google.colab import files
except ImportError:
    # google.colab could not be imported (e.g. the notebook is running outside
    # Colab). The CSV was already written to the working directory, so there
    # is nothing to download.
    print(
        "google.colab is not available; "
        "real_time_data_processed_with_features.csv was saved to the working directory."
    )
else:
    files.download("real_time_data_processed_with_features.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

WEEK 4

Step 1: Load Preprocessed Dataset

In [28]:
# import pandas as pd

# # URL to the processed_with_features.csv file
# real_time_processed_data_url = "https://raw.githubusercontent.com/prynka1808/Team-03-Fake-News-Detection-Finale-Project/main/Data%20Preprocessing/real_time_data_processed_with_features.csv"

# # Data Preprocessing/real_time_data_processed_with_features.csv
# # Load the dataset
# try:
#     all_data = pd.read_csv(real_time_processed_data_url)
#     print("Processed dataset loaded successfully!")
# except Exception as e:
#     print(f"Error loading the processed dataset: {e}")

# # Preview the dataset
# print(all_data.head())


Step 2: Preparing the Data for Modeling

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the dataset into features (X) and target (y)
X = df['processed_text']  # The processed text feature
y = df['label']  # The target labels (1 = Fake, 0 = Real)



# Split into training and testing sets: 80/20, with a fixed random_state so the
# same rows land in each split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Earlier variant kept for reference (capped the vocabulary at 5000 features):
# # Convert text data into TF-IDF embeddings
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit the vocabulary/IDF weights on the training split only, then reuse the
# fitted vectorizer for the test split so no test text influences the features
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)  # shape is (n_train_rows, vocab_size); 285 training rows in the recorded run
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF transformation completed!")


TF-IDF transformation completed!


In [30]:
# Report the size of the training split, then the test split
for split in (X_train, X_test):
    print(split.shape)

(285,)
(72,)


Step 3: Build and Train Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a default-configured logistic regression on the TF-IDF training matrix
# (fit() returns the estimator itself, so construction and training are chained)
logistic_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred_lr = logistic_model.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.9861111111111112
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.97      1.00      0.99        37

    accuracy                           0.99        72
   macro avg       0.99      0.99      0.99        72
weighted avg       0.99      0.99      0.99        72



Step 4: Build and Train Support Vector Machine (SVM)

In [32]:
from sklearn.svm import SVC

# Train a linear-kernel support vector classifier on the TF-IDF training matrix
# (fit() returns the estimator itself, so construction and training are chained)
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred_svm = svm_model.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.9861111111111112
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.97      1.00      0.99        37

    accuracy                           0.99        72
   macro avg       0.99      0.99      0.99        72
weighted avg       0.99      0.99      0.99        72



Step 5: Save the Models and Embeddings

In [33]:
# import pickle

# # Save the TF-IDF vectorizer
# with open("Dataset/tfidf_vectorizer.pkl", "wb") as f:
#     pickle.dump(tfidf_vectorizer, f)
# print("TF-IDF vectorizer saved!")


In [34]:
# # Save Logistic Regression model
# with open("Dataset/logistic_model.pkl", "wb") as f:
#     pickle.dump(logistic_model, f)
# print("Logistic Regression model saved!")

# # Save SVM model
# with open("Dataset/svm_model.pkl", "wb") as f:
#     pickle.dump(svm_model, f)
# print("SVM model saved!")


In [35]:
# !ls /content/Dataset


In [36]:
# import shutil

# # Zip the Dataset folder
# shutil.make_archive("Dataset", "zip", "Dataset")

# # Download the zip file
# files.download("Dataset.zip")


Week 5 Training advanced NLP models (BERT, RoBERTa, and DistilBERT) for fake news classification.

1. Install required Libraries

In [38]:
# pip install transformers datasets scikit-learn


2. Load Data

In [40]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
from datasets import Dataset

# 1. Load the CSV
# Note: this rebinds `df` to the raw merged file, so the engineered feature
# columns added to the earlier `df` are not present from here on.
df = pd.read_csv("FakeNewsNet_combined.csv")

# 2. Decide which column has the text
#    E.g., if your DataFrame has "title" as the text:
text_column = "title"

# 3. We'll assume there's a "label" column with values 0 or 1
#    (defined here for reference; the visible cells below refer to "label" literally)
label_column = "label"

# 4. Train-test split
#    Relies on train_test_split imported in an earlier cell; same 80/20 ratio
#    and random_state as the TF-IDF split above.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# 5. Convert to Hugging Face Dataset objects
#    reset_index(drop=True) discards the shuffled pandas index before conversion.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))


3. Choose a Model (BERT, RoBERTa, or DistilBERT)

In [42]:
# Checkpoint to fine-tune. Exactly one assignment is active; swap the comment
# markers to try another architecture.
# model_name = "bert-base-uncased"       # BERT
# model_name = "roberta-base"            # RoBERTa
model_name = "distilbert-base-uncased"   # DistilBERT


4. Tokenize the Data

In [43]:
from transformers import AutoTokenizer

# Tokenizer matching the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the text column of a batch for the selected checkpoint.

    Args:
        example: Mapping passed by ``Dataset.map``; the column named by the
            module-level ``text_column`` is read from it.

    Returns:
        The tokenizer output, truncated and padded to 128 tokens.
    """
    texts = example[text_column]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)


# Tokenize the training split first, then the validation split
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/285 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

5. Rename “label”

In [47]:
# Rename the target column from "label" to "labels".
# NOTE(review): "labels" is presumably the name the Hugging Face model/Trainer
# expects - confirm against the transformers documentation.
# The membership checks make this cell safe to re-run: once the column has been
# renamed, a second rename_column("label", ...) would fail because "label" no
# longer exists.
if "label" in train_dataset.column_names:
    train_dataset = train_dataset.rename_column("label", "labels")
if "label" in val_dataset.column_names:
    val_dataset = val_dataset.rename_column("label", "labels")

# Switch both splits to the "torch" output format
train_dataset.set_format("torch")
val_dataset.set_format("torch")

print(train_dataset.features)


{'id': Value(dtype='string', id=None), 'news_url': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [48]:
# print("Train columns:", train_dataset.column_names)
# print("Val columns:", val_dataset.column_names)


6. Creating the Model & Training Arguments

In [49]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the selected checkpoint with a 2-class classification head.
# Per the recorded warning, the classifier weights are newly initialized, so
# the model must be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up Hyperparameters

In [50]:
# Hyperparameters for fine-tuning. No Trainer is constructed in the visible
# cells, so these arguments are defined but not yet consumed.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="model_output",
    evaluation_strategy="epoch",   # Evaluate every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,            # Increase if you have enough data/time
    weight_decay=0.01,             # Common setting for regularization
    logging_dir="logs",            # For TensorBoard
    logging_steps=50,
    save_strategy="epoch"          # Checkpoint on the same schedule as evaluation
)




Define and Compute Metrics Function

In [51]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = np.argmax(logits, axis=-1)

#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
#     acc = accuracy_score(labels, preds)

#     return {
#         "accuracy": acc,
#         "precision": precision,
#         "recall": recall,
#         "f1": f1
#     }


Week 6 Fine-tuning models and performing hyperparameter optimization (grid search).

In [52]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

# This cell reuses X_train / y_train from the earlier train/test split.
# To run it standalone, load the data and split first, e.g.:
# df = pd.read_csv("FakeNewsNet_combined.csv")
# X = df["title"]  # or df["content"] if you have full text
# y = df["label"]  # 0 = real, 1 = fake
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     test_size=0.2,
#                                                     random_state=42,
#                                                     shuffle=True)

# Text -> TF-IDF features -> support vector classifier, tuned as one estimator
# so the vectorizer is re-fitted inside every cross-validation fold
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("svc", SVC()),
])

# Search space: 2 x 2 x 2 x 3 = 24 combinations ("<step>__<param>" naming)
param_grid = dict(
    tfidf__max_features=[1000, 2000],
    tfidf__ngram_range=[(1,1), (1,2)],
    svc__kernel=["linear", "rbf"],
    svc__C=[0.1, 1, 10],
)

# 3-fold cross-validated exhaustive search, scored by accuracy,
# with progress logging and all CPU cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="accuracy",
    cv=3,
    verbose=2,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'svc__C': 1, 'svc__kernel': 'linear', 'tfidf__max_features': 1000, 'tfidf__ngram_range': (1, 1)}
Best CV Score: 0.9964912280701754


In [53]:
# Retrieve the best model: the pipeline (TF-IDF + SVC) with the winning
# hyperparameters found by the grid search
best_model = grid_search.best_estimator_

WEEK 7
Model evaluation (F1-score, precision, recall, confusion matrix)

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Predict the held-out split with the tuned pipeline (it vectorizes raw text itself)
y_pred = best_model.predict(X_test)

# Accuracy plus the binary-averaged metrics
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average="binary")
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_caption, metric_value in (
    ("Test Accuracy:", accuracy),
    ("Test Precision:", precision),
    ("Test Recall:", recall),
    ("Test F1 Score:", f1),
):
    print(metric_caption, metric_value)


Test Accuracy: 0.9861111111111112
Test Precision: 0.9736842105263158
Test Recall: 1.0
Test F1 Score: 0.9866666666666667


In [55]:
# Per-class precision / recall / F1 for the tuned model
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

# Confusion matrix of true labels against predictions
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.97      1.00      0.99        37

    accuracy                           0.99        72
   macro avg       0.99      0.99      0.99        72
weighted avg       0.99      0.99      0.99        72

Confusion Matrix:
 [[34  1]
 [ 0 37]]
