# Baseline

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Colab-only helper used to attach Google Drive to this runtime
from google.colab import drive

# Mount Google Drive at /content/drive; the project folder entered below lives under this mount point
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# %cd /content/drive/My\ Drive/Colab\ Notebooks

In [4]:
# Switch to the project folder on the mounted Drive; the CSV paths used in later cells are relative to it
%cd /content/drive/MyDrive/ml_for_cyber_wi25

/content/drive/MyDrive/ml_for_cyber_wi25


In [6]:
# List the working directory (presumably to confirm the expected data files are present)
!ls

captcha-images	       cooccurrence_urls.npz  model-basic.h5  __pycache__
captcha-images.tar.xz  hw2_util.py	      news.csv	      waf-urls.csv
cooccurrence_test.npz  Lab-8		      News_dataset    waf-urls.tar.xz


In [7]:
#We're going to use the smaller dataset as hold out to test the performance of our model
df_holdout=pd.read_csv('news.csv')

# Encode the string labels as integers: REAL -> 0, FAKE -> 1
df_holdout['label'] = df_holdout['label'].map({'REAL': 0, 'FAKE': 1})

# Only the last expression of a cell is rendered automatically, so the shape
# has to be printed explicitly; the head is then shown as the cell output.
print(df_holdout.shape)
df_holdout.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0


In [8]:
# Relative path to the fake-news CSV; read by the next cell
news_dataset_location = 'News_dataset/Fake.csv'

In [9]:
#Read the fake-news part of the training and test data and label every row as fake (1)
# news_dataset_location is rebound to the real-news path in a later cell, so this
# cell has to run while it still points at the fake-news file.
df_fake=pd.read_csv(news_dataset_location)
df_fake['label'] = 1

# Only the last expression of a cell is rendered automatically, so the shape
# has to be printed explicitly; the head is then shown as the cell output.
print(df_fake.shape)
df_fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [10]:
# Relative path to the real-news CSV (rebinds the variable previously used for the fake-news path)
news_dataset_location = 'News_dataset/True.csv'

In [11]:
#Read the data from the real news and label every row as real (0)
df_true=pd.read_csv(news_dataset_location)
df_true['label'] = 0

# Only the last expression of a cell is rendered automatically, so the shape
# has to be printed explicitly; the head is then shown as the cell output.
print(df_true.shape)
df_true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [12]:
# Stack the real, fake and hold-out frames into one table, keeping only the columns they share.
# NOTE(review): df_holdout is introduced earlier as a hold-out set, yet it is merged in here and
# therefore ends up inside the train/test split below -- confirm this is intended.
df_combined = pd.concat(
    [frame[['title', 'text', 'label']] for frame in (df_true, df_fake, df_holdout)],
    ignore_index=True,
)

# Randomly permute all rows with a fixed seed and renumber the index from 0
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview the first rows of the combined table
df_combined.head()

Unnamed: 0,title,text,label
0,Trump campaign woes intensify amid questions o...,Republicans face a lot of difficult decisions ...,0
1,Lebanon PM Hariri to supporters: 'I'm staying ...,BEIRUT (Reuters) - Lebanese Prime Minister Saa...,0
2,New 9/11 Trailer – Featuring Charlie Sheen and...,21st Century Wire says Everything changed on 9...,1
3,After Brussels Terror Attack Republican Senat...,Always eager to use a tragedy to erode the Con...,1
4,LOL! WATCH DEMOCRAT DINGBAT Sheila Jackson Lee...,CNN s Allison Camerota started out her intervi...,1


In [13]:
# Split into train (70%) and test (30%) sets using only the article body ('text') as the feature.
# A validation set would be used for hyperparameter tuning and other adjustments, but the
# validation split further down in this cell is currently commented out.
# The test set is reserved for evaluation and must not feed back into training.

x_train,x_test,y_train,y_test=train_test_split(df_combined['text'],
                                                     df_combined['label'],
                                                     test_size=0.3,
                                                     random_state=7)

# x_train,x_val,y_train,y_val=train_test_split(x_train, y_train, test_size=0.2,
#                                                random_state=7)


Generate features using the TfidfVectorizer

In [14]:
#Initialize a TfidfVectorizer
# English stop words are dropped and terms occurring in more than 70% of documents are ignored
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit the vocabulary on the training texts only, then reuse it to transform the test texts
tfidf_train=tfidf_vectorizer.fit_transform(x_train)
tfidf_test=tfidf_vectorizer.transform(x_test)

# There is no separate validation split (it is commented out in the split cell), so
# `tfidf_val` used to be a second, identical transform of x_test. It is kept only as an
# alias of the test matrix for backward compatibility -- do not tune on it, it IS the test set.
tfidf_val=tfidf_test

In [15]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def get_performance(model, X, y):
  """Score a fitted classifier on one dataset.

  Args:
    model: fitted estimator exposing ``predict(X)``.
    X: feature matrix handed to ``model.predict``.
    y: true labels matching the rows of ``X``.

  Returns:
    dict with the keys 'accuracy', 'f1', 'precision' and 'recall', each
    computed by the corresponding sklearn metric with its default settings.
  """
  predictions = model.predict(X)

  # (result key, metric function) pairs, in the order they appear in the result
  scorers = (
      ('accuracy', accuracy_score),
      ('f1', f1_score),
      ('precision', precision_score),
      ('recall', recall_score),
  )
  return {key: scorer(y, predictions) for key, scorer in scorers}


In [14]:
#Initialize a Logistic Regression Classifier with default parameters
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

# Baseline: logistic regression with default settings, trained on the TF-IDF text features
lr = LogisticRegression().fit(tfidf_train, y_train)

# Score the baseline on the test split and show the metric dictionary
metrics = get_performance(lr, tfidf_test, y_test)
print(metrics)


{'accuracy': 0.9627207963306333, 'f1': 0.964398881640261, 'precision': 0.9567307692307693, 'recall': 0.9721909056745585}


Perform some hyperparameter tuning for the Logistic Regression Model

In [15]:
import warnings
# Silence UserWarning messages for the rest of the session
warnings.filterwarnings('ignore', category=UserWarning)

# Candidate hyperparameter values the randomized search samples from
params_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': [None, 'l2'],
    'max_iter': [100, 250, 500, 750, 1000]}

lr = LogisticRegression()
# 3-fold cross-validation scored with micro-averaged F1; n_iter is left at its default
# (the log below reports 10 candidates). refit=True retrains the best candidate afterwards.
lr_rs = RandomizedSearchCV(lr, params_grid, refit=True, scoring='f1_micro', verbose=1, cv = 3, random_state=0)
lr_rs.fit(tfidf_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [16]:
# Best hyperparameter combination found by the logistic-regression search
lr_rs.best_params_

{'penalty': 'l2', 'max_iter': 100, 'C': 100}

In [17]:
# Evaluate the refitted best logistic-regression model on the test split
get_performance(lr_rs.best_estimator_, tfidf_test, y_test)

{'accuracy': 0.9749194886308188,
 'f1': 0.975961088766252,
 'precision': 0.971689327621531,
 'recall': 0.9802705749718151}

Next we try a PassiveAggressiveClassifier

In [18]:
#Initialize a PassiveAggressiveClassifier limited to 50 passes over the training data
# NOTE(review): no random_state is passed, so the scores may differ between runs --
# confirm whether a fixed seed is wanted for reproducibility.
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

# Report the test-split metrics for the fitted classifier
print('Test', get_performance(pac, tfidf_test, y_test))

Test {'accuracy': 0.973748414170001, 'f1': 0.9747773089545242, 'precision': 0.9728616881901554, 'recall': 0.9767004885381435}


In [16]:
from sklearn.ensemble import RandomForestClassifier

# Out-of-the-box random forest with 100 trees, trained on the TF-IDF text features
# NOTE(review): no random_state is passed, so the scores may differ between runs.
rfc=RandomForestClassifier(n_estimators=100)
rfc.fit(tfidf_train,y_train)
print('Test', get_performance(rfc, tfidf_test, y_test))

Test {'accuracy': 0.9541314248536109, 'f1': 0.9562465090299758, 'precision': 0.9458563535911603, 'recall': 0.9668674698795181}


In [17]:
# (number of training documents, number of TF-IDF features)
print(tfidf_train.shape)

(35863, 121079)


In [22]:
# Hyperparameter Tuning on RFC
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values for the random forest
params_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 5, 8, 12, 20],
    'max_features': ['sqrt', 'log2', None]
    }

rf = RandomForestClassifier()
# Only n_iter=5 random combinations are tried, each with 3-fold cross-validation,
# scored with micro-averaged F1; refit=True retrains the best one afterwards.
rf_rs = RandomizedSearchCV(rf, params_grid, refit=True, scoring='f1_micro', n_iter=5,
                           verbose=2, cv = 3, random_state=0)
rf_rs.fit(tfidf_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ...max_depth=12, max_features=log2, n_estimators=10; total time=   0.5s
[CV] END ...max_depth=12, max_features=log2, n_estimators=10; total time=   0.7s
[CV] END ...max_depth=12, max_features=log2, n_estimators=10; total time=   0.8s
[CV] END ...max_depth=20, max_features=sqrt, n_estimators=50; total time=  12.5s
[CV] END ...max_depth=20, max_features=sqrt, n_estimators=50; total time=  13.8s
[CV] END ...max_depth=20, max_features=sqrt, n_estimators=50; total time=  13.9s
[CV] END ...max_depth=12, max_features=sqrt, n_estimators=10; total time=   3.2s
[CV] END ...max_depth=12, max_features=sqrt, n_estimators=10; total time=   3.2s
[CV] END ...max_depth=12, max_features=sqrt, n_estimators=10; total time=   1.5s
[CV] END .max_depth=None, max_features=log2, n_estimators=50; total time=  46.4s
[CV] END .max_depth=None, max_features=log2, n_estimators=50; total time=  47.6s
[CV] END .max_depth=None, max_features=log2, n_es

In [23]:
# Only the last expression of a cell is rendered automatically, so the winning
# hyperparameters must be printed explicitly to appear in the output.
print(rf_rs.best_params_)

# Evaluate the refitted best random forest on the test split
get_performance(rf_rs.best_estimator_, tfidf_test, y_test)

{'accuracy': 0.9309043591411841,
 'f1': 0.9346943795351126,
 'precision': 0.9163250542560888,
 'recall': 0.9538152610441767}

In this case, to save time, I ran only 5 iterations of Randomized Search. What we can conclude is that none of these selections is superior to having more estimators - the out-of-the-box Random Forest we built with 100 estimators is still better than these. We can try a couple more parameter settings with a higher tree count just to validate this hypothesis.

In [27]:
# Hyperparameter Tuning on RFC
# This ran previously but it hung during my latest run
from sklearn.model_selection import RandomizedSearchCV

# Vary only the tree count; every other setting stays at the RandomForestClassifier default
params_grid = {
    'n_estimators': [100, 150, 200],
    }

rf = RandomForestClassifier()
# n_iter=3 covers all three candidate values; 2-fold cross-validation, micro-averaged F1
rf_rs = RandomizedSearchCV(rf, params_grid, refit=True, scoring='f1_micro', n_iter=3,
                           verbose=3, cv = 2, random_state=0)
rf_rs.fit(tfidf_train, y_train)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


KeyboardInterrupt: 

Sometimes the Randomized Search CV hangs. But in this case we can still test out the theory that more trees are better with a single RF fitted with 200 trees to see if it outperforms the original

In [28]:
# Single random forest with 200 trees, fitted directly instead of through a search
rf = RandomForestClassifier(n_estimators=200)

rf.fit(tfidf_train, y_train)
# Test-split metrics, to compare against the 100-tree forest fitted earlier
get_performance(rf, tfidf_test, y_test)

KeyboardInterrupt: 

# Train on Both Text and Title

Previously we only trained the model based on the text content of news. Let's try training on the combination of text and title and see if we can get better results.

In [45]:
# Define features (X) and target (y)
X = df_combined[['title', 'text']]  # Select both columns
y = df_combined['label']

# Split the data into training and testing sets
# Note: this rebinds y_train / y_test from the earlier text-only split (which used
# random_state=7), so feature matrices built from that split no longer line up with
# these labels once this cell has run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [46]:
#Initialize a TfidfVectorizer for text
# Same settings as the first vectorizer: English stop words removed, max_df=0.7
tfidf_vectorizer_text=TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit and transform train set, transform test set
# The vocabulary is learned from the training article bodies only
tfidf_train_text=tfidf_vectorizer_text.fit_transform(X_train['text'])
tfidf_test_text=tfidf_vectorizer_text.transform(X_test['text'])

In [47]:
#Initialize a TfidfVectorizer for title
# A separate vectorizer, so titles get their own vocabulary and feature columns
tfidf_vectorizer_title=TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit and transform train set, transform test set
# The vocabulary is learned from the training titles only
tfidf_train_title=tfidf_vectorizer_title.fit_transform(X_train['title'])
tfidf_test_title=tfidf_vectorizer_title.transform(X_test['title'])

In [48]:
from scipy.sparse import hstack
# Place the text and title feature columns side by side (text first, then title).
# Note: this overwrites the text-only tfidf_train / tfidf_test matrices built earlier.
tfidf_train = hstack([tfidf_train_text, tfidf_train_title])
tfidf_test = hstack([tfidf_test_text, tfidf_test_title])

In [49]:
#Initialize a PassiveAggressiveClassifier and train it on the combined text + title features
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

# get_performance() runs the prediction on the test set itself, so the separate
# pac.predict() call (whose result was never used) has been dropped.
get_performance(pac, tfidf_test, y_test)

{'accuracy': 0.9754716981132076,
 'f1': 0.976385844033824,
 'precision': 0.9744936234058514,
 'recall': 0.9782854273879754}

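We got a better score on the test set, for both accuracy and F1, after training on both the text and title features. (Note that this run uses a different train/test split than the text-only run, so the comparison is indicative rather than exact.) Let's see what else we can improve.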

In [50]:
# We do a Random Forest Classifier as well, here with 50 trees on the combined features
# NOTE(review): warm_start=True only has an effect when fit() is called again on the same
# instance; this cell fits once -- confirm whether it is needed.
rfc=RandomForestClassifier(n_estimators=50, warm_start = True)
rfc.fit(tfidf_train,y_train)

#Predict on the test set and calculate accuracy, F1, precision and recall
get_performance(rfc, tfidf_test, y_test)

{'accuracy': 0.9515940143135979,
 'f1': 0.9537370973759483,
 'precision': 0.9450400492914356,
 'recall': 0.9625957072925819}

# Manual Ensembling

In [53]:
# We do a Logistic Regression using the best params found in earlier hyperparameter search
from sklearn.linear_model import LogisticRegression

# Same values as the best_params_ reported by the earlier logistic-regression search
params = {'penalty': 'l2', 'max_iter': 100, 'C': 100}
lr=LogisticRegression(**params)
# This fit uses the combined text + title features, unlike the earlier search
lr.fit(tfidf_train,y_train)

#Predict on the test set and calculate accuracy, F1, precision and recall
get_performance(lr, tfidf_test, y_test)

{'accuracy': 0.9754066363044893,
 'f1': 0.9763543100212686,
 'precision': 0.9731886768923806,
 'recall': 0.9795406049956069}

In [54]:
from sklearn.ensemble import VotingClassifier

# Create a VotingClassifier with 'soft' voting over the logistic regression and the random forest
voting_clf = VotingClassifier(estimators=[('lr', lr), ('rfc', rfc)], voting='soft')

# Fit the VotingClassifier on the training data (tfidf_train and y_train)
voting_clf.fit(tfidf_train, y_train)

# Test-split metrics for the ensemble
get_performance(voting_clf, tfidf_test, y_test)

{'accuracy': 0.9756668835393624,
 'f1': 0.9766337623391228,
 'precision': 0.9722602313720612,
 'recall': 0.9810468181247647}

As we can see, our models are all performing generally VERY well. The random forest on its own is the worst model but that's probably due to the number of trees we are using here. The logistic regression does very well with F1 of 97.6%. The voting classifier is made up of an ensemble model (Random Forest) plus a Logistic Regression to perform its voting and we see its performance very slightly outpaces the Logistic Regression on its own, which shows that combining several models can be very effective!

# Using roBERTa embeddings instead of TF-IDF

In [55]:
# Chen to do, can use voting CLF or passive aggressive or just pick best classifier overall

# Using GANs to learn about Fake News

In [None]:
# Rachel to do

# Use an embedding model for similarity search

1.   List item
2.   List item



Previously we used the TfidfVectorizer to encode text and title of our news data. It is not the best way to encode text information as it disregards the context. Let's try using a transformer based approach such as sentence transformer to encode the features.





1. Let's use Pinecone API to generate and store the embeddings for news text and title.
2. Use the generated embeddings to perform similarity search and predict the label of hold out set


In [None]:
# Work on an explicit copy: adding a column to a frame obtained by selecting columns
# from df_combined otherwise triggers pandas' SettingWithCopyWarning.
X = df_combined[['title', 'text']].copy()  # Select both columns
# String version of the row index, used later to build the vector ids
X['id'] = X.index.astype(str)
y = df_combined['label']

In [None]:
pip install "pinecone[grpc]"

Collecting pinecone[grpc]
  Downloading pinecone-6.0.1-py3-none-any.whl.metadata (8.8 kB)
Collecting lz4>=3.1.3 (from pinecone[grpc])
  Downloading lz4-4.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone[grpc])
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting protobuf<6.0,>=5.29 (from pinecone[grpc])
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting protoc-gen-openapiv2<0.0.2,>=0.0.1 (from pinecone[grpc])
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Downloading lz4-4.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Downloading protobuf-5.29.3-cp38-abi3-manylin

In [None]:
# Import the Pinecone library
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time

# Read the API key from the first line of a local file so it is not hard-coded in the notebook
with open('pinecone_api_key.txt', 'r') as f:
    api_key = f.readline().strip()

# Initialize a Pinecone client with the API key
pc = Pinecone(api_key=api_key)

# Define a sample dataset where each item has a unique ID and piece of text
# data = [
#     {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
#     {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
#     {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
#     {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
#     {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
#     {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
# ]

# # Convert the text into numerical vectors that Pinecone can index
# embeddings = pc.inference.embed(
#     model="multilingual-e5-large",
#     inputs=[d['text'] for d in data],
#     parameters={"input_type": "passage", "truncate": "END"}
# )

# print(embeddings)

In [None]:
# Name of the Pinecone index that stores the news embeddings
index_name = "fake-news-train-test-index"

# Create the index only when it does not exist yet, so the cell can be re-run safely.
# dimension=1024 presumably matches the output size of the embedding model -- TODO confirm.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region='us-east-1'),
    )

In [None]:
# Pinecone namespaces used when upserting the article-body and title embeddings respectively
text_namespace='text'
title_namespace='title'

In [None]:
# index_train_test.delete(delete_all=True, namespace=text_namespace)
# index_train_test.delete(delete_all=True, namespace=title_namespace)

PineconeException: UNKNOWN:Error received from peer  {grpc_message:"Namespace not found", grpc_status:5, created_time:"2025-02-17T19:23:27.350046488+00:00"}

In [None]:
# Target the index where you'll store the vector embeddings; the upsert loop writes through this handle
index_train_test = pc.Index(index_name)

In [None]:
import time

# Row offset at which indexing starts.
# NOTE(review): 4350 looks like a manual resume point after an earlier interrupted run --
# set it to 0 to embed and upsert every row from scratch.
START_ROW = 4350
BATCH_SIZE = 50        # rows embedded and upserted per request
PAUSE_SECONDS = 10     # pause before every batch (presumably to stay under API rate limits)
EMBED_MODEL = "multilingual-e5-large"
METADATA_MAX_BYTES = 40960    # metadata larger than this (UTF-8 encoded) is shortened ...
METADATA_KEEP_BYTES = 40000   # ... to at most this many bytes


def _truncate_utf8(value, max_bytes=METADATA_MAX_BYTES, keep_bytes=METADATA_KEEP_BYTES):
    """Return `value` unchanged, or cut to `keep_bytes` UTF-8 bytes if it exceeds `max_bytes`.

    A multi-byte character split by the cut is dropped rather than raising.
    """
    encoded = value.encode('utf-8')
    if len(encoded) > max_bytes:
        return encoded[:keep_bytes].decode('utf-8', errors='ignore')
    return value


def _embed_and_upsert(batch, column, namespace):
    """Embed one column of `batch` and upsert the vectors into `namespace`.

    Each record gets the id '<column><row id>', the embedding values, and the
    (possibly truncated) original string as metadata under the key `column`.
    """
    # Convert the strings into numerical vectors that Pinecone can index
    embeddings = pc.inference.embed(
        model=EMBED_MODEL,
        inputs=batch[column].tolist(),
        parameters={"input_type": "passage", "truncate": "END"}
    )

    records = [
        {
            "id": column + row['id'],
            "values": embedding['values'],
            "metadata": {column: _truncate_utf8(row[column])},
        }
        for (_, row), embedding in zip(batch.iterrows(), embeddings)
    ]

    # Upsert the records into the index
    index_train_test.upsert(vectors=records, namespace=namespace)


for start in range(START_ROW, len(X), BATCH_SIZE):
    time.sleep(PAUSE_SECONDS)
    end = min(start + BATCH_SIZE, len(X))
    print(start, end)

    # Positional slice, so the batch boundaries do not depend on the index labels
    batch = X.iloc[start:end]

    # Article bodies first, then titles, each into its own namespace
    _embed_and_upsert(batch, 'text', text_namespace)
    _embed_and_upsert(batch, 'title', title_namespace)


4350 4400
4400 4450
4450 4500
4500 4550
4550 4600
4600 4650
4650 4700
4700 4750
4750 4800
4800 4850
4850 4900
4900 4950
4950 5000
5000 5050
5050 5100
5100 5150
5150 5200
5200 5250
5250 5300
5300 5350
5350 5400
5400 5450
5450 5500
5500 5550
5550 5600
5600 5650
5650 5700
5700 5750
5750 5800
5800 5850
5850 5900
5900 5950
5950 6000
6000 6050
6050 6100
6100 6150
6150 6200
6200 6250
6250 6300
6300 6350
6350 6400
6400 6450
6450 6500
6500 6550
6550 6600
6600 6650
6650 6700
6700 6750


KeyboardInterrupt: 

In [None]:
# Re-fit the text vectorizer on the training article bodies, then transform the test and hold-out bodies
# NOTE(review): X_holdout is not defined anywhere in the visible notebook (only df_holdout is),
# so this cell would raise NameError as written -- confirm where it should come from.
tfidf_train_text=tfidf_vectorizer_text.fit_transform(X_train['text'])
tfidf_test_text=tfidf_vectorizer_text.transform(X_test['text'])
tfidf_holdout_text=tfidf_vectorizer_text.transform(X_holdout['text'])