### **TEXT MINING PROJECT**  

---

### **SHOULD SOCIAL MEDIA COMPANIES BE HELD RESPONSIBLE FOR MISINFORMATION?**


#### **Setting up the drive**

In [1]:
# Colab-only helper for mounting Google Drive; `os` is used in the next cell to change directory.
from google.colab import drive
import os

# Mount Google Drive at /content/drive so the project folder under MyDrive is reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Change working directory to the project folder
# Every CSV written or read later in this notebook uses a bare filename, so it resolves against this folder.
# NOTE(review): os.chdir raises if the folder does not exist yet — create it in Drive before running.
base_folder = "/content/drive/MyDrive/TextMining_Project"
os.chdir(base_folder)

#### **Installing and importing all the necessary libraries**

In [3]:
# Install necessary libraries if not already installed
!pip install requests pandas beautifulsoup4 feedparser transformers torch nltk scikit-learn

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting n

In [4]:
# Import required libraries
import time
from urllib.parse import quote_plus

import feedparser
import nltk
import pandas as pd
import requests
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import pipeline

In [5]:
# Download necessary NLTK resources
# punkt / punkt_tab: tokenizer data (word_tokenize is used in the pre-processing step)
# wordnet: lexical database used by WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### **STEP-1: DATA COLLECTION**

#### **Listing the queries based on the project background**

In [6]:
# Search phrases sent, one request series per phrase, to each data source below
# (NewsAPI, MediaStack and Google News RSS).
QUERIES = [
    "misinformation regulation",
    "social media misinformation",
    "fake news regulation",
    "disinformation policies",
    "online content moderation",
    "digital information control"
]

#### **Collecting data using NewsAPI**

In [9]:
# Number of articles requested per page
PAGE_SIZE = 100

# The API key is read from the environment so the credential never lives in the notebook.
# Set it before running, e.g. os.environ["NEWSAPI_KEY"] = "<your key>" in a private cell.
# NOTE: a key was previously committed in this cell and should be rotated.
NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY", "")

NEWSAPI_URL = "https://newsapi.org/v2/everything"


# Function to fetch data from NewsAPI
def fetch_newsapi(query, max_pages=5):
    """Fetch English-language articles matching `query` from NewsAPI.

    Args:
        query: Search phrase. It is passed through `params`, so spaces and
            characters such as `&` are URL-encoded rather than corrupting the URL.
        max_pages: Upper bound on the number of result pages requested
            (defaults to 5, the original fixed value).

    Returns:
        A list of `[title, description, source name, publishedAt, url]` rows.
        Fields missing from an article are returned as None.

    Raises:
        RuntimeError: if NEWSAPI_KEY is empty.
    """
    if not NEWSAPI_KEY:
        raise RuntimeError("NEWSAPI_KEY is not set; export it as an environment variable.")

    articles = []
    for page in range(1, max_pages + 1):
        response = requests.get(
            NEWSAPI_URL,
            params={"q": query, "language": "en", "pageSize": PAGE_SIZE, "page": page},
            # Send the key as a header so it does not end up in URLs or logs
            headers={"X-Api-Key": NEWSAPI_KEY},
            timeout=30,
        )
        payload = response.json()

        # Error payloads carry no "articles" key; later pages would fail the same way, so stop
        if "articles" not in payload:
            print(f"NewsAPI stopped at page {page} for {query!r}: {payload.get('message', payload)}")
            break

        batch = payload["articles"]
        if not batch:
            break  # ran past the last page of results

        for article in batch:
            source = article.get("source") or {}  # guard against a null source object
            articles.append([
                article.get("title"), article.get("description"), source.get("name"),
                article.get("publishedAt"), article.get("url")
            ])
    return articles

In [10]:
# Collect NewsAPI articles for every query
articles_newsapi = []
for query in QUERIES:
    articles_newsapi.extend(fetch_newsapi(query))

# Report the volume and preview a few rows instead of dumping every row into the cell output
print(f"Fetched {len(articles_newsapi)} raw NewsAPI rows")
articles_newsapi[:3]



In [11]:
# Tabulate the raw NewsAPI rows; the column order mirrors the row layout built by fetch_newsapi
newsapi_columns = ["title", "description", "source", "published_date", "url"]
df_newsapi = pd.DataFrame(data=articles_newsapi, columns=newsapi_columns)

# Persist a copy in the working directory (row index omitted)
df_newsapi.to_csv("data_newsapi.csv", index=False)

# Report the row count, then preview the first rows
newsapi_total = df_newsapi.shape[0]
print(f"Total Articles Collected using NewsAPI: {newsapi_total}")
df_newsapi.head()

Total Articles Collected using NewsAPI: 565


Unnamed: 0,title,description,source,published_date,url
0,"Safety Takes A Backseat At Paris AI Summit, As...","At the Paris AI Action Summit, safety concerns...",Time,2025-02-11T21:35:43Z,https://time.com/7221384/ai-regulation-takes-b...
1,Elon Musk says 'Make Europe Great Again' as he...,Elon Musk continues to champion right-wing pol...,Business Insider,2025-01-18T22:52:17Z,https://www.businessinsider.com/elon-musk-make...
2,Judge makes final decision on lawsuit alleging...,"""The City cannot have it both ways.""",Yahoo Entertainment,2025-02-10T11:00:41Z,https://www.yahoo.com/news/judge-makes-final-d...
3,A nutrition scientist was hooked on snacks lik...,Alex Ruani used to reach for sweet treats when...,Business Insider,2025-01-18T07:26:01Z,https://www.businessinsider.com/nutrition-scie...
4,Trump revokes AI risk regulation in day one ex...,The executive order demanded thorough research...,TechRadar,2025-01-21T16:29:00Z,https://www.techradar.com/pro/security/trump-r...


#### **Collecting Data using MediaStack API**

In [12]:
# Number of articles requested per call
PAGE_SIZE = 100

# The API key is read from the environment so the credential never lives in the notebook.
# Set it before running, e.g. os.environ["MEDIASTACK_KEY"] = "<your key>" in a private cell.
# NOTE: a key was previously committed in this cell and should be rotated.
MEDIASTACK_KEY = os.environ.get("MEDIASTACK_KEY", "")

# Plain-HTTP endpoint kept from the original code.
# NOTE(review): the key travels unencrypted — switch to https if the account plan supports it.
MEDIASTACK_URL = "http://api.mediastack.com/v1/news"


# Function to fetch data from Mediastack
def fetch_mediastack(query):
    """Fetch English-language articles matching `query` from MediaStack.

    Args:
        query: Keyword phrase. It is passed through `params`, so it is URL-encoded.

    Returns:
        A list of `[title, description, source, published_at, url]` rows.
        The list is empty when the API responds without a "data" key; the API's
        error payload is printed in that case so a near-empty dataset is explainable.

    Raises:
        RuntimeError: if MEDIASTACK_KEY is empty.
    """
    if not MEDIASTACK_KEY:
        raise RuntimeError("MEDIASTACK_KEY is not set; export it as an environment variable.")

    response = requests.get(
        MEDIASTACK_URL,
        params={
            "access_key": MEDIASTACK_KEY,
            "languages": "en",
            "keywords": query,
            "limit": PAGE_SIZE,  # previously hardcoded to 100 regardless of PAGE_SIZE
        },
        timeout=30,
    )
    payload = response.json()

    if "data" not in payload:
        print(f"MediaStack returned no data for {query!r}: {payload.get('error', payload)}")
        return []

    return [
        [
            article.get("title"), article.get("description"), article.get("source"),
            article.get("published_at"), article.get("url")
        ]
        for article in payload["data"]
    ]

In [13]:
# Collect MediaStack articles for every query
articles_mediastack = []
for query in QUERIES:
    articles_mediastack.extend(fetch_mediastack(query))

# Report the volume and preview a few rows instead of dumping every row into the cell output
print(f"Fetched {len(articles_mediastack)} raw MediaStack rows")
articles_mediastack[:3]



In [14]:
# Tabulate the raw MediaStack rows; the column order mirrors the row layout built by fetch_mediastack
mediastack_columns = ["title", "description", "source", "published_date", "url"]
df_mediastack = pd.DataFrame(data=articles_mediastack, columns=mediastack_columns)

# Persist a copy in the working directory (row index omitted)
df_mediastack.to_csv("data_mediastackapi.csv", index=False)

# Report the row count, then preview the first rows
mediastack_total = df_mediastack.shape[0]
print(f"Total Articles Collected using MediaStack: {mediastack_total}")
df_mediastack.head()

Total Articles Collected using MediaStack: 5


Unnamed: 0,title,description,source,published_date,url
0,'Adrenaline of the moment' drove children to t...,Children joined last summer’s riots driven by ...,Independent,2025-01-28T09:16:43+00:00,https://www.independent.co.uk/tv/news/southpor...
1,Biden warned of oligarchs. Some officials worr...,"By Kayla Tausche, CNN Washington (CNN) &#8212;...",krdo,2025-01-16T17:00:57+00:00,https://krdo.com/news/2025/01/16/biden-warned-...
2,Gavin Newsom issues strange plea to Joe Biden ...,California Gov. Gavin Newsom on Friday complai...,Mail,2025-01-10T22:16:13+00:00,https://www.dailymail.co.uk/news/article-14272...
3,The plague of social media misinformation,"In recent years, social media has become a pow...",Pakistan Today,2024-11-18T18:16:06+00:00,https://www.pakistantoday.com.pk/2024/11/18/th...
4,Online 'content moderators' sue Facebook over ...,"A number of people who worked as ""content mode...",breakingnews,2024-11-05T16:52:23+00:00,https://www.breakingnews.ie/ireland/online-con...


#### **Collecting data by parsing the Google News RSS feed**

In [15]:
# Google News RSS Base URL
RSS_BASE_URL = "https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en"

# Pause between feed requests, in seconds, to avoid hitting Google's request limits
RSS_DELAY_SECONDS = 2

# List to Store Articles
articles_google_rss = []

# Fetch Data for Each Query
for query in QUERIES:
    # quote_plus encodes spaces as "+" and also escapes characters such as "&" or quotes
    rss_url = RSS_BASE_URL.format(quote_plus(query))
    feed = feedparser.parse(rss_url)

    # feedparser flags fetch/parse problems via `bozo`; surface them instead of silently collecting nothing
    if feed.get("bozo") and not feed.entries:
        print(f"Could not read feed for query: {query} ({feed.get('bozo_exception')})")

    for entry in feed.entries:
        # Entries are dict-like; .get() yields None for a missing field instead of raising
        source = entry.get("source") or {}
        articles_google_rss.append({
            "query": query,
            "title": entry.get("title"),
            "summary": entry.get("summary"),
            "published_date": entry.get("published"),
            "url": entry.get("link"),
            "source": source.get("title", "Google News")
        })

    print(f"Collected {len(feed.entries)} articles for query: {query}")
    time.sleep(RSS_DELAY_SECONDS)  # Add delay to avoid hitting Google's request limits

# Total only — the full list is far too long to print into the cell output
print(f"Collected {len(articles_google_rss)} Google News RSS entries in total")

Collected 99 articles for query: misinformation regulation
Collected 100 articles for query: social media misinformation
Collected 100 articles for query: fake news regulation
Collected 100 articles for query: disinformation policies
Collected 100 articles for query: online content moderation
Collected 68 articles for query: digital information control


In [16]:
# One row per RSS entry; the columns come from the keys of the collected dicts
df_google_rss = pd.DataFrame(data=articles_google_rss)

# Persist a copy in the working directory (row index omitted)
df_google_rss.to_csv("data_google_news_rss.csv", index=False)

# Report the row count, then preview the first rows
rss_total = df_google_rss.shape[0]
print(f"Total RSS News Articles Collected: {rss_total}")
df_google_rss.head()

Total RSS News Articles Collected: 567


Unnamed: 0,query,title,summary,published_date,url,source
0,misinformation regulation,Election Security and Misinformation Regulatio...,"<a href=""https://news.google.com/rss/articles/...","Sat, 19 Oct 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMikgFBV...,The Regulatory Review
1,misinformation regulation,JD Vance Knocks EU’s Regulation Of US Tech Gia...,"<a href=""https://news.google.com/rss/articles/...","Tue, 11 Feb 2025 12:52:05 GMT",https://news.google.com/rss/articles/CBMiyAFBV...,Forbes
2,misinformation regulation,Australia abandons legislation requiring socia...,"<a href=""https://news.google.com/rss/articles/...","Mon, 25 Nov 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMi1AFBV...,JURIST
3,misinformation regulation,(PDF) Third person effects of fake news: Fake ...,"<a href=""https://news.google.com/rss/articles/...","Fri, 31 Jan 2025 08:00:00 GMT",https://news.google.com/rss/articles/CBMi1AFBV...,ResearchGate
4,misinformation regulation,Vatican urges regulation of AI to prevent misi...,"<a href=""https://news.google.com/rss/articles/...","Thu, 30 Jan 2025 08:00:00 GMT",https://news.google.com/rss/articles/CBMiigFBV...,Dig Watch Updates


#### **Merging all the collected data**

In [17]:
# Row count of each source before merging, printed one line per source
size_report = {
    "NewsAPI": df_newsapi,
    "MediaStack": df_mediastack,
    "Web Scraped Google RSS": df_google_rss,
}
print("\n".join(f"{name} Dataset Size: {len(frame)}" for name, frame in size_report.items()))

NewsAPI Dataset Size: 565
MediaStack Dataset Size: 5
Web Scraped Google RSS Dataset Size: 567


In [18]:
# Rename Columns in Google RSS Data so its text field matches the other two sources
df_google_rss = df_google_rss.rename(columns={"summary": "description"})

# Keep Only Necessary Columns (same set and order for all three sources)
COMMON_COLUMNS = ["title", "description", "source", "published_date", "url"]
df_newsapi = df_newsapi[COMMON_COLUMNS]
df_mediastack = df_mediastack[COMMON_COLUMNS]
df_google_rss = df_google_rss[COMMON_COLUMNS]

# Merge all three datasets and drop exact duplicate rows.
# ignore_index / reset_index give the result a unique 0..n-1 index; without them each source
# keeps its own 0-based labels and the merged frame contains repeated index values.
df_combined = (
    pd.concat([df_newsapi, df_mediastack, df_google_rss], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Save Merged Dataset
df_combined.to_csv("final_data_misinformation.csv", index=False)

# Show Summary
print(f"Final Merged Dataset Size: {len(df_combined)}")
df_combined.head()

Final Merged Dataset Size: 1059


Unnamed: 0,title,description,source,published_date,url
0,"Safety Takes A Backseat At Paris AI Summit, As...","At the Paris AI Action Summit, safety concerns...",Time,2025-02-11T21:35:43Z,https://time.com/7221384/ai-regulation-takes-b...
1,Elon Musk says 'Make Europe Great Again' as he...,Elon Musk continues to champion right-wing pol...,Business Insider,2025-01-18T22:52:17Z,https://www.businessinsider.com/elon-musk-make...
2,Judge makes final decision on lawsuit alleging...,"""The City cannot have it both ways.""",Yahoo Entertainment,2025-02-10T11:00:41Z,https://www.yahoo.com/news/judge-makes-final-d...
3,A nutrition scientist was hooked on snacks lik...,Alex Ruani used to reach for sweet treats when...,Business Insider,2025-01-18T07:26:01Z,https://www.businessinsider.com/nutrition-scie...
4,Trump revokes AI risk regulation in day one ex...,The executive order demanded thorough research...,TechRadar,2025-01-21T16:29:00Z,https://www.techradar.com/pro/security/trump-r...


#### **Labeling the merged dataset with a pre-trained BERT sentiment model**

In [19]:
# Load the dataset
# Work on a copy so the merged frame (df_combined) is left untouched by the labeling below.
df = df_combined.copy()

# Load a pre-trained BERT model for text classification
# NOTE(review): the model name indicates a sentiment model, and the mapping below consumes
# "1 star".."5 stars" labels — so the resulting labels are a sentiment-based proxy for stance
# on regulation, not a direct stance classification. Confirm this is acceptable for the project.
classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

# Star ratings emitted by the classifier, grouped into the project's labels.
# Any rating not listed here (e.g. "3 stars") falls back to "neutral".
_STAR_TO_LABEL = {
    "4 stars": "pro-regulation",
    "5 stars": "pro-regulation",
    "1 star": "anti-regulation",
    "2 stars": "anti-regulation",
}


# Function to map BERT labels to 3 categories
def map_bert_to_custom_label(text):
    """Classify `text` and collapse the star rating into one of three labels.

    Args:
        text: The text to classify (a headline in this notebook).

    Returns:
        "pro-regulation" for 4-5 stars, "anti-regulation" for 1-2 stars,
        otherwise "neutral". Missing, non-string or blank input is "neutral"
        without calling the model; a classification failure is reported and
        also yields "neutral".
    """
    # NaN / None / blank titles cannot be classified; skip the model call entirely
    if not isinstance(text, str) or not text.strip():
        return "neutral"

    try:
        # truncation=True keeps over-long inputs within the model's maximum length
        sentiment = classifier(text, truncation=True)[0]["label"]
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so KeyboardInterrupt still
        # stops a long run, and report the failure instead of hiding it.
        print(f"Could not classify {text[:80]!r}: {exc!r}")
        return "neutral"  # Default if there's an error

    return _STAR_TO_LABEL.get(sentiment, "neutral")

# Label every headline: run the classifier on each title and store the mapped category
df["bert_label"] = df["title"].map(map_bert_to_custom_label)

# Write the labeled frame to disk (row index omitted)
df.to_csv("labeled_data_misinformation.csv", index=False)

# How many rows landed in each category, followed by a preview
label_counts = df["bert_label"].value_counts()
print("Label distribution:\n", label_counts)
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Label distribution:
 bert_label
anti-regulation    561
pro-regulation     457
neutral             41
Name: count, dtype: int64


Unnamed: 0,title,description,source,published_date,url,bert_label
0,"Safety Takes A Backseat At Paris AI Summit, As...","At the Paris AI Action Summit, safety concerns...",Time,2025-02-11T21:35:43Z,https://time.com/7221384/ai-regulation-takes-b...,anti-regulation
1,Elon Musk says 'Make Europe Great Again' as he...,Elon Musk continues to champion right-wing pol...,Business Insider,2025-01-18T22:52:17Z,https://www.businessinsider.com/elon-musk-make...,pro-regulation
2,Judge makes final decision on lawsuit alleging...,"""The City cannot have it both ways.""",Yahoo Entertainment,2025-02-10T11:00:41Z,https://www.yahoo.com/news/judge-makes-final-d...,anti-regulation
3,A nutrition scientist was hooked on snacks lik...,Alex Ruani used to reach for sweet treats when...,Business Insider,2025-01-18T07:26:01Z,https://www.businessinsider.com/nutrition-scie...,pro-regulation
4,Trump revokes AI risk regulation in day one ex...,The executive order demanded thorough research...,TechRadar,2025-01-21T16:29:00Z,https://www.techradar.com/pro/security/trump-r...,anti-regulation


### **STEP-2: DATA PRE-PROCESSING**

#### **Loading the dataframe**

In [20]:
# Load the labeled dataset
# Reads back the CSV written by the labeling step and rebinds `df` to the reloaded frame.
df = pd.read_csv("labeled_data_misinformation.csv")
df.head()

Unnamed: 0,title,description,source,published_date,url,bert_label
0,"Safety Takes A Backseat At Paris AI Summit, As...","At the Paris AI Action Summit, safety concerns...",Time,2025-02-11T21:35:43Z,https://time.com/7221384/ai-regulation-takes-b...,anti-regulation
1,Elon Musk says 'Make Europe Great Again' as he...,Elon Musk continues to champion right-wing pol...,Business Insider,2025-01-18T22:52:17Z,https://www.businessinsider.com/elon-musk-make...,pro-regulation
2,Judge makes final decision on lawsuit alleging...,"""The City cannot have it both ways.""",Yahoo Entertainment,2025-02-10T11:00:41Z,https://www.yahoo.com/news/judge-makes-final-d...,anti-regulation
3,A nutrition scientist was hooked on snacks lik...,Alex Ruani used to reach for sweet treats when...,Business Insider,2025-01-18T07:26:01Z,https://www.businessinsider.com/nutrition-scie...,pro-regulation
4,Trump revokes AI risk regulation in day one ex...,The executive order demanded thorough research...,TechRadar,2025-01-21T16:29:00Z,https://www.techradar.com/pro/security/trump-r...,anti-regulation


#### **Performing some cleaning operations**

In [21]:
# Keep only rows where the title, description and label are all present
required_columns = ["title", "description", "bert_label"]
df.dropna(subset=required_columns, inplace=True)

# Only the headline is used as the text; lower-case it for consistency
df["text"] = df["title"].str.lower()


def tokenize(text):
    """Split `text` into tokens with NLTK's word_tokenize."""
    tokens = word_tokenize(text)
    return tokens


# One token list per headline
df["tokens"] = df["text"].map(tokenize)

#### **Stemming DataFrame**

In [22]:
stemmer = PorterStemmer()


def stem_words(tokens):
    """Return the Porter stem of every token, in the original order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems


# Stem each token list, then join the stems back into one space-separated string
df["stemmed_tokens"] = df["tokens"].map(stem_words)
df["stemmed_text"] = df["stemmed_tokens"].map(" ".join)

# Keep only the stemmed text with its label and write it to disk
stemmed_columns = ["stemmed_text", "bert_label"]
df_stemmed = df[stemmed_columns]
df_stemmed.to_csv("stemmed_data.csv", index=False)

# Preview the stemmed dataframe
df_stemmed.head()

Unnamed: 0,stemmed_text,bert_label
0,"safeti take a backseat at pari ai summit , as ...",anti-regulation
1,elon musk say 'make europ great again ' as he ...,pro-regulation
2,judg make final decis on lawsuit alleg major c...,anti-regulation
3,a nutrit scientist wa hook on snack like choco...,pro-regulation
4,trump revok ai risk regul in day one execut order,anti-regulation


#### **Lemmatization DataFrame**

In [23]:
lemmatizer = WordNetLemmatizer()


def lemmatize_words(tokens):
    """Return the WordNet lemma of every token, in the original order.

    NOTE(review): lemmatize() is called without a part-of-speech argument, and the
    output below shows artifacts such as "was" -> "wa" and "as" -> "a".
    """
    return list(map(lemmatizer.lemmatize, tokens))


# Lemmatize each token list, then join the lemmas back into one space-separated string
df["lemmatized_tokens"] = df["tokens"].map(lemmatize_words)
df["lemmatized_text"] = df["lemmatized_tokens"].map(" ".join)

# Keep only the lemmatized text with its label and write it to disk
lemmatized_columns = ["lemmatized_text", "bert_label"]
df_lemmatized = df[lemmatized_columns]
df_lemmatized.to_csv("lemmatized_data.csv", index=False)

# Preview the lemmatized dataframe
df_lemmatized.head()

Unnamed: 0,lemmatized_text,bert_label
0,"safety take a backseat at paris ai summit , a ...",anti-regulation
1,elon musk say 'make europe great again ' a he ...,pro-regulation
2,judge make final decision on lawsuit alleging ...,anti-regulation
3,a nutrition scientist wa hooked on snack like ...,pro-regulation
4,trump revoke ai risk regulation in day one exe...,anti-regulation


#### **CountVectorizer DataFrame**

In [24]:
# Bag-of-words counts over the lemmatized headlines: skip terms found in more than 90% of
# documents or in fewer than 5 documents, and cap the vocabulary at 5000 terms
vectorizer = CountVectorizer(max_features=5000, min_df=5, max_df=0.9)
X_count = vectorizer.fit_transform(df["lemmatized_text"])

# Dense table with one column per vocabulary term
count_terms = vectorizer.get_feature_names_out()
df_countvectorized = pd.DataFrame(data=X_count.toarray(), columns=count_terms)
# Labels are attached by position (.values), since df's index may have gaps after dropna
df_countvectorized["bert_label"] = df["bert_label"].values

# Write the count matrix with labels to disk (row index omitted)
df_countvectorized.to_csv("countvectorized_data.csv", index=False)

# Preview the count-vectorized dataframe
df_countvectorized.head()

Unnamed: 0,19,20,2024,2025,about,access,account,act,action,ad,...,without,work,world,year,york,you,your,youtube,zuckerberg,bert_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anti-regulation
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pro-regulation
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anti-regulation
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,pro-regulation
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,anti-regulation


#### **TF-IDF Vectorizer DataFrame**

In [25]:
# TF-IDF weights over the lemmatized headlines, with the same vocabulary limits as the count
# matrix: skip terms in more than 90% of documents or fewer than 5, cap at 5000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=5, max_df=0.9)
X_tfidf = tfidf_vectorizer.fit_transform(df["lemmatized_text"])

# Dense table with one column per vocabulary term
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(data=X_tfidf.toarray(), columns=tfidf_terms)
# Labels are attached by position (.values), since df's index may have gaps after dropna
df_tfidf["bert_label"] = df["bert_label"].values

# Write the TF-IDF matrix with labels to disk (row index omitted)
df_tfidf.to_csv("tfidf_data.csv", index=False)

# Preview the TF-IDF dataframe
df_tfidf.head()

Unnamed: 0,19,20,2024,2025,about,access,account,act,action,ad,...,without,work,world,year,york,you,your,youtube,zuckerberg,bert_label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,anti-regulation
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pro-regulation
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,anti-regulation
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.397213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pro-regulation
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,anti-regulation
