In [1]:
# installing the required libraries and packages
!pip install newsapi-python google-search-results pandas numpy requests tweepy praw

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Building wheels for collected packages: google-search-results
  Building wheel for google-search-results (setup.

In [3]:
# importing the required modules
from google.colab import userdata
import requests
import pandas as pd
import numpy as np
from serpapi import GoogleSearch
import tweepy
import praw

In [4]:
# ---- Configuration ----

# Search expression handed to every data source.
QUERY = "industry trends OR competitor analysis OR market insights OR Artificial Intelligence"

# Number of items requested from NewsAPI (pageSize) and SerpAPI (num).
MAX_RESULTS = 100

# Credentials are read from the Colab `userdata` secrets store, never hardcoded.
NEWSAPI_KEY = userdata.get("NEWS_API_KEY")  # issued at https://newsapi.org
SERPAPI_KEY = userdata.get("SERP_API_KEY")  # issued at https://serpapi.com

# Issued at https://developer.x.com. Twitter's API limits are tight, so only a
# small batch of relevant tweets is meant to be fetched.
TWITTER_BEARERTOKEN = userdata.get("TWITTER_BEARERTOKEN")

# Reddit application credentials used by PRAW.
REDDIT_USER_AGENT = userdata.get("REDDIT_USER_AGENT")
REDDIT_CLIENT_ID = userdata.get("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = userdata.get("REDDIT_CLIENT_SECRET")

In [5]:
#collect data from newsapi
def fetch_newsapi(query):
    """Fetch articles matching ``query`` from the NewsAPI ``/v2/everything`` endpoint.

    Args:
        query: Search expression sent as the ``q`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``url``,
        ``publishedAt``, ``source``, ``type`` (always ``"news"``) and
        ``content``. An empty list is returned (after printing the error) when
        the request fails or the API answers with a non-200 status.
    """
    # Passing values through ``params`` lets requests URL-encode them, so a
    # query containing "&", "#" or "+" cannot corrupt the query string.
    params = {"q": query, "pageSize": MAX_RESULTS, "apiKey": NEWSAPI_KEY}
    try:
        # The timeout keeps an unresponsive server from hanging the cell forever.
        response = requests.get(
            "https://newsapi.org/v2/everything", params=params, timeout=30
        )
    except requests.RequestException as exc:
        print("NewsAPI Error:", exc)
        return []

    if response.status_code != 200:
        print("NewsAPI Error:", response.text)
        return []

    # ``or []`` / ``or {}`` guard against explicit JSON nulls, which
    # ``dict.get(key, default)`` does not replace with the default.
    articles = response.json().get("articles") or []

    return [
        {
            "title": art.get("title"),
            "description": art.get("description"),
            "url": art.get("url"),
            "publishedAt": art.get("publishedAt"),
            "source": (art.get("source") or {}).get("name"),
            "type": "news",
            "content": art.get("content"),
        }
        for art in articles
    ]

In [6]:
#collect data from serp api
def fetch_serpapi(query):
    """Fetch Google News results for ``query`` through SerpAPI.

    Args:
        query: Search expression sent as the ``q`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``url``,
        ``publishedAt``, ``source``, ``type`` (always ``"news"``) and
        ``content``. The result snippet is used for both ``description`` and
        ``content``. An empty list is returned when the response carries no
        ``news_results``.
    """
    search = GoogleSearch({
        "q": query,
        "api_key": SERPAPI_KEY,
        "tbm": "nws",  # restrict the search to the news vertical
        "num": MAX_RESULTS,
    })

    payload = search.get_dict() or {}

    # Surface an API-reported problem instead of silently returning nothing,
    # mirroring how fetch_newsapi reports its errors.
    # NOTE(review): assumes SerpAPI reports failures under a top-level "error"
    # key -- confirm against the SerpAPI response documentation.
    if payload.get("error"):
        print("SerpAPI Error:", payload["error"])

    # ``or []`` also covers an explicit null value for "news_results".
    results = payload.get("news_results") or []

    return [
        {
            "title": item.get("title"),
            "description": item.get("snippet"),
            "url": item.get("link"),
            "publishedAt": item.get("date"),
            "source": item.get("source"),
            "type": "news",
            "content": item.get("snippet"),  # snippet doubles as content
        }
        for item in results
    ]

In [7]:
# #collect data from twitter
# def fetch_twitter(query):
#     client = tweepy.Client(bearer_token=TWITTER_BEARERTOKEN)

#     tweets = client.search_recent_tweets(
#         query=query, tweet_fields=["created_at", "author_id", "text"], max_results=20
#     )
#     tweet_data = []

#     if tweets.data:
#         for tweet in tweets.data:
#             tweet_data.append({
#                 "title": tweet.text[:70] + "...",
#                 "description": tweet.text,
#                 "url": f"https://twitter.com/i/web/status/{tweet.id}",
#                 "publishedAt": tweet.created_at,
#                 "source": "Twitter",
#                 "type": "tweet",
#                 "content": tweet.text  # Added full tweet as content
#             })
#     return tweet_data

In [8]:
#data from reddit

def fetch_reddit(query="AI market trends", max_words=200, limit=60):
    """Search all of Reddit for ``query`` and return the matching posts.

    Args:
        query: Search expression passed to the subreddit search.
        max_words: Maximum number of whitespace-separated words kept in
            ``content``; longer texts are cut and suffixed with ``"..."``.
        limit: Maximum number of submissions requested from Reddit.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``url``,
        ``publishedAt`` (a pandas Timestamp built from ``created_utc``),
        ``source`` (``"Reddit"``), ``type`` (``"reddit_post"``) and
        ``content``.
    """
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT,
    )

    submissions = reddit.subreddit("all").search(query, limit=limit)

    reddit_data = []
    for submission in submissions:
        body = submission.selftext
        title = submission.title

        # Posts without a text body fall back to their title.
        content = body if body else title
        words = content.split()
        if len(words) > max_words:
            content = " ".join(words[:max_words]) + "..."

        # Only mark the description as truncated when text was actually cut;
        # a short body is kept verbatim instead of gaining a spurious "...".
        if not body:
            description = title
        elif len(body) > 200:
            description = body[:200] + "..."
        else:
            description = body

        reddit_data.append({
            "title": title,
            "description": description,
            "url": f"https://www.reddit.com{submission.permalink}",
            "publishedAt": pd.to_datetime(submission.created_utc, unit="s"),
            "source": "Reddit",
            "type": "reddit_post",
            "content": content,
        })
    return reddit_data


In [9]:
# DATA COLLECTION
def collect_all_data(query):
    """Collect items for ``query`` from every enabled source into one DataFrame.

    Args:
        query: Search expression forwarded to NewsAPI, SerpAPI and Reddit.

    Returns:
        A DataFrame with one row per collected item and a normalised ``type``
        column, or an empty DataFrame when no source returned anything.
    """
    newsapi_data = fetch_newsapi(query)  # articles from NewsAPI
    serpapi_data = fetch_serpapi(query)  # articles from Google via SerpAPI
    # twitter_data = fetch_twitter(query)  # Twitter collection is currently disabled
    # Forward the caller's query; previously the module-level QUERY was used
    # here, so Reddit ignored the ``query`` argument.
    reddit_data = fetch_reddit(query=query)

    # Skip sources that returned nothing (or None).
    combined_data = []
    for batch in (newsapi_data, serpapi_data, reddit_data):
        if batch:
            combined_data.extend(batch)

    if not combined_data:
        print("No data collected!")
        return pd.DataFrame()

    df = pd.DataFrame(combined_data)

    # Normalise the type column, then let the URL's domain override it.
    df['type'] = df['type'].str.lower().fillna("news")
    # regex=False makes "." a literal dot instead of an any-character wildcard.
    is_tweet = df['url'].str.contains("twitter.com", case=False, na=False, regex=False)
    is_reddit = df['url'].str.contains("reddit.com", case=False, na=False, regex=False)
    df.loc[is_tweet, 'type'] = "tweet"
    df.loc[is_reddit, 'type'] = "reddit_post"

    return df

In [10]:

# Run the collection and persist the combined frame only when rows came back.
df = collect_all_data(QUERY)
if df.empty:
    print("No data to save.")
else:
    output_path = "industry_insights_clean.csv"
    df.to_csv(output_path, index=False)
    print(f"Data collected and saved to: {output_path}")

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Data collected and saved to: industry_insights_clean.csv


In [11]:
#  DATA CLEANING & SENTIMENT PREPROCESSING

import re

# Load the collected CSV, drop repeated (title, url) pairs and discard rows
# that have neither a title nor a description.
df = (
    pd.read_csv("industry_insights_clean.csv")
    .drop_duplicates(subset=["title", "url"])
    .dropna(subset=["title", "description"], how="all")
)

# Fill gaps from the next-best column: description <- title, content <- description.
df["description"] = df["description"].fillna(df["title"])
df["content"] = df["content"].fillna(df["description"])

#Standardize date format
def clean_date(date_val):
    try:
        return pd.to_datetime(date_val).strftime("%Y-%m-%d %H:%M:%S")
    except:
        return None

# Normalise every date to one string format; rows whose date cannot be parsed
# are dropped.
# NOTE(review): presumably this also drops rows from a source that returns
# relative dates (e.g. "2 hours ago") -- verify against the collected data.
df["publishedAt"] = df["publishedAt"].apply(clean_date)
df.dropna(subset=["publishedAt"], inplace=True)

#Trim whitespace in all string columns
str_cols = df.select_dtypes(include=['object']).columns
df[str_cols] = df[str_cols].apply(lambda x: x.str.strip())


def _content_or_fallback(row):
    """Return the row's content, or title + description when content is too short."""
    content = row["content"]
    if isinstance(content, str) and len(content) > 20:
        return content
    # Join only the parts that are present: str(NaN) would otherwise inject
    # the literal word "nan" into the text that is later scored for sentiment.
    parts = [str(row[col]) for col in ("title", "description") if pd.notna(row[col])]
    return " ".join(parts)


# Replace (not remove) empty or very short content, e.g. Reddit posts without
# a body, with the title and description.
df["content"] = df.apply(_content_or_fallback, axis=1)

#Remove HTML tags (regex-based approach)
def clean_html(text):
    """Strip HTML/XML-style tags from ``text``.

    Args:
        text: Value to clean; non-string values are converted with ``str``.

    Returns:
        The text with every ``<...>`` span removed, or the input unchanged
        when it is missing (None / NaN).
    """
    if pd.isna(text):
        return text
    # ``[^>]*`` also matches newlines, so a tag whose attributes wrap onto the
    # next line is removed too; ``.*?`` stops at a line break and left such
    # tags in place.
    return re.sub(r"<[^>]*>", "", str(text))

# Strip markup from the three free-text columns.
df = df.assign(
    content=df["content"].map(clean_html),
    title=df["title"].map(clean_html),
    description=df["description"].map(clean_html),
)

#Text preprocessing for sentiment analysis
def preprocess_text(text):
    """Normalise free text before sentiment scoring.

    The text is lower-cased, URLs and @mentions are removed, hashtags keep
    their word but lose the ``#``, every character other than ASCII letters,
    digits, whitespace and ``. , ! ?`` is dropped, and runs of whitespace are
    collapsed to a single space.

    Args:
        text: Value to normalise; non-string values are converted with ``str``.

    Returns:
        The normalised string, or the input unchanged when it is missing
        (None / NaN).
    """
    if pd.isna(text):
        return text
    # Coerce first (as clean_html does): a numeric cell read back from the CSV
    # has no .lower() and would otherwise raise AttributeError.
    text = str(text).lower()  # lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)  # remove mentions
    text = re.sub(r"#(\w+)", r"\1", text)  # remove hashtag symbol, keep word
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)  # remove special chars except basic punctuation
    text = re.sub(r"\s+", " ", text).strip()  # normalize spaces
    return text

# Normalise the three free-text columns and renumber the rows from zero.
df = df.assign(
    content=df["content"].map(preprocess_text),
    title=df["title"].map(preprocess_text),
    description=df["description"].map(preprocess_text),
).reset_index(drop=True)

# Persist the cleaned and preprocessed frame for the sentiment step.
cleaned_path = "preprocessed.csv"
df.to_csv(cleaned_path, index=False)
print(f"Cleaned & preprocessed data saved to: {cleaned_path}")


Cleaned & preprocessed data saved to: preprocessed.csv


In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd

# FinBERT checkpoint used for financial-domain sentiment scoring.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Reload the cleaned data written by the preprocessing step.
df = pd.read_csv("preprocessed.csv")

def analyze_sentiment(row):
    """Score one row's combined text with the FinBERT pipeline.

    Args:
        row: Mapping (DataFrame row or dict) with ``title``, ``description``
            and ``content`` entries.

    Returns:
        ``{"label": <lower-cased label>, "score": <float>}``; a neutral label
        with score 0.0 when the row has no usable text.
    """
    # Combine title + description + content, skipping missing values (which
    # would otherwise be rendered as the literal word "nan") and exact repeats
    # (description and content are often the same snippet).
    parts = []
    for field in ("title", "description", "content"):
        value = row[field]
        if pd.isna(value):
            continue
        piece = str(value).strip()
        if piece and piece not in parts:
            parts.append(piece)
    text = " ".join(parts)

    if not text:
        return {"label": "neutral", "score": 0.0}

    # Let the tokenizer truncate to the model's 512-token limit; slicing the
    # string to 512 characters cut the input far short of that limit.
    result = finbert(text, truncation=True, max_length=512)[0]
    return {"label": result['label'].lower(), "score": result['score']}

# Score every row, then unpack the returned dicts into a label column and a
# score column.
sentiment_results = df.apply(analyze_sentiment, axis=1)
df["sentiment"] = sentiment_results.map(lambda outcome: outcome["label"])
df["sentiment_score"] = sentiment_results.map(lambda outcome: outcome["score"])

# Write the frame, now including sentiment and score, to disk.
output_path = "industry_insights_with_financial_sentiment.csv"
df.to_csv(output_path, index=False)
print(f"Sentiment analysis completed and saved to: {output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cpu


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Sentiment analysis completed and saved to: industry_insights_with_financial_sentiment.csv
