In [17]:
# installing the required libraries and packages
!pip install newsapi-python google-search-results pandas numpy requests tweepy



In [18]:
# importing the required modules
from google.colab import userdata
import requests
import pandas as pd
import numpy as np
from serpapi import GoogleSearch
import tweepy

In [19]:
# ===================== CONFIGURATION =====================

# Search expression handed to each data source by collect_all_data().
QUERY = (
    "industry trends OR competitor analysis OR "
    "market insights OR Artificial Intelligence"
)

# Page size requested from NewsAPI and SerpAPI.
MAX_RESULTS = 100

# API credentials are looked up through Colab's `userdata` helper instead of
# being written into the notebook.
NEWSAPI_KEY = userdata.get("NEWS_API_KEY")  # issued at https://newsapi.org
SERPAPI_KEY = userdata.get("SERP_API_KEY")  # issued at https://serpapi.com
# Issued at https://developer.x.com. Twitter's limits are tight, so only 30
# relevant tweets are fetched.
TWITTER_BEARERTOKEN = userdata.get("TWITTER_BEARERTOKEN")

In [20]:
# ===================== COLLECTING DATA FROM NEWSAPI =====================
def fetch_newsapi(query):
    """Fetch articles matching ``query`` from NewsAPI's ``/v2/everything`` endpoint.

    Args:
        query: Search expression forwarded as the ``q`` request parameter.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``url``,
        ``publishedAt``, ``source`` and ``type`` (always ``"news"``). The list
        is empty when the request fails, the API answers with a non-200 status,
        or no articles are returned.
    """
    # Let requests build the query string: interpolating the raw query into the
    # URL corrupts the request as soon as the query contains '&', '#' or '+'.
    params = {"q": query, "pageSize": MAX_RESULTS, "apiKey": NEWSAPI_KEY}

    try:
        # The timeout keeps the notebook from hanging on a stalled connection.
        response = requests.get(
            "https://newsapi.org/v2/everything", params=params, timeout=30
        )
    except requests.RequestException as exc:
        # Report transport failures the same way as API-level failures below.
        print("NewsAPI Error:", exc)
        return []

    # Anything other than 200 OK is reported and treated as "no data".
    if response.status_code != 200:
        print("NewsAPI Error:", response.text)
        return []

    # `or []` / `or {}` cover explicit JSON nulls, which dict.get()'s default
    # does not (the default only applies when the key is absent).
    articles = response.json().get("articles") or []
    return [
        {
            "title": art.get("title"),
            "description": art.get("description"),
            "url": art.get("url"),
            "publishedAt": art.get("publishedAt"),
            "source": (art.get("source") or {}).get("name"),
            "type": "news",
        }
        for art in articles
    ]

In [21]:
# ===================== COLLECTING DATA FROM SERPAPI =====================
def fetch_serpapi(query):
    """Fetch Google News results for ``query`` through SerpAPI.

    Args:
        query: Search keywords forwarded as the ``q`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``url``,
        ``publishedAt``, ``source`` and ``type`` (always ``"news"``), built
        from the ``news_results`` field of the response. Empty when that field
        is missing or empty.
    """
    search = GoogleSearch({
        "q": query,
        "api_key": SERPAPI_KEY,
        "tbm": "nws",        # restrict the search to Google News
        "num": MAX_RESULTS,  # requested number of results
    })
    payload = search.get_dict()

    # Surface the response's "error" field (presumably set by SerpAPI for a bad
    # key, exhausted quota, etc. -- see its docs) instead of silently returning
    # an empty list, mirroring how fetch_newsapi reports API errors.
    error = payload.get("error")
    if error:
        print("SerpAPI Error:", error)

    # `or []` also covers an explicit null in the response.
    return [
        {
            "title": item.get("title"),
            "description": item.get("snippet"),
            "url": item.get("link"),
            "publishedAt": item.get("date"),
            "source": item.get("source"),
            "type": "news",
        }
        for item in payload.get("news_results") or []
    ]

In [22]:
# ===================== COLLECTING DATA FROM TWITTER =====================
def fetch_twitter(query, max_results=30):
    """Fetch recent tweets matching ``query`` via the Twitter API (Tweepy client).

    Args:
        query: Search keywords or hashtags.
        max_results: Number of tweets requested in the single search call.
            Defaults to 30; per the Tweepy documentation the API accepts
            values between 10 and 100.

    Returns:
        A list of dicts with the keys ``title`` (a preview of at most 70
        characters, followed by ``"..."`` only when the text was cut),
        ``description`` (full text), ``url``, ``publishedAt``, ``source``
        (always ``"Twitter"``) and ``type`` (always ``"tweet"``). Empty when
        nothing matched or the request failed.
    """
    # Authenticate with the app-level bearer token.
    client = tweepy.Client(bearer_token=TWITTER_BEARERTOKEN)

    try:
        # tweet_fields asks for the timestamp and author in addition to the
        # default id/text fields.
        tweets = client.search_recent_tweets(
            query=query,
            tweet_fields=["created_at", "author_id"],
            max_results=max_results,
        )
    except tweepy.TweepyException as exc:
        # A Twitter failure (rate limit, auth, ...) should not discard the data
        # already gathered from the other sources, so report it and move on.
        print("Twitter Error:", exc)
        return []

    tweet_data = []
    # `tweets.data` is falsy when there are no matches.
    for tweet in tweets.data or []:
        text = tweet.text
        # Only mark the preview as truncated when something was actually cut.
        preview = text if len(text) <= 70 else text[:70] + "..."
        tweet_data.append({
            "title": preview,
            "description": text,
            "url": f"https://twitter.com/i/web/status/{tweet.id}",
            "publishedAt": tweet.created_at,
            "source": "Twitter",
            "type": "tweet",
        })
    return tweet_data

In [23]:
# ===================== DATA COLLECTION =====================
def collect_all_data(query):
    """Run every fetcher for ``query`` and merge the results into one DataFrame.

    Args:
        query: Search expression passed unchanged to fetch_newsapi,
            fetch_serpapi and fetch_twitter (in that order).

    Returns:
        A DataFrame with one row per collected item, or an empty DataFrame
        (after printing a notice) when no source returned anything.
    """
    combined_data = []
    # Extending with an empty list is a no-op, so one loop replaces the three
    # per-source emptiness checks; `or []` tolerates a fetcher returning None.
    for fetch in (fetch_newsapi, fetch_serpapi, fetch_twitter):
        combined_data.extend(fetch(query) or [])

    if not combined_data:
        print("No data collected!")
        return pd.DataFrame()

    df = pd.DataFrame(combined_data)

    # Normalise the type column: lower-case it and default missing values to "news".
    df['type'] = df['type'].str.lower().fillna("news")
    # Any row whose URL points at twitter.com is a tweet. regex=False makes this
    # a literal substring match; as a regex the '.' would match any character
    # (e.g. "twitterXcom").
    is_tweet = df['url'].str.contains("twitter.com", case=False, na=False, regex=False)
    df.loc[is_tweet, 'type'] = "tweet"

    return df

In [24]:
# ===================== RUN SCRIPT =====================
# Collect from every source, then persist the combined table as CSV
# (nothing is written when the collection came back empty).
df = collect_all_data(QUERY)
if df.empty:
    print("No data to save.")
else:
    output_path = "industry_insights_clean.csv"
    # index=False keeps the DataFrame's row index out of the file.
    df.to_csv(output_path, index=False)
    print(f"Data collected and saved to: {output_path}")

Data collected and saved to: industry_insights_clean.csv
