Data fetching for tourist attractions in the USA

In [42]:
import requests
import pandas as pd

# Overpass API endpoint
url = "https://overpass-api.de/api/interpreter"

# Overpass QL query: select every node tagged tourism=attraction inside the
# area whose name is "United States", returned as JSON.
query = """
[out:json];
area[name="United States"]->.searchArea;
node["tourism"="attraction"](area.searchArea);
out body;
"""

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on an unresponsive server; the read value is deliberately
# generous and can be tuned.
OVERPASS_TIMEOUT = (10, 300)

# Send request to Overpass API
response = requests.post(url, data={"data": query}, timeout=OVERPASS_TIMEOUT)
# Surface a non-2xx response as an HTTPError here rather than letting
# response.json() fail later on a body that may not be JSON.
response.raise_for_status()
data = response.json()

# One dict per attraction; converted to a DataFrame in a single step below.
attractions = []

# Flatten each returned element into a row. Missing tags fall back to
# 'Unknown' / 'N/A' placeholders.
for element in data['elements']:
    tags = element.get('tags', {})
    attractions.append({
        "Name": tags.get('name', 'Unknown'),
        "Type": tags.get('tourism', 'Unknown'),
        "Latitude": element.get('lat', 'N/A'),
        "Longitude": element.get('lon', 'N/A'),
        "Description": tags.get('description', 'N/A'),
        "Website": tags.get('website', 'N/A'),
        "Opening Hours": tags.get('opening_hours', 'N/A'),
    })

# Create a DataFrame
df = pd.DataFrame(attractions)

# Display the DataFrame to the user
# print(df)


KeyboardInterrupt: 

In [5]:
# Optional: persist the attractions table as a UTF-8 CSV without the index column.
attractions_csv_path = "/Users/rahul/Downloads/attractions.csv"
df.to_csv(attractions_csv_path, index=False, encoding="utf-8")
print("Data saved to attractions.csv")


Data saved to attractions.csv


Fetching news articles about Tesla

In [2]:
from newsapi import NewsApiClient
import pandas as pd
import datetime
import time


In [12]:
from newsapi import NewsApiClient
import pandas as pd
import datetime
import numpy as np
from dateutil.relativedelta import relativedelta
from datetime import date

# NewsAPI client; the key below is a placeholder to be replaced.
newsapi = NewsApiClient(api_key='your api key')

# Tesla news is gathered one date window at a time instead of in a single
# request over the whole range (the original note says NewsAPI often doesn't
# allow large date ranges in one request). Adjust the window to your timeframe.

all_articles = []
end_fetch_date = datetime.datetime.today()
current_date = datetime.datetime(2017, 10, 1)
delta = datetime.timedelta(days=15)  # width of each fetch window (15 days)

# Query arguments that are identical for every window.
fixed_query = dict(
    q='(tesla OR TSLA)',
    sources='bloomberg,business-insider,financial-post,fortune,the-wall-street-journal,reuters,bbc-news',
    language='en',
    sort_by='relevancy',
    page_size=100,
)

# NOTE: `time` is not imported in this cell; it comes from the earlier import cell.
while current_date < end_fetch_date:
    next_date = current_date + delta
    from_param = current_date.strftime('%Y-%m-%d')
    to_param = next_date.strftime('%Y-%m-%d')

    try:
        payload = newsapi.get_everything(
            from_param=from_param,
            to=to_param,
            **fixed_query
        )
        batch = payload.get('articles')
        if batch:
            all_articles.extend(batch)
    except Exception as e:
        # Best effort: report the failed window and continue with the next one.
        print(f"Error fetching articles from {from_param} to {to_param}: {e}")

    # Advance to the next window, pausing one second between requests.
    current_date = next_date
    time.sleep(1)
print(f"Total articles fetched: {len(all_articles)}")

Total articles fetched: 17439


In [9]:
# Convert the accumulated list of article records (`all_articles`) into a
# DataFrame, one row per article.
news_df = pd.DataFrame(all_articles)

In [11]:
# Write the article table to CSV, UTF-8 encoded and without the index column.
news_df.to_csv("/Users/rahul/Downloads/news_data_tesla_15.csv", index=False, encoding="utf-8")

In [1]:
from newsapi import NewsApiClient
from newspaper import Article
import datetime
import pandas as pd
import time

# ---- Configuration for the fetch-and-scrape run ----

# NewsAPI client. The key below is a placeholder; replace it with your own.
newsapi = NewsApiClient(api_key='**')

# Search query per company key (add entries to cover more companies).
companies = {'tsla': '(tesla OR TSLA)'}

# Comma-separated NewsAPI source ids to search.
# The original note flags that some may be paywalled (e.g. WSJ); drop them,
# or keep them if headlines alone are enough.
sources = 'the-wall-street-journal,bloomberg,business-insider,financial-post,fortune'

# Date range to walk and the step between consecutive requests.
current_date = datetime.datetime(2017, 10, 1)
end_date = datetime.datetime.today()
delta = datetime.timedelta(weeks=1)  # one-week windows

# Collects the article records produced by every fetch call.
all_articles = []

# Output CSV path
# output_dir = "/PATH/TO/OUTPUT/tsla_news_fetch.csv"

def scrape_article(url):
    """
    Download and parse the page at ``url`` via ``Article`` and return its
    extracted text with surrounding whitespace stripped.

    Any exception raised along the way is printed and an empty string is
    returned instead, so the caller can fall back to other content.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        # .text is read only after parse() has run
        return page.text.strip()
    except Exception as exc:
        print(f"Error scraping article from {url} - {exc}")
        # An empty result lets the caller fall back to NewsAPI content
        return ''

# Main loop: walk from `current_date` to `end_date` in steps of `delta`,
# query NewsAPI for each window, scrape every usable hit and collect one
# record per article in `all_articles`.
while current_date < end_date:
    next_date = current_date + delta
    from_param = current_date.strftime('%Y-%m-%d')
    to_param = next_date.strftime('%Y-%m-%d')

    print(f"Fetching articles from {from_param} to {to_param}...")

    try:
        # This example only does Tesla,
        # but you can iterate over companies dict if you want multiple
        articles_response = newsapi.get_everything(
            q=companies['tsla'],   # search string
            sources=sources,
            from_param=from_param,
            to=to_param,
            language='en',
            sort_by='relevancy',
            page_size=100
        )

        # Ensure we have some articles to process
        if articles_response["status"] == "ok" and articles_response["totalResults"] > 0:
            for i, article in enumerate(articles_response["articles"]):
                # In some cases, NewsAPI returns "[Removed]" for the title
                if article['title'] and article['title'] != '[Removed]':

                    # Attempt to get the full text from the URL
                    scraped_content = scrape_article(article['url'])

                    # If scraping produced nothing, fall back to the short
                    # fields NewsAPI itself returned.
                    if not scraped_content:
                        # Join only the fields that are present. Concatenating
                        # with `+` raised TypeError whenever 'content' or
                        # 'description' was None, and the except below then
                        # discarded the rest of this window's articles.
                        scraped_content = ' '.join(
                            article[field]
                            for field in ('title', 'content', 'description')
                            if article.get(field)
                        )

                    # A null 'source' must not abort the window either.
                    source_info = article.get('source') or {}

                    article_data = {
                        "Symbol": 'TSLA',  # If you iterate over multiple, use the key from your dict
                        "Title": article.get('title', ''),
                        "Content": scraped_content,
                        "Description": article.get('description', ''),
                        "Published At": article.get('publishedAt', ''),
                        "URL": article.get('url', ''),
                        "Source": source_info.get('name', ''),
                        "Author": article.get('author', '')
                    }

                    all_articles.append(article_data)
                    print(f"  -> Appended article #{i+1} from {article_data['Source']}")

    except Exception as e:
        # Best effort: report the failed window and move on to the next one.
        print(f"Error fetching articles from {from_param} to {to_param}: {e}")

    # Update current_date to move to the next interval
    current_date = next_date

    # Sleep a bit to respect rate limits & not overload servers
    time.sleep(1)

# Convert our list of dicts into a DataFrame
df = pd.DataFrame(all_articles)


Fetching articles from 2017-10-01 to 2017-10-08...
Error fetching articles from 2017-10-01 to 2017-10-08: {'status': 'error', 'code': 'apiKeyInvalid', 'message': 'Your API key is invalid or incorrect. Check your key, or go to https://newsapi.org to create a free API key.'}


KeyboardInterrupt: 

In [38]:
# Display settings: no cap on the number of columns shown, no wrapping of
# wide frames across multiple blocks, and no truncation of cell contents.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [18]:
# Destination of the sample CSV (note: despite its name, `output_dir` holds a file path).
output_dir = "/Users/rahul/Downloads/news_data_tesla_sample.csv"

# Write whichever DataFrame is currently bound to `df` (both the attractions
# cell and the scraping cell assign that name) as UTF-8 CSV without the index.
df.to_csv(output_dir, index=False, encoding="utf-8")

# Summary: number of collected article records and where the file went.
article_count = len(all_articles)
print(f"Total articles fetched: {article_count}")
print(f"CSV saved to {output_dir}")


Total articles fetched: 57
CSV saved to /Users/rahul/Downloads/news_data_tesla_sample.csv
