## Project Overview: 
### Implement a system to collect data from various sources (e.g., APIs, web scraping) and store it. Here we are collecting data related to WEATHER CONDITIONS.

## Objectives:
### Learn to extract data from different sources.

## Skills: 
### API usage, Web scraping.

## Tools & Technologies: 
### Python (requests, BeautifulSoup), Pandas.

## Part 1 - Data Collection using Web scraping

### Example 1: Weather data - Web scraping

In [1]:
#Importing required libraries
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
# URL of the weather page
#global url
url = 'https://weather.com/en-IN/weather/tenday/l/2ee1b1ff0937b45eb8371dd08a6f8edc584f9b0968440eb89999e49fa185c55e'

response = requests.get(url)

In [3]:
#response.content - bytes will be the type of response and as no decoding is involved, as We are extracting raw data 

In [4]:
#response.text - If we use response.text then type of response will be str as this contains the content of the response as unicode string

In [5]:
#Parse HTML Content
soup = bs(response.content, "html.parser")
#Printing only first occurence of meta tag here in the head tag 
print(soup.head.meta.prettify())

<meta charset="utf-8" data-react-helmet="true"/>



In [6]:
#Saving the HTML as text file
with open("10Day_Weather.txt",'w') as file:
    file.write(str(soup.prettify()))
print("File got created successfully")

File got created successfully


In [7]:
# Function to scrape weather data
def scrape_weather_data(url):
    # Send a request to fetch the HTML content
    response = requests.get(url)
    response.raise_for_status()  # Check for request errors

    # Parse the HTML content
    soup = bs(response.text, 'html.parser')

    # Initialize a list to hold the weather data
    weather_data = []

    # Find the weather table on the page
    # Note: The HTML structure may vary; adjust selectors accordingly
    weather_table = soup.find('div', class_='DailyForecast--DisclosureList--nosQS')

    # Iterate over each day of the month
    for day in weather_table.find_all('details'):
        date = day.find('span', class_='DailyContent--daypartDate--3VGlz').text
        temp = day.find('span', class_='DailyContent--temp--1s3a7').text
        rain = day.find('span', class_='DailyContent--value--1Jers').text
        wind = day.find('span', class_='Wind--windWrapper--3Ly7c DailyContent--value--1Jers DailyContent--windValue--JPpmk').text
        description = day.find('p', class_='DailyContent--narrative--3Ti6_').text
        details_list = day.find_all('span', class_='DetailsTable--value--2YD0-')
        details = [span.text for span in details_list]
        # humidity = day.find('span', class_='DetailsTable--value--2YD0-').text
        # uvindex = day.find('span', class_='DetailsTable--value--2YD0-').text
        # sunrise = day.find('span', class_='DetailsTable--value--2YD0-').text
        # sunset = day.find('span', class_='DetailsTable--value--2YD0-').text

        # Append the data to the list
        weather_data.append({
            'Date': date,
            'Avg Temp': temp,
            'Rain %': rain,
            'Wind': wind,
            'Humidity': details[0],
            'UV Index': details[1],
            'Sunrise': details[2],
            'Sunset': details[3],
            'Overview': description
        })

    return weather_data


In [8]:
# Function to save the data to a CSV file
def save_to_csv(data, filename='weather_data.csv'):
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)

In [9]:
# Main function to execute the scraping and saving
def main():
    data = scrape_weather_data(url)
    save_to_csv(data)
    print(f"Weather data has been saved to 'weather_data.csv'")

if __name__ == '__main__':
    main()

Weather data has been saved to 'weather_data.csv'


In [10]:
webscrape_data = pd.read_csv('weather_data.csv')
webscrape_data

Unnamed: 0,Date,Avg Temp,Rain %,Wind,Humidity,UV Index,Sunrise,Sunset,Overview
0,Sat 24,21°,4%,WSW 18 km/h,83%,0 of 11,22:16,10:20,Mostly cloudy. Low 21°C. Winds WSW at 15 to 25...
1,Sun 25,29°,7%,WSW 27 km/h,70%,Extreme,06:08,18:35,Mostly cloudy. High 29°C. Winds WSW at 15 to 3...
2,Mon 26,29°,24%,WSW 25 km/h,70%,9 of 11,06:08,18:34,Mostly cloudy. High 29°C. Winds WSW at 15 to 3...
3,Tue 27,29°,47%,WSW 22 km/h,70%,9 of 11,06:08,18:33,Thunderstorms developing in the afternoon. Sto...
4,Wed 28,29°,57%,WSW 21 km/h,71%,Extreme,06:08,18:33,Thunderstorms developing in the afternoon. Hig...
5,Thu 29,29°,69%,WSW 21 km/h,73%,Extreme,06:08,18:32,Thunderstorms developing in the afternoon. Hig...
6,Fri 30,29°,65%,WSW 21 km/h,72%,9 of 11,06:08,18:32,Thunderstorms developing in the afternoon. Hig...
7,Sat 31,28°,58%,W 20 km/h,72%,Extreme,06:08,18:31,Thunderstorms developing in the afternoon. Hig...
8,Sun 01,29°,15%,W 20 km/h,72%,Extreme,06:08,18:30,Mostly cloudy. High 29°C. Winds W at 15 to 25 ...
9,Mon 02,29°,60%,W 19 km/h,72%,Extreme,06:08,18:30,Thunderstorms developing in the afternoon. Hig...


## Part 2 - Data Collection from APIs

In [11]:
#I am collecting weather data from openmeteo weather API, Installing required libraries
import openmeteo_requests
import requests_cache
from retry_requests import retry

In [12]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

In [13]:
# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
	"latitude": 12.9719,
	"longitude": 77.5937,
	"start_date": "2024-07-22",
	"end_date": "2024-08-05",
	"hourly": ["relative_humidity_2m", "rain", "wind_speed_100m", "direct_radiation"],
	"daily": ["weather_code", "temperature_2m_mean", "sunrise", "sunset"],
	"timezone": "Asia/Singapore"
}
responses = openmeteo.weather_api(url, params=params)
responses[0]

<openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x105ed3790>

In [14]:
# Process first location. Add a for-loop for multiple locations or weather models
# I have selected only one location which is Bangalore, India
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

Coordinates 12.970123291015625°N 77.56363677978516°E
Elevation 918.0 m asl
Timezone b'Asia/Singapore' b'+08'
Timezone difference to GMT+0 28800 s


In [15]:
# Process hourly data. The order of variables needs to be the same as requested.
hourly = response.Hourly()
hourly_relative_humidity_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_rain = hourly.Variables(1).ValuesAsNumpy()
hourly_wind_speed_100m = hourly.Variables(2).ValuesAsNumpy()
hourly_direct_radiation = hourly.Variables(3).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}
hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
hourly_data["rain"] = hourly_rain
hourly_data["wind_speed_100m"] = hourly_wind_speed_100m
hourly_data["direct_radiation"] = hourly_direct_radiation

In [16]:
hourly_dataframe = pd.DataFrame(data = hourly_data)
hourly_dataframe.head()

Unnamed: 0,date,relative_humidity_2m,rain,wind_speed_100m,direct_radiation
0,2024-07-21 16:00:00+00:00,83.299065,0.0,35.069984,0.0
1,2024-07-21 17:00:00+00:00,84.596428,0.0,38.723015,0.0
2,2024-07-21 18:00:00+00:00,85.101303,0.0,37.017033,0.0
3,2024-07-21 19:00:00+00:00,86.419762,0.0,38.292225,0.0
4,2024-07-21 20:00:00+00:00,87.500008,0.0,38.26852,0.0


In [17]:
# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_weather_code = daily.Variables(0).ValuesAsNumpy()
daily_temperature_2m_mean = daily.Variables(1).ValuesAsNumpy()
daily_sunrise = daily.Variables(2).ValuesAsNumpy()
daily_sunset = daily.Variables(3).ValuesAsNumpy()

daily_data = {"date": pd.date_range(
	start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
	end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = daily.Interval()),
	inclusive = "left"
)}
daily_data["weather_code"] = daily_weather_code
daily_data["temperature_2m_mean"] = daily_temperature_2m_mean
daily_data["sunrise"] = daily_sunrise
daily_data["sunset"] = daily_sunset

In [18]:
daily_dataframe = pd.DataFrame(data = daily_data)
daily_dataframe.head()

Unnamed: 0,date,weather_code,temperature_2m_mean,sunrise,sunset
0,2024-07-21 16:00:00+00:00,51.0,22.959,0,0
1,2024-07-22 16:00:00+00:00,53.0,23.096498,0,0
2,2024-07-23 16:00:00+00:00,51.0,23.154831,0,0
3,2024-07-24 16:00:00+00:00,53.0,22.819414,0,0
4,2024-07-25 16:00:00+00:00,53.0,22.950666,0,0
