# Weather Analysis

In [None]:
# Dependencies
import sys
import numpy as np
import pandas as pd
from config import api_key
import json
import requests
import time
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [None]:
# Make the locally vendored site-packages directory importable.
# Nothing is installed here: the directory is only added to the module search path.
# The membership check keeps sys.path free of duplicate entries when the cell is re-run.
site_packages_dir = './lib/python3.8/site-packages'
if site_packages_dir not in sys.path:
    sys.path.append(site_packages_dir)

In [None]:
from citipy import citipy

# Smoke test: ask citipy for the city nearest to one fixed coordinate pair
# and print its name and country code.
probe_coordinates = (36.81964369493615, 11.053632348043188)
city = citipy.nearest_city(*probe_coordinates)
print(f"City Name: {city.city_name}, Country Code: {city.country_code}")

## API Calls:

### CitiPy

In [None]:
# Goal: collect at least 500 unique (non-repeat) cities spread over the globe.
# These bounds are consumed by the grid loop in a later cell.

# Latitudes range from -90 to 90, longitudes from -180 to 180.
minlat, maxlat = -90, 90
minlon, maxlon = -180, 180

# Running row number; row 0 is reserved for the column names.
i = 0

# Row number -> [city name, country code, latitude, longitude].
# The entry under key 0 is the header row that later becomes the DataFrame columns.
header_row = ["city_name", "country_code", "lat", "lon"]
random_cities = {i: header_row}


In [None]:
# Walk a regular latitude/longitude grid (7.2 degree steps in latitude, 6.4 degree
# steps in longitude), look up the nearest city for every grid point and store it
# in random_cities under a running row number.
#
# Row numbers restart at 1 on every run (key 0 holds the header row), so re-running
# this cell overwrites the same keys instead of appending a second copy of the grid.
# Note that the stored lat/lon are the grid point coordinates, not the city's own.

grid_points = (
    (lat, lon)
    for lat in np.arange(minlat, maxlat, 7.2)
    for lon in np.arange(minlon, maxlon, 6.4)
)

for i, (lat, lon) in enumerate(grid_points, start=1):
    city = citipy.nearest_city(lat, lon)
    cityName = city.city_name
    countryCode = city.country_code
    random_cities[i] = [cityName, countryCode, lat, lon]
        

In [None]:
# Turn the collected rows into a DataFrame so they can be checked and de-duplicated.
# The list stored under key 0 supplies the column names.
column_names = random_cities[0]
all_rows_df = pd.DataFrame.from_dict(random_cities, orient='index', columns=column_names)

# The first row is the header row itself (now redundant), so skip it.
random_cities_df = all_rows_df.iloc[1:]

# Cell output: number of non-null city names collected.
random_cities_df.city_name.count()

In [None]:
## Remove duplicates.
## Many neighbouring grid points resolve to the same nearest city, so keep only the
## first occurrence of each one. A city is identified by its name AND country code:
## de-duplicating on the name alone would also discard different cities that happen
## to share a name across countries.

random_cities_df = random_cities_df.drop_duplicates(subset=['city_name', 'country_code'])

# Cell output: number of unique cities left.
random_cities_df["city_name"].count()

In [None]:
# Preview the first five de-duplicated rows.
random_cities_df.head(n=5)

In [None]:
# Discard every row that has a missing value in any column
random_cities_df = random_cities_df.dropna(axis=0, how='any')

In [None]:
# Number of cities remaining after the clean-up steps.
random_cities_df.loc[:, "city_name"].count()

### OpenWeather API

In [None]:
## Perform a weather check on each of the cities using a series of successive API calls.
## This code was only used to do exploratory api calls

## set up base url for historical api
# url = f'http://api.openweathermap.org/data/2.5/onecall/timemachine?lat=-90.0&lon=-180.0&dt=1634445108&appid={api_key}'

## Let's try out some calls
# weather_response = requests.get(url)
# weather_json = weather_response.json()
# print(f"The weather API responded with: {weather_json}.")


In [None]:
## try out some json traversal
# print(f"The Temperature for Vaini is : {weather_json['current']['temp']}")

In [None]:
# First part of the OpenWeather "timemachine" request URL; the latitude value is
# appended directly after it, followed by the remaining query parameters.
# HTTPS is used because the API key is sent as part of the query string.
base_url = 'https://api.openweathermap.org/data/2.5/onecall/timemachine?lat='


In [None]:
# Capture the current Unix time once, as whole seconds.
# The weather requests in the next cell all pass this same value as their `dt` parameter.
current_epoch_seconds = time.time()
now = int(current_epoch_seconds)
print(now)

In [None]:
# !!! this code will take 15min to run - you can grab a cup of something
#
# Query the weather API once per city and collect temperature, humidity,
# cloudiness and wind speed into four parallel lists.
#
# Guarantees after this cell:
#   * the four lists always have the same length (a reading is appended to all of
#     them or to none of them);
#   * cities whose lookup failed are removed from random_cities_df, so the lists
#     stay aligned row-for-row with the DataFrame used in the following cells.

temp = []
humidity = []
cloudiness = []
wind_speed = []
failed_labels = []  # index labels of the cities whose lookup failed

for i in range(len(random_cities_df['city_name'])):
    citylog = random_cities_df.iloc[i, 0]
    lat = random_cities_df.iloc[i, 2]
    lon = random_cities_df.iloc[i, 3]
    query = f'{base_url}{lat}&lon={lon}&dt={now}&appid={api_key}&units=imperial'

    try:
        # The timeout keeps one unresponsive request from stalling the whole run.
        response = requests.get(query, timeout=10)
        response.raise_for_status()
        current = response.json()['current']
        # Read all four values before appending any of them, so a missing key
        # cannot leave the lists with different lengths.
        reading = (
            current['temp'],
            current['humidity'],
            current['clouds'],
            current['wind_speed'],
        )
    except (requests.RequestException, KeyError, TypeError, ValueError) as err:
        # Only the exception type is logged: the message of a requests error can
        # contain the full URL, which includes the API key.
        failed_labels.append(random_cities_df.index[i])
        print(f'{i} - {citylog} failed ({type(err).__name__})')
    else:
        temp.append(reading[0])
        humidity.append(reading[1])
        cloudiness.append(reading[2])
        wind_speed.append(reading[3])
        print(f'{i} - {citylog} processed successfully')

    # wait a second between calls, whether or not the call succeeded
    time.sleep(1)

# Drop the cities without a reading so the DataFrame matches the lists again.
if failed_labels:
    random_cities_df = random_cities_df.drop(index=failed_labels)
    print(f'{len(failed_labels)} cities dropped because no weather data could be retrieved')
    
    

In [None]:
# Combine the city coordinates and the collected weather readings into one table.
# The three Series come from random_cities_df; the four weather lists are matched
# to them by position.
cities = random_cities_df['city_name']
latitudes = random_cities_df['lat']
longitudes = random_cities_df['lon']

cities_weather = dict(
    city=cities,
    latitude=latitudes,
    longitude=longitudes,
    temperature=temp,
    humidity=humidity,
    cloudiness=cloudiness,
    wind_speed=wind_speed,
)
cities_weather_df = pd.DataFrame(data=cities_weather)

# Cell output: preview of the combined table.
cities_weather_df.head()

In [None]:
# Persist the combined table as CSV so later work does not depend on the API's rate limits
cities_weather_df.to_csv(path_or_buf="cities_weather_df.csv")

## Scatter Plots:

### Scatter plot for Temperature (F) vs. Latitude

In [None]:

# Scatter plot of temperature against latitude, saved as a PNG and then displayed.
fig, ax = plt.subplots()
ax.scatter(latitudes, temp, marker="o", facecolors="red", edgecolors='orange')

ax.set_title("Temperature (F) vs. Latitude")
ax.set_xlabel("Latitude")
ax.set_ylabel("Temperature (F)")

fig.savefig("temperature_latitude.png")
plt.show()

#### Explanation
The scatter follows a clear concave shape centered around the 0 latitude, the equator. The closer to the equator the higher the temperature.

In [None]:
# Humidity (%) vs. Latitude: scatter plot, saved as a PNG and then displayed
fig, ax = plt.subplots()
ax.scatter(latitudes, humidity, marker="^", facecolors="blue", edgecolors='black')

ax.set_title("Humidity (%) vs. Latitude")
ax.set_xlabel("Latitude")
ax.set_ylabel("Humidity (%)")

fig.savefig("humidity_latitude.png")
plt.show()

#### Explanation
Latitude does not seem to have an effect on the humidity level but rather the variance of humidity levels. On the equator, most humidity values are clustered on the higher range. 25 degrees above and below the equator tend to see very high to very low humidity levels.

In [None]:
# scatter plot for Cloudiness (%) vs. Latitude
# Marker "2" is an unfilled (line-only) marker. Matplotlib ignores `edgecolors` for
# unfilled markers and emits a UserWarning when one is passed, so the single colour
# is given through `color` instead; the markers are drawn in gray as before.
plt.scatter(latitudes, cloudiness, marker="2", color="gray")

plt.title("Cloudiness (%) vs. Latitude")
plt.xlabel("Latitude")
plt.ylabel("Cloudiness (%)")

plt.savefig("cloudiness_latitude.png")
plt.show()

#### Explanation
The markers are concentrated in 3 areas: on the equator, cloud coverage is mostly high, while around 25 degrees above and below the equator cloudiness is mostly low.

In [None]:
# Wind Speed (mph) vs. Latitude: scatter plot, saved as a PNG and then displayed
fig, ax = plt.subplots()
ax.scatter(latitudes, wind_speed, marker=4, facecolors="black", edgecolors='black')

ax.set_title("Wind Speed (mph) vs. Latitude")
ax.set_xlabel("Latitude")
ax.set_ylabel("Wind Speed (mph)")

fig.savefig("windspeed_latitude.png")
plt.show()

#### Explanation
The Wind speed data is more clustered around low windspeed in the northern hemisphere with outliers around the 50 degree line above and below the equator showing high wind speeds. 

## Linear Regressions:

In [None]:
# Split the world in two non-overlapping halves.
# A city exactly on the equator (latitude 0) is assigned to the northern half only;
# the southern half uses a strict comparison so no row appears in both frames.
northern_cities_df = cities_weather_df.loc[cities_weather_df['latitude'] >= 0, :]
southern_cities_df = cities_weather_df.loc[cities_weather_df['latitude'] < 0, :]

In [None]:
# Northern Hemisphere - Temperature (F) vs. Latitude
# Fit a straight line to temperature as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = northern_cities_df['latitude']
y_values = northern_cities_df['temperature']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='orange')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (6,10), fontsize=15, color="red")
ax.set_xlabel('Latitude, Northern Hemisphere')
ax.set_ylabel('Temperature (F)')
print(f"The r-squared is: {rvalue**2}")

fig.savefig("temperature_latitude_north.png")
plt.show()

In [None]:
# Southern Hemisphere - Temperature (F) vs. Latitude
# Fit a straight line to temperature as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = southern_cities_df['latitude']
y_values = southern_cities_df['temperature']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='orange')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (-60,-20), fontsize=15, color="red")
ax.set_xlabel('Latitude, Southern Hemisphere')
ax.set_ylabel('Temperature (F)')
print(f"The r-squared is: {rvalue**2}")
fig.savefig("temperature_latitude_south.png")
plt.show()


#### Analysis
Both linear regressions show a strong correlation between latitude and temperature, with high r-squared values.

In [None]:
# Northern Hemisphere - Humidity (%) vs. Latitude
# Fit a straight line to humidity as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = northern_cities_df['latitude']
y_values = northern_cities_df['humidity']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='blue')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (50,20), fontsize=15, color="red")
ax.set_xlabel('Latitude, Northern Hemisphere')
ax.set_ylabel('Humidity (%)')
print(f"The r-squared is: {rvalue**2}")
fig.savefig("humidity_latitude_north.png")
plt.show()

In [None]:
# Southern Hemisphere - Humidity (%) vs. Latitude
# Fit a straight line to humidity as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = southern_cities_df['latitude']
y_values = southern_cities_df['humidity']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='blue')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (-80,20), fontsize=15, color="red")
ax.set_xlabel('Latitude, Southern Hemisphere')
ax.set_ylabel('Humidity (%)')
print(f"The r-squared is: {rvalue**2}")
fig.savefig("humidity_latitude_south.png")
plt.show()

#### Analysis
Both r-squared values are quite low, which means latitude explains very little of the variation in humidity.

In [None]:
# Northern Hemisphere - Cloudiness (%) vs. Latitude
# Fit a straight line to cloudiness as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = northern_cities_df['latitude']
y_values = northern_cities_df['cloudiness']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='gray')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (50,40), fontsize=15, color="red")
ax.set_xlabel('Latitude, Northern Hemisphere')
ax.set_ylabel('Cloudiness (%)')
print(f"The r-squared is: {rvalue**2}")
fig.savefig("cloudiness_latitude_north.png")
plt.show()

In [None]:
# Southern Hemisphere - Cloudiness (%) vs. Latitude
# Fit a straight line to cloudiness as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = southern_cities_df['latitude']
y_values = southern_cities_df['cloudiness']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='gray')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (-80,15), fontsize=15, color="red")
ax.set_xlabel('Latitude, Southern Hemisphere')
ax.set_ylabel('Cloudiness (%)')
print(f"The r-squared is: {rvalue**2}")
fig.savefig("cloudiness_latitude_south.png")
plt.show()

#### Analysis
As with humidity, cloudiness cannot be explained by latitude, at least not by latitude alone. Proximity to bodies of water and weather systems might do a better job.

In [None]:
# Northern Hemisphere - Wind Speed (mph) vs. Latitude
# Fit a straight line to wind speed as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = northern_cities_df['latitude']
y_values = northern_cities_df['wind_speed']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='black')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (5,35), fontsize=15, color="red")
ax.set_xlabel('Latitude, Northern Hemisphere')
ax.set_ylabel('Wind Speed (mph)')
print(f"The r-squared is: {rvalue**2}")
fig.savefig("windspeed_latitude_north.png")
plt.show()

In [None]:
# Southern Hemisphere - Wind Speed (mph) vs. Latitude
# Fit a straight line to wind speed as a function of latitude, then draw the
# points, the fitted line and its equation, and report r-squared.
x_values = southern_cities_df['latitude']
y_values = southern_cities_df['wind_speed']
slope, intercept, rvalue, pvalue, stderr = linregress(x_values.astype(float), y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, facecolors='black')
ax.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates.
ax.annotate(line_eq, (-50,40), fontsize=15, color="red")
ax.set_xlabel('Latitude, Southern Hemisphere')
ax.set_ylabel('Wind Speed (mph)')
print(f"The r-squared is: {rvalue**2}")
fig.savefig("windspeed_latitude_south.png")
plt.show()

#### Analysis
Usually pressure systems and landscape affect wind speed, not latitude. In fact, both linear regressions show a low r-squared and a weak correlation.


### Limitations:
The data collected from OpenWeather is the current weather measured today. Considering we are running this analysis in Fall, there will be a difference between the Northern Hemisphere and the Southern Hemisphere. Also, the API was called at a single point in the day. To get a more accurate measurement, we should have collected the minimum and maximum temperatures for the day.

### Final Analysis
Having performed the same analysis for all 4 dependent variables, temperature, humidity, cloudiness and wind speed, temperature is the only one showing a clear positive correlation to the measure of proximity to the equator. So in other words "Duh, it does get hotter, and I can prove it!" :)