In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from scipy.stats import linregress
import scipy.stats as st
from pprint import pprint 

# Import API key
from api_keys import weather_api_key

# Incorporated citipy to determine city based on latitude and longitude
from citipy import citipy

# Output File (CSV)
# NOTE(review): this path is not referenced by any of the cells visible here — confirm the CSV is actually written somewhere.
output_data_file = "output_data/cities.csv"
# output_data_file = "Homework_Output/cities.csv"

# Range of latitudes and longitudes: (low, high) bounds passed to the random coordinate sampler
lat_range = (-90, 90)
lng_range = (-180, 180)

In [4]:
# Unique city names in first-seen order, plus a set for constant-time duplicate checks
cities = []
seen_cities = set()

# Create a set of random lat and lng combinations
lats = np.random.uniform(lat_range[0], lat_range[1], size=1500)
lngs = np.random.uniform(lng_range[0], lng_range[1], size=1500)
lat_lngs = zip(lats, lngs)

# Identify nearest city for each lat, lng combination
for lat, lng in lat_lngs:
    city = citipy.nearest_city(lat, lng).city_name

    # Several sample points can resolve to the same city; keep each name once
    if city not in seen_cities:
        seen_cities.add(city)
        cities.append(city)

# Print the city count to confirm sufficient count
len(cities)

651

In [5]:
# Print the full list of unique city names collected by the sampling cell
print(cities)

['cape town', 'najran', 'castro', 'severo-kurilsk', 'ushuaia', 'hobart', 'albany', 'jamestown', 'geraldton', 'dunedin', 'khatanga', 'tamandare', 'bluff', 'shadegan', 'high level', 'puerto ayora', 'shingu', 'nizhneyansk', 'laramie', 'ahipara', 'tevriz', 'ereymentau', 'bochnia', 'taolanaro', 'krasnyy chikoy', 'bambous virieux', 'hilo', 'dauphin', 'new norfolk', 'flinders', 'vaitupu', 'nome', 'atuona', 'port elizabeth', 'barrow', 'lebu', 'illoqqortoormiut', 'asfi', 'rikitea', 'cochrane', 'vaini', 'louisbourg', 'trairi', 'upernavik', 'mataura', 'naze', 'broome', 'el carrizo', 'fevralsk', 'praya', 'saldanha', 'saint george', 'boa vista', 'fort morgan', 'saleaula', 'kuche', 'bud', 'mahebourg', 'algiers', 'punta arenas', 'victoria', 'talakan', 'bredasdorp', 'dikson', 'tautira', 'tiksi', 'belushya guba', 'busselton', 'kruisfontein', 'ponta do sol', 'fort nelson', 'mar del plata', 'grand gaube', 'otane', 'alugan', 'fort-shevchenko', 'ostrovnoy', 'oloron-sainte-marie', 'katsina', 'hasaki', 'barr

In [6]:
# Base url for api
url = "http://api.openweathermap.org/data/2.5/weather?"

# Create lists to store response data (one entry per successfully retrieved city)
weather_data = []
temp = []
humidity = []
cloud = []
wind = []
country = []
date = []
city_name = []
city_lats = []
city_lngs = []

# Output record Counter and Total count
record = 1
total_record = len(cities)

# Request imperial units so temperatures are returned in Fahrenheit instead of Kelvin
units = 'imperial'

# Print Intial Data Log Statement
print(f"Beginning Data Retrieval")
print(f"-------------------------------")

# Create a loop to go through all the cities & perform the requests for data on each entry
for city in cities:

    # Let requests build and URL-encode the query string (city names can contain spaces)
    query_params = {"appid": weather_api_key, "units": units, "q": city}

    try:
        response = requests.get(url, params=query_params, timeout=10).json()
    except (requests.exceptions.RequestException, ValueError) as err:
        # Report only the exception type: the full message can contain the request URL,
        # and with it the API key
        print(f"Request failed ({type(err).__name__}). Skipping...")
        continue

    # A rejected API key makes every request fail, so stop instead of skipping every city
    if isinstance(response, dict) and str(response.get("cod")) == "401":
        raise RuntimeError("OpenWeatherMap rejected the API key (HTTP 401); check weather_api_key")

    try:
        # Read every field before appending anything, so a response that lacks one key
        # cannot leave the lists with different lengths
        found_name = response['name']
        found_lat = response['coord']['lat']
        found_lng = response['coord']['lon']
        found_temp = response['main']['temp_max']
        found_humidity = response['main']['humidity']
        found_cloud = response['clouds']['all']
        found_wind = response['wind']['speed']
        found_country = response['sys']['country']
        found_date = response['dt']
    except (KeyError, TypeError):
        print("City not found. Skipping...")
        continue

    city_name.append(found_name)
    city_lats.append(found_lat)
    city_lngs.append(found_lng)
    temp.append(found_temp)
    humidity.append(found_humidity)
    cloud.append(found_cloud)
    wind.append(found_wind)
    country.append(found_country)
    date.append(found_date)

    print(f"Processing Record {record} of {total_record}  | {city}" )

    # Increase the record count by one for each city that was stored
    record += 1

# Downstream cells read the coordinates from `lats`/`lngs`. Rebind them to the
# coordinates reported by the API so they line up row for row with the other
# lists; the random sample points neither correspond to the retrieved cities
# nor have the same length.
lats = city_lats
lngs = city_lngs

# Print Ending Data Log Statement 
print(f"-------------------------------")
print(f"Data Retrieval Complete")
print(f"-------------------------------")

Beginning Data Retrieval
-------------------------------
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Skipping...
City not found. Ski

KeyboardInterrupt: 

In [None]:
# Create a DataFrame containing 'City ', 'Country', 'Latitude', 'Longitude',  'Temperature', 'Humidity', 'Cloudiness', 'Wind Speed', 'Date'
# Each list becomes its own Series so every column keeps a natural dtype
# (names stay objects, measurements stay numeric) instead of forcing an integer
# dtype onto string data. Series of different lengths are aligned on their
# positional index and padded with NaN; such rows are removed by the dropna()
# step that follows.
# NOTE(review): `lats`/`lngs` must line up element for element with the other
# lists (one entry per retrieved city) — confirm they hold the cities' own
# coordinates and not unrelated sample points.
weather_columns = {
    'City ': city_name,
    'Country': country,
    'Latitude': lats,
    'Longitude': lngs,
    'Temperature': temp,
    'Humidity': humidity,
    'Cloudiness': cloud,
    'Wind Speed': wind,
    'Date': date,
}
weather_data = pd.DataFrame({label: pd.Series(values) for label, values in weather_columns.items()})

# Display DataFrame (first rows only)
weather_data.head()


In [None]:
# Inspect the dtype of each column in the assembled DataFrame
weather_data.dtypes

In [None]:
# Drop any rows containing missing values.
# .copy() makes weather_clean an independent frame: later cells assign converted
# columns back onto it, which on a bare dropna() result can trigger pandas'
# SettingWithCopyWarning.
weather_clean = weather_data.dropna(how='any').copy()
weather_clean

In [None]:
# Summary statistics for the cleaned weather data
weather_clean.describe()

In [None]:
# Select the cities whose humidity reading is above 100; an empty result means this step can be skipped
humidity_over_100 = weather_clean['Humidity'] > 100
weather_cities = weather_clean.loc[humidity_over_100]
weather_cities

In [None]:
# Convert the coordinate and measurement columns to numeric dtypes,
# requesting the smallest integer type pandas can use for each column
for column in ['Latitude', 'Temperature', 'Humidity', 'Cloudiness', 'Longitude']:
    weather_clean[column] = pd.to_numeric(weather_clean[column], downcast="integer")

In [None]:
# Convert wind speed to a numeric dtype as well, then show the resulting column dtypes
wind_speed_numeric = pd.to_numeric(weather_clean['Wind Speed'], downcast="integer")
weather_clean['Wind Speed'] = wind_speed_numeric
weather_clean.dtypes

In [None]:
# Create individual scatter plots for temperature, humidity, cloudiness, and wind speed against latitude data

# Scatter plot of max temperature against latitude, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.scatter(weather_clean['Latitude'], weather_clean['Temperature'], marker='o')

ax.set_title('Temperature (F) vs Latitude (11/05/2020)')
ax.set_xlabel('Latitude')
ax.set_ylabel('Max Temperature (F)')
ax.grid(True)

# Save a copy of the figure, then render it
fig.savefig("lat_vs_temp.png")
plt.show()


The scatter plot above shows every city in the data table; the DataFrame column names supply the data for each axis, and axis labels and a title have been added.
The results don't appear to show any correlation without further analysis of the data.

In [None]:
# Create scatter plot of humidity vs latitude

plt.scatter(weather_clean['Latitude'], weather_clean['Humidity'], marker='o')

# The title names the quantities actually plotted: humidity (y) against latitude (x)
plt.title('Humidity (%) vs Latitude (11/05/2020)')
plt.xlabel('Latitude')
plt.ylabel('Humidity (%)')
plt.grid(True)

# Save a copy of the figure, then render it
plt.savefig("lat_vs_humidity.png")
plt.show()



The scatter plot above shows every city in the data table; the DataFrame column names supply the data for each axis, and axis labels and a title have been added.
The results don't appear to show any correlation without further analysis of the data.

In [None]:
# Scatter plot of cloudiness against latitude, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.scatter(weather_clean['Latitude'], weather_clean['Cloudiness'], marker='o')

ax.set_title('Cloudiness (%) vs Latitude (11/05/2020)')
ax.set_xlabel('Latitude')
ax.set_ylabel('Cloudiness (%)')
ax.grid(True)

# Save a copy of the figure, then render it
fig.savefig('lat_vs_cloudiness.png')
plt.show()



The scatter plot above shows every city in the data table; the DataFrame column names supply the data for each axis, and axis labels and a title have been added.
The results don't appear to show any correlation without further analysis of the data.

In [None]:
# Scatter plot of wind speed against latitude, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.scatter(weather_clean['Latitude'], weather_clean['Wind Speed'], marker='o')

ax.set_title('Wind Speed (mph) vs Latitude (11/05/2020)')
ax.set_xlabel('Latitude')
ax.set_ylabel('Wind Speed (mph)')
ax.grid(True)

# Save a copy of the figure, then render it
fig.savefig('lat_vs_wind.png')
plt.show()


The scatter plot above shows every city in the data table; the DataFrame column names supply the data for each axis, and axis labels and a title have been added.
The results don't appear to show any correlation without further analysis of the data; there may be some outliers in this data that skew the results.

In [None]:
# Run a linear regression for each of the above conditions
# Northern hemisphere table: rows whose latitude is zero or greater
is_northern = weather_clean['Latitude'] >= 0
northern_hemi = weather_clean.loc[is_northern]
northern_hemi.dtypes

In [None]:
# northern_hemi['Latitude'] = pd.to_numeric(northern_hemi['Latitude'], downcast="integer")

In [None]:
# Southern hemisphere table: rows whose latitude is below zero
is_southern = weather_clean['Latitude'] < 0
southern_hemi = weather_clean.loc[is_southern]
southern_hemi.dtypes


In [None]:
# Linear regression of temperature on latitude, one figure per hemisphere.
# Each entry: (hemisphere table, plot title, position of the equation label, output image path)
hemisphere_plots = [
    (northern_hemi, 'Temperature (F) vs Latitude in the Northern Hemishpere', (0, 20), 'Homework_Output/temp_north.png'),
    (southern_hemi, 'Temperature (F) vs Latitude in the Southern Hemishpere', (-75, 35), 'Homework_Output/temp_south.png'),
]

for hemi_data, plot_title, eq_position, image_path in hemisphere_plots:
    # Define Data
    x_values = hemi_data['Latitude']
    y_values = hemi_data['Temperature']

    # Calculate linear regression first, so the fitted line and the equation
    # drawn below belong to this data set rather than to an earlier one
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

    # Create Scatter plot with the fitted line and its equation
    plt.ylabel('Temperature (F)')
    plt.xlabel('Latitude')
    plt.title(plot_title)
    plt.scatter(x_values,y_values)
    plt.plot(x_values,regress_values,"r-")
    plt.grid(True)
    plt.annotate(line_eq,eq_position,fontsize=15,color="red")

    # Print results for r-squared and p-value, then the fitted equation
    print(f"The r-squared is: {rvalue**2}")
    print(f'P-value: {pvalue}')
    print(line_eq)

    # Check correlation between the data sets
    correlation = st.pearsonr(x_values, y_values)
    print(f"The correlation between both factors is {round(correlation[0],2)}")

    # Save image copy before show(), which finishes the current figure
    plt.savefig(image_path)

    # Populate plots and display
    plt.show()


Splitting the data into Northern and Southern Hemispheres does little to change the plots. With r-squared values close to zero in both plots, it's hard to draw any solid conclusions from the data aside from a lack of correlation. Looking at the data points, you can see that all latitudes have about the same range of temperatures, which is surprising.

In [None]:
# Linear regression of humidity on latitude, one figure per hemisphere.
# Each entry: (hemisphere table, plot title, position of the equation label, output image path)
hemisphere_plots = [
    (northern_hemi, 'Humidity (%) vs Latitude in the Northern Hemishpere', (0, 50), 'Homework_Output/humid_north.png'),
    (southern_hemi, 'Humidity (%) vs Latitude in the Southern Hemishpere', (-50, 25), 'Homework_Output/humid_south.png'),
]

for hemi_data, plot_title, eq_position, image_path in hemisphere_plots:
    # Define Data
    x_values = hemi_data['Latitude']
    y_values = hemi_data['Humidity']

    # Calculate linear regression first, so the fitted line and the equation
    # drawn below belong to this data set rather than to an earlier one
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

    # Create Scatter plot with the fitted line and its equation
    plt.ylabel('Humidity (%)')
    plt.xlabel('Latitude')
    plt.title(plot_title)
    plt.scatter(x_values,y_values)
    plt.plot(x_values,regress_values,"r-")
    plt.annotate(line_eq,eq_position,fontsize=15,color="red")

    # Print results for r-squared and p-value, then the fitted equation
    print(f"The r-squared is: {rvalue**2}")
    print(f'P-value: {pvalue}')
    print(line_eq)

    # Displays correlation between the data sets
    correlation = st.pearsonr(x_values, y_values)
    print(f"The correlation between both factors is {round(correlation[0],2)}")

    # Save image copy before show(), which finishes the current figure
    plt.savefig(image_path)

    # Populate plots and display
    plt.show()

Splitting the data into Northern and Southern Hemispheres does little to change the plots. With r-squared values close to zero in both plots, it's hard to draw any solid conclusions from the data aside from a lack of correlation. Looking at the data points, you can see that all latitudes have about the same range of humidity values.

In [None]:
# Linear regression of cloudiness on latitude, one figure per hemisphere.
# Each entry: (hemisphere table, plot title, position of the equation label, output image path)
hemisphere_plots = [
    (northern_hemi, 'Cloudiness (%) vs Latitude in the Northern Hemishpere', (0, 25), 'Homework_Output/cloud_north.png'),
    (southern_hemi, 'Cloudiness (%) vs Latitude in the Southern Hemishpere', (-50, 25), 'Homework_Output/cloud_south.png'),
]

for hemi_data, plot_title, eq_position, image_path in hemisphere_plots:
    # Define Data
    x_values = hemi_data['Latitude']
    y_values = hemi_data['Cloudiness']

    # Calculate linear regression first, so the fitted line and the equation
    # drawn below belong to this data set rather than to an earlier one
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

    # Create Scatter plot with the fitted line and its equation
    plt.ylabel('Cloudiness (%)')
    plt.xlabel('Latitude')
    plt.title(plot_title)
    plt.scatter(x_values,y_values)
    plt.plot(x_values,regress_values,"r-")
    plt.annotate(line_eq,eq_position,fontsize=15,color="red")

    # Print results for r-squared and p-value, then the fitted equation
    print(f"The r-squared is: {rvalue**2}")
    print(f'P-value: {pvalue}')
    print(line_eq)

    # Displays correlation between the data sets
    correlation = st.pearsonr(x_values, y_values)
    print(f"The correlation between both factors is {round(correlation[0],2)}")

    # Save image copy before show(), which finishes the current figure
    plt.savefig(image_path)

    # Populate plots and display
    plt.show()


Splitting the data into Northern and Southern Hemispheres does little to change the plots. With r-squared values close to zero in both plots, it's hard to draw any solid conclusions from the data aside from a lack of correlation. Looking at the data points, you can see that all latitudes have about the same range of cloudiness values.

In [None]:
# Linear regression of wind speed on latitude, one figure per hemisphere.
# Each entry: (hemisphere table, plot title, position of the equation label, output image path)
# The southern figure gets its own file name so it no longer overwrites the
# southern humidity image.
hemisphere_plots = [
    (northern_hemi, 'Wind Speed (mph) vs Latitude in the Northern Hemishpere', (0, 25), 'Homework_Output/wind_north.png'),
    (southern_hemi, 'Wind Speed (mph) vs Latitude in the Southern Hemishpere', (-45, 20), 'Homework_Output/wind_south.png'),
]

for hemi_data, plot_title, eq_position, image_path in hemisphere_plots:
    # Define Data
    x_values = hemi_data['Latitude']
    y_values = hemi_data['Wind Speed']

    # Calculate linear regression first, so the fitted line and the equation
    # drawn below belong to this data set rather than to an earlier one
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

    # Create Scatter plot with the fitted line and its equation
    plt.ylabel('Wind Speed (mph)')
    plt.xlabel('Latitude')
    plt.title(plot_title)
    plt.scatter(x_values,y_values)
    plt.plot(x_values,regress_values,"r-")
    plt.annotate(line_eq,eq_position,fontsize=15,color="red")

    # Print results for r-squared and p-value, then the fitted equation
    print(f"The r-squared is: {rvalue**2}")
    print(f'P-value: {pvalue}')
    print(line_eq)

    # Displays correlation between the data sets
    correlation = st.pearsonr(x_values, y_values)
    print(f"The correlation between both factors is {round(correlation[0],2)}")

    # Save image copy before show(), which finishes the current figure
    plt.savefig(image_path)

    # Populate plots and display
    plt.show()

Splitting the data into Northern and Southern Hemispheres does little to change the plots. With r-squared values close to zero in both plots, it's hard to draw any solid conclusions from the data aside from a lack of correlation. Looking at the data points, you can see that all latitudes have about the same range of wind speed values.