In [None]:
import calendar

link_format = "http://www.wunderground.com/history/airport/KNYC/{}/{}/{}/DailyHistory.html"


def build_links(first_year, last_year):
    """Return one daily-history URL per calendar day in the given year range.

    Args:
        first_year: first year to include.
        last_year: last year to include (inclusive).

    Returns:
        List of URLs ordered by year, then month, then day.
    """
    # calendar.monthrange(year, month)[1] is the number of days in that month,
    # so 29 February is generated for leap years instead of being dropped by a
    # fixed 28-day table.
    return [link_format.format(year, month, day)
            for year in range(first_year, last_year + 1)
            for month in range(1, 13)     # 1 - 12 inclusive
            for day in range(1, calendar.monthrange(year, month)[1] + 1)]


links = build_links(2013, 2015)  # 2013 - 2015 inclusive

print(len(links))
print("\n".join(links[:2]))

1095
http://www.wunderground.com/history/airport/KNYC/2013/1/1/DailyHistory.html
http://www.wunderground.com/history/airport/KNYC/2013/1/2/DailyHistory.html


In [None]:
import requests
import os.path

def download_file(link, name):
    """Download `link` and save the response text to the file `name`.

    Does nothing when `name` already exists, so an interrupted run can be
    resumed without re-fetching pages.

    Args:
        link: URL to fetch.
        name: path of the file to create.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    if os.path.isfile(name):
        return
    # Fetch before touching the filesystem: if the request fails, no empty
    # placeholder is left behind to be mistaken for a finished download by the
    # isfile() check on the next run.
    r = requests.get(link, timeout=30)
    # Refuse to cache an HTTP error page as if it were a real page.
    r.raise_for_status()
    with open(name, 'w') as file:
        file.write(r.text)
# Fetch every page, storing the page at position `index` as "<index>.html".
for index, url in enumerate(links):
    # Print a progress line every 50 pages (index 0 included).
    if not index % 50:
        print("Done with %d.." % index)
    download_file(url, "%d.html" % index)

Done with 0..
Done with 50..
Done with 100..
Done with 150..
Done with 200..
Done with 250..
Done with 300..
Done with 350..
Done with 400..


In [None]:
from bs4 import BeautifulSoup

# Read the first cached page and parse it for interactive exploration.
with open("0.html") as page:
    html = page.read()
soup = BeautifulSoup(html, "html.parser")

In [None]:
# Peek at the last five <a> tags of the page, last one first.
all_as = soup.find_all('a')
# Slice instead of indexing with -i for i = 0..4: -0 is plain index 0, which
# would show the FIRST anchor followed by only the last four. Slicing also
# copes with pages that contain fewer than five anchors.
for anchor in reversed(all_as[-5:]):
    print(anchor)
    print()

In [None]:
# Locate the element whose id attribute is "historyTable".
main_table = soup.find(attrs={"id": "historyTable"})

In [None]:
# Collect every <tr> in the table, report the count, then show the first three
# rows, each followed by a blank line.
rows = main_table.find_all('tr')
print(len(rows))
for position in (0, 1, 2):
    print(rows[position], end="\n\n")

In [None]:
# Take the third table row apart: show each <td>, then print the text of the
# first two cells as "name : value".
row = rows[2]
cells = row.find_all('td')
for cell in cells:
    print(cell, end="\n\n")
row_name = cells[0].text.strip()  # strip() drops the surrounding whitespace
row_value = cells[1].text.strip()
print(row_name, ":", row_value)


In [None]:
for row in rows:
    cells = row.find_all('td')
    # Heading rows etc. do not have exactly 4 cells; skip them.
    if len(cells) != 4:
        continue
    row_name = cells[0].text.strip()
    row_value = cells[1].text.strip()
    print(row_name, ":", row_value)

In [None]:
# Row labels to pull out of each page's history table.
fields = [
    'Mean Temperature',
    'Max Temperature',
    'Min Temperature',
    'Dew Point',
    'Average Humidity',
    'Maximum Humidity',
    'Minimum Humidity',
    'Precipitation',
    'Wind Speed',
    'Max Wind Speed',
    'Max Gust Speed',
]
def scrape_file(name):
    """Extract the wanted weather fields from a saved history page.

    Args:
        name: path of an HTML file containing an element with id
            "historyTable".

    Returns:
        dict mapping each label from `fields` that appears in a 4-cell table
        row to the first whitespace-separated token of that row's second cell
        (this drops any trailing units). A row whose second cell is blank maps
        to "".

    Raises:
        ValueError: if the page has no element with id "historyTable".
    """
    with open(name) as fin:
        soup = BeautifulSoup(fin.read(), "html.parser")
    table = soup.find(id="historyTable")
    if table is None:
        # Fail with the offending file name rather than an AttributeError on None.
        raise ValueError("no historyTable element found in %s" % name)
    data = {}
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) != 4:
            continue
        # Separate variable so the `name` argument is not overwritten.
        label = cells[0].text.strip()
        if label in fields:
            tokens = cells[1].text.split()   # Split to remove units
            # A blank cell yields no tokens; store "" instead of raising IndexError.
            data[label] = tokens[0] if tokens else ""
    return data
# Run the scraper on the first cached page; as the last expression of the cell,
# the returned dict is shown as the cell output.
scrape_file("0.html")

In [None]:
import csv

csv_fields = ["Month", "Day", "Year"] + fields

# newline="" is what the csv module asks for: the writer emits its own "\r\n"
# row terminators, and without it they are translated a second time on
# Windows, which shows up as a blank line after every row.
with open("weather_data.csv", "w", newline="") as fout:
    writer = csv.DictWriter(fout, csv_fields)
    writer.writeheader()

    # Page i on disk ("<i>.html") was downloaded from links[i].
    for i, link in enumerate(links):
        record = scrape_file("{}.html".format(i))
        # Each link ends in .../<year>/<month>/<day>/DailyHistory.html
        year, month, day = link.split("/")[-4:-1]
        record["Month"] = int(month)
        record["Year"] = int(year)
        record["Day"] = int(day)

        writer.writerow(record)

In [None]:
import pandas as pd

# Load the CSV file weather_data.csv into a DataFrame.
data = pd.read_csv('weather_data.csv')

In [None]:
# Limit DataFrame displays to 7 rows, then show the frame.
pd.set_option("display.max_rows", 7)
data

In [None]:
def _snake_case(label):
    """Lower-case `label` and replace each space with an underscore."""
    return label.lower().replace(" ", "_")


# Rewrite every column label, e.g. "Dew Point" -> "dew_point".
data.columns = list(map(_snake_case, data.columns))

In [None]:
# Select a single column by label; the result is shown as the cell output.
data["dew_point"]

In [None]:
# Print the column mean, then display the column multiplied element-wise by 5.
dew_point = data["dew_point"]
print(dew_point.mean())
dew_point * 5

In [None]:
# Print the number of rows, then the number of columns, then display the labels.
row_count, column_count = data.shape
print(row_count)
print(column_count)
data.columns

In [None]:
# NOTE: only the last expression of a cell is displayed; the bare expressions
# in the middle of this cell (data.shape, may_2015_data[:5]) are evaluated but
# not shown.
data.shape
# Boolean-mask selection: rows where month is 5 and year is 2015.
may_2015_data = data[(data.month == 5) & (data.year == 2015)]
may_2015_data[:5]
# DataFrame.convert_objects was deprecated and is gone from current pandas.
# pd.to_numeric with errors="coerce" is its replacement: each column is parsed
# as numbers and values that cannot be parsed become NaN.
clean_data = data.apply(pd.to_numeric, errors="coerce")
print(clean_data.dtypes)
clean_data.precipitation.unique()



In [None]:
##Implementing KNN Algorithm

import pandas as pd

# NOTE(review): no cell in this notebook writes clean_weather_data.csv; confirm
# where the file comes from before relying on Restart & Run All.
data = pd.read_csv('clean_weather_data.csv')
print(data.dtypes)
data[:5]

# sklearn.cross_validation no longer exists in current scikit-learn; its
# helpers live in sklearn.model_selection. Fall back to the old module so that
# legacy installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE(review): later cells read lower-case columns from this frame
# (data.mean_temperature, data.dew_point); confirm that the CSV really has a
# capitalised 'Precipitation' column.
X = data.drop('Precipitation', axis=1)
y = data.Precipitation
# Hold out 20% of the rows for testing; the fixed seed makes the split the same
# on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.neighbors import KNeighborsRegressor

# Predict precipitation from the 3 nearest training rows.
knn_model = KNeighborsRegressor(n_neighbors=3)
knn_model.fit(X_train, y_train)
Out[3]:

    
knn_y = knn_model.predict(X_test)

# Sum of squared errors for the KNN predictions and for a baseline that always
# predicts the mean of y_test.
knn_residuals = knn_y - y_test
baseline_residuals = y_test.mean() - y_test
knn_sum_squares = (knn_residuals ** 2).sum()
mean_sum_squares = (baseline_residuals ** 2).sum()

print("Average loss:", mean_sum_squares)
print("KNN loss:", knn_sum_squares)
explained_pct = 100 * (1 - knn_sum_squares / mean_sum_squares)
print("Variation explained: ", explained_pct, "%", sep="")

print("R^2: ", knn_model.score(X_test, y_test))



In [None]:
##Modelling Linear Regression

from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split (fit() returns the estimator
# itself), then print its score on the held-out split.
linear_model = LinearRegression().fit(X_train, y_train)

print(linear_model.score(X_test, y_test))

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

def plot_predictions(model, title):
    """Scatter `model`'s predictions for X_test against the true y_test values.

    Args:
        model: fitted estimator exposing predict().
        title: text shown above the plot.
    """
    # Call matplotlib's pyplot directly: recent seaborn releases no longer
    # expose it as `sns.plt`.
    plt.scatter(y_test, model.predict(X_test))
    plt.title(title)
    plt.xlabel('Real Value')
    plt.ylabel('Prediction')
    plt.show()


plot_predictions(knn_model, 'KNN Prediction Analysis')
plot_predictions(linear_model, 'Linear Regression Prediction Analysis')



In [None]:
##Visualizing using Bokeh

import bokeh.io, bokeh.plotting, bokeh.models
# Send Bokeh output to the notebook.
bokeh.io.output_notebook()

# Minimal scatter plot: mean temperature on x, dew point on y.
p = bokeh.plotting.figure()
p.circle(x=data.mean_temperature, y=data.dew_point)
bokeh.plotting.show(p)

# Same scatter with a title and axis labels. The title is passed to figure()
# instead of being assigned to q.title afterwards: the constructor keyword is
# accepted across Bokeh versions, whereas releases in which `title` is a Title
# model reject a plain-string assignment.
q = bokeh.plotting.figure(title='My Interactive Visualization')
q.circle(data.mean_temperature, data.dew_point)
q.xaxis.axis_label = 'Average Temperature'
q.yaxis.axis_label = 'Dew Point'
bokeh.plotting.show(q)

# np.histogram and np.linspace are used below, but numpy was never imported in
# this notebook; without this line the cell stops with a NameError.
import numpy as np

# Histogram of mean temperature: np.histogram returns the per-bin counts and
# the bin edges (one more edge than bins); each bar is drawn as a quad
# spanning two consecutive edges.
hist_plot = bokeh.plotting.figure(title = 'Histogram of Average Temperature')
hist, edges = np.histogram(data.mean_temperature, bins = 25)
hist_plot.quad(top = hist, bottom = 0, left = edges[:-1], right = edges[1:], line_color = "#000000")
hist_plot.xaxis.axis_label = "Average Temperature"
hist_plot.yaxis.axis_label = "Frequency"
bokeh.plotting.show(hist_plot)


# Evenly spaced values across the observed mean-temperature range, used to
# draw the y = x reference line on both tabbed plots.
line_data = np.linspace(data.mean_temperature.min(), data.mean_temperature.max())


def _temperature_panel(y_values, title):
    """Build a scatter of `y_values` against mean temperature, wrapped in a tab.

    Returns the figure together with the Panel that holds it.
    """
    fig = bokeh.plotting.figure()
    fig.circle(data.mean_temperature, y_values)
    fig.line(line_data, line_data)
    return fig, bokeh.models.Panel(child = fig, title = title)


first, tab1 = _temperature_panel(data.max_temperature, 'First Plot')
second, tab2 = _temperature_panel(data.min_temperature, 'Second Plot')

tabs = bokeh.models.Tabs(tabs = [tab1, tab2])
bokeh.plotting.show(tabs)


start_value = 5
# One marker size per data row, all starting at the slider's initial value.
point_size = [start_value] * len(data)

source = bokeh.models.ColumnDataSource(
    data = dict(x = data.mean_temperature,
                y = data.dew_point,
                s = point_size))

plot = bokeh.plotting.figure()
plot.circle('x', 'y', size = 's', source = source)

# JavaScript handler for the slider: it writes the slider's value into every
# entry of the 's' column and then triggers 'change' on the source.
# NOTE(review): source.get(...)/source.trigger(...), Slider(callback=...) and
# bokeh.io.vform look tied to an older Bokeh release -- confirm the version
# this notebook is meant to run against.
resize_js = """
        var data = source.get('data');
        var size = cb_obj.get('value');
        x = data['x'];
        y = data['y'];
        s = data['s'];
        for (i = 0; i < x.length; i++) {
            s[i] = size;
        }
        source.trigger('change');
    """
callback = bokeh.models.CustomJS(args = dict(source = source), code = resize_js)

size = bokeh.models.Slider(start = 1, end = 10, value = start_value, step = .05,
                           title = "Size", callback = callback)

# Slider stacked above the plot.
layout = bokeh.io.vform(size, plot)

bokeh.plotting.show(layout)



In [None]:
##Display data for a point while hovering over it

from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool

# Columns available to the plot and to the tooltip placeholders.
tooltip_source = ColumnDataSource(data = dict(
    x = data.mean_temperature,
    y = data.dew_point,
    month = data.month,
    day = data.day,
    year = data.year,
))

# Each tooltip row is (label, template); "@name" refers to the column `name`
# of the data source.
hover = HoverTool(tooltips = [
    ("Date", "@month-@day-@year"),
    ("Average Temperature", "@x"),
    ("Dew Point", "@y"),
])

fig = figure(tools = [hover], title = "Hover over the dots!")
fig.circle('x', 'y', source = tooltip_source)

show(fig)