# Welcome to Jupyter!

In [1]:
import sys
import pandas as pd
import numpy as np
import datetime
# Install vega_datasets by running pip through the interpreter that backs this
# kernel (sys.executable), so the package is installed for that same interpreter.
!{sys.executable} -m pip install vega_datasets



# Let's get started with our data set.

Data cleaning and formatting are a huge part of data science. To keep things easy, I have already prepared an example, but always remember to look at your data before you start any project!

In [2]:
from vega_datasets import data

# Load the Seattle weather sample data set into a DataFrame.
Weather = pd.DataFrame(data.seattle_weather())

# 'average' is the midpoint of the daily max and min temperature plus a fixed
# offset of 3; a later cell uses this column as the baseline prediction.
# NOTE(review): the reason for the +3 offset is not documented -- confirm.
Weather['average'] = Weather[['temp_max', 'temp_min']].mean(axis=1) + 3

# Simulate an 'actual' temperature per day: a random integer drawn between that
# day's temp_min and temp_max + 1 (upper bound exclusive). The generator is
# seeded so the labels, and every number derived from them, are the same on
# each run. Columns are selected by name rather than by position, and the
# imported `data` loader is no longer rebound to a DataFrame.
rng = np.random.RandomState(42)
Weather['actual'] = rng.randint(Weather['temp_min'], Weather['temp_max'] + 1)

# The date column is not used as a model feature, so drop it.
Final_Weather = Weather.drop(['date'], axis=1)
Final_Weather.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather,average,actual
0,0.0,12.8,5.0,4.7,drizzle,11.9,9
1,10.9,10.6,2.8,4.5,rain,9.7,3
2,0.8,11.7,7.2,2.3,rain,12.45,8
3,20.3,12.2,5.6,4.7,rain,11.9,10
4,1.3,8.9,2.8,6.1,rain,8.85,3


Oh no! We have character fields, and computers can't use those for calculation. One-hot encoding to the rescue.

In [3]:
# One-hot encode the string-valued columns so that every feature is numeric.
# dtype is pinned to uint8 (the default before pandas 2.0): newer pandas emits
# bool indicator columns by default, which would no longer display as 0/1 and
# would turn the mixed float/bool frame into an object array once it is
# converted with np.array further down.
features = pd.get_dummies(Final_Weather, dtype=np.uint8)
features.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind,average,actual,weather_drizzle,weather_fog,weather_rain,weather_snow,weather_sun
0,0.0,12.8,5.0,4.7,11.9,9,1,0,0,0,0
1,10.9,10.6,2.8,4.5,9.7,3,0,0,1,0,0
2,0.8,11.7,7.2,2.3,12.45,8,0,0,1,0,0
3,20.3,12.2,5.6,4.7,11.9,10,0,0,1,0,0
4,1.3,8.9,2.8,6.1,8.85,3,0,0,1,0,0


In [4]:
# NumPy handles the DataFrame -> array conversions below
import numpy as np

# Target vector: the column the model is asked to predict
labels = np.array(features.loc[:, 'actual'])

# Everything except the target column stays as a model input
features = features.drop(columns=['actual'])

# Remember the column order so a feature can be looked up by name later
feature_list = features.columns.tolist()

# Hand the model a plain NumPy array instead of a DataFrame
features = np.array(features)

In [5]:
# Scikit-learn helper for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 45% of the rows for testing; the fixed random_state keeps the
# split identical from run to run
(train_features,
 test_features,
 train_labels,
 test_labels) = train_test_split(
    features,
    labels,
    test_size=0.45,
    random_state=333,
)

In [6]:
# Report the dimensions of each split, one line per array
for description, split in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(description, split.shape)

Training Features Shape: (803, 10)
Training Labels Shape: (803,)
Testing Features Shape: (658, 10)
Testing Labels Shape: (658,)


In [7]:
# Baseline prediction: simply read off the 'average' feature column
average_column = feature_list.index('average')
baseline_preds = test_features[:, average_column]
# Absolute gap between the baseline and the true labels, then its mean
baseline_errors = np.abs(baseline_preds - test_labels)
mean_baseline_error = np.mean(baseline_errors)
print('Average baseline error: ', round(mean_baseline_error, 2), 'degrees.')

Average baseline error:  3.84 degrees.


In [8]:
# Random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# 1000 decision trees; random_state is fixed so training is repeatable
forest_settings = {'n_estimators': 1000, 'random_state': 18}
rf = RandomForestRegressor(**forest_settings)

# Fit on the training split (the trailing semicolon hides the estimator repr)
rf.fit(train_features, train_labels);

In [9]:
# Ask the trained forest for predictions on the held-out test features
predictions = rf.predict(test_features)
# Per-sample absolute error against the true labels
errors = np.abs(predictions - test_labels)
# Mean absolute error (MAE), rounded to two decimals for display
mae = np.mean(errors)
print('Mean Absolute Error:', round(mae, 2), 'degrees.')

Mean Absolute Error: 2.54 degrees.
