# Target: Predict Max Temperature of next Day

In [1]:
# Pandas is used for data manipulation
import pandas as pd
import datetime as datetime
import numpy as np
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [2]:
# Read in data and display first 5 rows
features = pd.read_csv('../../../../messwerte_mythenquai_2019.csv')
features.timestamp_cet = pd.to_datetime(features.timestamp_cet, infer_datetime_format=True)
features.head(5)

Unnamed: 0,timestamp_cet,air_temperature,barometric_pressure_qfe,dew_point,global_radiation,humidity,precipitation,water_temperature,wind_direction,wind_force_avg_10min,wind_gust_max_10min,wind_speed_avg_10min,windchill
0,2019-01-07 23:10:00+00:00,3.0,981.0,0.1,0.0,81.0,0.0,6.2,273.0,2.0,5.3,2.6,0.0
1,2019-01-07 23:20:00+00:00,2.9,980.9,0.0,0.0,81.0,0.0,6.2,273.0,2.0,4.5,2.2,0.9
2,2019-01-07 23:30:00+00:00,2.8,980.7,0.1,0.0,82.0,0.0,6.2,307.0,2.0,6.8,3.1,-0.2
3,2019-01-07 23:40:00+00:00,2.8,980.4,0.2,0.0,83.0,0.0,6.2,290.0,2.0,3.3,1.9,1.2
4,2019-01-07 23:50:00+00:00,2.8,980.3,0.3,0.0,83.0,0.0,6.2,287.0,1.0,3.4,1.5,1.8


In [3]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31000 entries, 0 to 30999
Data columns (total 13 columns):
timestamp_cet              31000 non-null datetime64[ns, UTC]
air_temperature            31000 non-null float64
barometric_pressure_qfe    31000 non-null float64
dew_point                  31000 non-null float64
global_radiation           31000 non-null float64
humidity                   31000 non-null float64
precipitation              31000 non-null float64
water_temperature          31000 non-null float64
wind_direction             31000 non-null float64
wind_force_avg_10min       31000 non-null float64
wind_gust_max_10min        31000 non-null float64
wind_speed_avg_10min       31000 non-null float64
windchill                  31000 non-null float64
dtypes: datetime64[ns, UTC](1), float64(12)
memory usage: 3.1 MB


In [4]:
features_actual = features.groupby(by=features['timestamp_cet'].dt.date).max()
features_average = features.groupby(by=features['timestamp_cet'].dt.date).mean()

In [5]:
features_average['actual'] = features_actual['air_temperature']

In [6]:
features = features_average

In [7]:
print('The shape of our features is:', features.shape)

The shape of our features is: (217, 13)


In [8]:
features.describe()

Unnamed: 0,air_temperature,barometric_pressure_qfe,dew_point,global_radiation,humidity,precipitation,water_temperature,wind_direction,wind_force_avg_10min,wind_gust_max_10min,wind_speed_avg_10min,windchill,actual
count,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0
mean,12.146115,968.829543,6.619973,167.568965,71.570638,0.00448,12.651433,162.866276,1.709761,3.965718,2.116036,10.836246,17.035023
std,7.981956,7.702195,6.556612,96.971536,11.209503,0.014361,7.469339,56.189721,0.625392,2.182405,0.96995,8.684042,9.218353
min,-1.438889,943.516667,-6.461111,0.0,47.166667,0.0,4.600694,0.0,0.0,0.0,0.0,-2.964583,-0.2
25%,5.516667,965.459722,1.113889,86.326389,62.895833,0.0,5.593056,138.597222,1.409722,2.564583,1.502778,3.83125,10.1
50%,10.540972,968.763889,5.191667,158.8125,71.819444,0.0,11.072917,166.055556,1.708333,3.604167,1.999306,9.279167,16.0
75%,19.101389,973.125694,12.571528,256.104167,80.375,0.0,20.253472,196.701389,2.097222,5.077778,2.6375,18.647917,23.8
max,28.496528,986.964583,20.468056,341.826389,96.576389,0.104167,26.888194,289.643357,3.131944,12.585417,4.811806,29.218056,37.4


In [9]:
plot = features.copy()
# Labels are the values we want to predict
labels = np.array(features['actual'])

In [10]:
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)

In [11]:
# Saving feature names for later use
feature_list = list(features.columns)
feature_list

['air_temperature',
 'barometric_pressure_qfe',
 'dew_point',
 'global_radiation',
 'humidity',
 'precipitation',
 'water_temperature',
 'wind_direction',
 'wind_force_avg_10min',
 'wind_gust_max_10min',
 'wind_speed_avg_10min',
 'windchill']

In [12]:
# Convert to numpy array
plot = features.copy()
features = np.array(features)
features

array([[ 2.86000000e+00,  9.80660000e+02,  1.40000000e-01, ...,
         4.66000000e+00,  2.26000000e+00,  7.40000000e-01],
       [ 3.08680556e+00,  9.73499306e+02,  1.21111111e+00, ...,
         1.01201389e+01,  4.15416667e+00, -1.16250000e+00],
       [ 1.67412587e+00,  9.67318182e+02, -2.68531469e-01, ...,
         5.41678322e+00,  2.59090909e+00, -8.67132867e-01],
       ...,
       [ 2.47798611e+01,  9.65002083e+02,  1.84902778e+01, ...,
         2.49930556e+00,  1.41875000e+00,  2.51875000e+01],
       [ 2.02618056e+01,  9.70615278e+02,  1.76652778e+01, ...,
         2.22430556e+00,  1.12222222e+00,  2.04326389e+01],
       [ 1.67000000e+01,  9.71405263e+02,  1.48526316e+01, ...,
         7.86842105e-01,  3.92105263e-01,  1.67131579e+01]])

In [13]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [14]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (162, 12)
Training Labels Shape: (162,)
Testing Features Shape: (55, 12)
Testing Labels Shape: (55,)


In [15]:
# The baseline predictions are the historical averages
baseline_preds = test_features[:, feature_list.index('air_temperature')]

In [16]:
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))

Average baseline error:  4.8


In [17]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [18]:
# Train the model on training data
rf.fit(train_features, train_labels);

In [19]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

In [20]:
# Calculate the absolute errors
errors = abs(predictions - test_labels)

In [21]:
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 1.47 degrees.


In [22]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 100.25 %.


In [23]:
print(predictions[-1:])

[13.6143]
