# Target: Predict Max Temperature of next Day

In [1]:
# Pandas is used for data manipulation
import pandas as pd
import datetime as datetime
import numpy as np
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [4]:
# Read in data and display first 5 rows
features = pd.read_csv('../../wetterstation/data/messwerte_mythenquai_2019.csv')
features.timestamp_cet = pd.to_datetime(features.timestamp_cet, infer_datetime_format=True)
features.head(5)

Unnamed: 0,timestamp_cet,air_temperature,barometric_pressure_qfe,dew_point,global_radiation,humidity,precipitation,water_temperature,wind_direction,wind_force_avg_10min,wind_gust_max_10min,wind_speed_avg_10min,windchill
0,2019-01-07 23:10:00+00:00,3.0,981.0,0.1,0.0,81.0,0.0,6.2,273.0,2.0,5.3,2.6,0.0
1,2019-01-07 23:20:00+00:00,2.9,980.9,0.0,0.0,81.0,0.0,6.2,273.0,2.0,4.5,2.2,0.9
2,2019-01-07 23:30:00+00:00,2.8,980.7,0.1,0.0,82.0,0.0,6.2,307.0,2.0,6.8,3.1,-0.2
3,2019-01-07 23:40:00+00:00,2.8,980.4,0.2,0.0,83.0,0.0,6.2,290.0,2.0,3.3,1.9,1.2
4,2019-01-07 23:50:00+00:00,2.8,980.3,0.3,0.0,83.0,0.0,6.2,287.0,1.0,3.4,1.5,1.8


In [5]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44707 entries, 0 to 44706
Data columns (total 13 columns):
timestamp_cet              44707 non-null datetime64[ns, UTC]
air_temperature            44707 non-null float64
barometric_pressure_qfe    44707 non-null float64
dew_point                  44707 non-null float64
global_radiation           44707 non-null float64
humidity                   44707 non-null float64
precipitation              44707 non-null float64
water_temperature          44707 non-null float64
wind_direction             44707 non-null float64
wind_force_avg_10min       44707 non-null float64
wind_gust_max_10min        44707 non-null float64
wind_speed_avg_10min       44707 non-null float64
windchill                  44707 non-null float64
dtypes: datetime64[ns, UTC](1), float64(12)
memory usage: 4.4 MB


In [6]:
features_actual = features.groupby(by=features['timestamp_cet'].dt.date).max()
features_average = features.groupby(by=features['timestamp_cet'].dt.date).mean()

In [7]:
features_average['actual'] = features_actual['air_temperature']

In [8]:
features = features_average

In [9]:
print('The shape of our features is:', features.shape)

The shape of our features is: (312, 13)


In [10]:
features.describe()

Unnamed: 0,air_temperature,barometric_pressure_qfe,dew_point,global_radiation,humidity,precipitation,water_temperature,wind_direction,wind_force_avg_10min,wind_gust_max_10min,wind_speed_avg_10min,windchill,actual
count,312.0,312.0,312.0,312.0,312.0,312.0,312.0,312.0,312.0,312.0,312.0,312.0,312.0
mean,12.938776,816.483541,6.319553,128.279022,62.202635,21.511409,13.780309,296.449538,1.438941,5.375159,28.741081,21.100038,17.79359
std,7.235012,345.895131,5.893309,104.407656,28.516257,54.911271,6.661448,298.031301,0.815886,3.985609,66.38506,24.875628,8.499595
min,-1.438889,18.430556,-6.461111,0.0,0.0,0.0,4.600694,0.0,0.0,0.0,0.0,-2.964583,-0.2
25%,7.292708,958.023958,1.843403,37.024306,58.59375,0.0,6.547222,148.942708,1.151042,2.812686,1.609028,4.889757,11.3
50%,13.029514,967.259785,5.125,106.392361,71.53125,0.0,13.785069,184.121528,1.579861,3.927411,2.195833,12.236806,17.8
75%,18.491146,971.309444,11.091667,209.862847,81.110915,0.013889,18.967882,226.347222,1.953125,6.668403,3.189583,22.082465,23.95
max,28.496528,986.964583,20.468056,341.826389,96.576389,286.027778,26.888194,982.645139,3.131944,17.6125,290.166667,93.916667,37.4


In [11]:
plot = features.copy()
# Labels are the values we want to predict
labels = np.array(features['actual'])

In [12]:
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)

In [13]:
# Saving feature names for later use
feature_list = list(features.columns)
feature_list

['air_temperature',
 'barometric_pressure_qfe',
 'dew_point',
 'global_radiation',
 'humidity',
 'precipitation',
 'water_temperature',
 'wind_direction',
 'wind_force_avg_10min',
 'wind_gust_max_10min',
 'wind_speed_avg_10min',
 'windchill']

In [14]:
# Convert to numpy array
plot = features.copy()
features = np.array(features)
features

array([[ 2.86000000e+00,  9.80660000e+02,  1.40000000e-01, ...,
         4.66000000e+00,  2.26000000e+00,  7.40000000e-01],
       [ 3.08680556e+00,  9.73499306e+02,  1.21111111e+00, ...,
         1.01201389e+01,  4.15416667e+00, -1.16250000e+00],
       [ 1.67412587e+00,  9.67318182e+02, -2.68531469e-01, ...,
         5.41678322e+00,  2.59090909e+00, -8.67132867e-01],
       ...,
       [ 6.29305556e+00,  9.59462500e+02,  1.58750000e+00, ...,
         3.88263889e+00,  1.97430556e+00,  4.88194444e+00],
       [ 4.68601399e+00,  9.57355245e+02,  6.05594406e-01, ...,
         2.80699301e+00,  1.39300699e+00,  4.01188811e+00],
       [ 3.14736842e+00,  9.54140000e+02,  4.61052632e-01, ...,
         3.33789474e+00,  2.08210526e+00,  1.31684211e+00]])

In [15]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [16]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (234, 12)
Training Labels Shape: (234,)
Testing Features Shape: (78, 12)
Testing Labels Shape: (78,)


In [17]:
# The baseline predictions are the historical averages
baseline_preds = test_features[:, feature_list.index('air_temperature')]

In [18]:
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))

Average baseline error:  4.96


In [19]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [20]:
# Train the model on training data
rf.fit(train_features, train_labels);

In [21]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

In [22]:
# Calculate the absolute errors
errors = abs(predictions - test_labels)

In [23]:
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 1.28 degrees.


In [24]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 114.57 %.


In [25]:
print(predictions[-1:])

[14.3126]
