# TF - toyproject

## Project

### Repository
This project's source code and datasets can be found in our [GitLab](https://gitlab.com/carybe/fiot-tf) and [GitHub](https://github.com/carybe/fiot-tf) (mirror) repositories.

## Collect Data
(try to store as much data as possible to save API quota)

**As we had previously collected the data and stored it in IME's network,
this step isn't needed**

In [None]:
# Prerequisites:
# import requests

In [None]:
# API connections go here:

## Data Explore


In [None]:
# Prerequisites:
import pandas as pd
import numpy as np
# from google.colab import files

In [None]:
# Load dataframe from csv (fields are separated by ';').
# Exactly one of the two stations below should be active at a time.

# New York City Laguardia Airport Station Data:
df = pd.read_csv("../input/weather-underground-rain-predict/datasets/wu_ny.csv", delimiter=';')

# São Paulo International Airport Station Data (swap with the line above to use it):
# df = pd.read_csv("../input/weather-underground-rain-predict/datasets/wu_sp.csv", delimiter=';')

In [None]:
# First look at the summary statistics of the freshly loaded data
df.describe()

### Data Summary

In [None]:
# Column dtypes as inferred by read_csv
print(df.dtypes)

In [None]:
# Dump the raw dataframe for a visual sanity check
print(df)

#### Categorical

In [None]:
# Distinct values of the categorical 'Wind' column.
# 'N/A' entries in the csv are parsed as NaN by read_csv, so NaN shows up here.
df['Wind'].unique()

In [None]:
# Distinct values of the categorical 'Condition' column
df['Condition'].unique()

#### Numerical

In [None]:
# Summary statistics; by default describe() reports on the numeric columns only
df.describe()

#### Dataframe Viz

In [None]:
from matplotlib import pyplot as plt
def show_raw_visualization(data):
    """Plot every remaining column of ``data`` against its ``Time`` column.

    The categorical ``Condition`` and ``Wind`` columns are dropped first; each
    remaining column (``Time`` included) gets its own subplot in a two-column
    grid, with the ``Time`` values used as the x axis.

    Args:
        data: DataFrame containing at least the ``Time``, ``Condition`` and
            ``Wind`` columns. It is not modified.
    """
    # drop() returns a new frame, so the caller's dataframe stays untouched
    data = data.drop(['Condition', 'Wind'], axis=1)
    time_data = data['Time']
    features = list(data.keys())
    # Two plots per row, rounding up when the number of columns is odd
    nrows = (len(features) + 1) // 2
    # squeeze=False keeps `axes` two-dimensional even with a single row,
    # so the [row, col] indexing below works for any number of columns
    _, axes = plt.subplots(
        nrows=nrows,
        ncols=2,
        figsize=(10, 15),
        dpi=80,
        facecolor="w",
        edgecolor="k",
        squeeze=False,
    )
    colors = [
        "blue",
        "orange",
        "green",
        "red",
        "purple",
        "brown",
        "pink",
        "gray",
        "olive",
        "cyan",
    ]
    for i, key in enumerate(features):
        # Re-index a copy so the change never leaks back into `data`
        t_data = data[key].copy()
        t_data.index = time_data
        ax = t_data.plot(
            ax=axes[i // 2, i % 2],
            color=colors[i % len(colors)],
            title="{}".format(key),
            rot=25,
        )
        ax.legend([key])
    # Hide the spare subplot left over when the column count is odd
    for spare in range(len(features), nrows * 2):
        axes[spare // 2, spare % 2].set_visible(False)
    plt.tight_layout()

# Plot the raw data, before any cleaning
show_raw_visualization(df)

##### Correlation Heatmap

In [None]:
def show_heatmap(data):
    """Draw the pairwise correlation matrix of ``data`` as a labelled heatmap.

    Args:
        data: DataFrame whose columns are correlated with ``data.corr()``;
            the column names are used as tick labels on both axes.
    """
    label_size = 14
    tick_positions = range(data.shape[1])
    correlations = data.corr()

    plt.matshow(correlations)
    plt.xticks(tick_positions, data.columns, fontsize=label_size, rotation=90)
    # matshow puts the x ticks on top by default; move them below the matrix
    plt.gca().xaxis.tick_bottom()
    plt.yticks(tick_positions, data.columns, fontsize=label_size)

    colorbar = plt.colorbar()
    colorbar.ax.tick_params(labelsize=label_size)
    plt.title("Feature Correlation Heatmap", fontsize=label_size)
    plt.show()

# The categorical Condition and Wind columns are left out of the correlation
show_heatmap(df.drop(['Condition','Wind'], axis=1))

## Preprocess Data

### Data Clean

In [None]:
# Clean a deep copy so the raw `df` stays available for comparison
df1 = df.copy(deep=True)

In [None]:
# Drop every row that has at least one missing value in any column
df1 = df1.dropna()

In [None]:
# General data restriction
# Any numerical data that equals 0.0 is probably a
# default value for missing data at the server

# All plausibility bounds are combined into one row mask and applied at once
in_range = (
    (df1['Time'] > 0)
    & (df1['Temperature'] > -100)
    & (df1['Dew Point'] > -100)
    & (df1['Humidity'] >= 0)
    & (df1['Wind Speed'] >= 0)
    & (df1['Wind Gust'] >= 0)
    & (df1['Pressure'] >= 10)
    & (df1['Precip.'] >= 0)
)

# 'Wind' and 'Condition' need no bound here: their missing values were
# already purged at `.dropna()`
df1 = df1[in_range]

In [None]:
## SP specific restrictions:

# We got no freezing temps
# df1 = df1[df1['Temperature'] > 0]
# df1 = df1[df1['Dew Point'] > 0]

# Even with variable Atmospheric pressure, the altitude will keep it higher
# than some threshold (here 700hPa)
# df1 = df1[df1['Pressure'] > 700]


## NYC specific restrictions:

# Removes 0°F (default value for temp and dew point): readings inside the
# narrow band around -17.77777 are discarded
zero_f_low, zero_f_high = -17.77778, -17.77776
for column in ('Temperature', 'Dew Point'):
    readings = df1[column]
    df1 = df1.loc[(readings <= zero_f_low) | (readings >= zero_f_high)]

# Even with variable Atmospheric pressure, the altitude will keep it higher
# than some threshold (here 800hPa)
pressure_ok = df1['Pressure'] > 800
df1 = df1[pressure_ok]


In [None]:
# Share of the original rows that did not survive the cleaning steps
rows_kept = df1['Condition'].count()
rows_total = df['Condition'].count()
dropped_data_ratio = 1 - rows_kept / rows_total
print("Dropped datapoints:", dropped_data_ratio * 100, '%')

In [None]:
# Same plots as for the raw data, now for the cleaned dataframe
show_raw_visualization(df1)

### Data Grouping and Enhancing

In [None]:
# Translate from Epoch: 'Time' is interpreted as seconds since the Unix epoch
# and stored as datetimes in the new 'TimeAgg' column
df1['TimeAgg'] = pd.to_datetime(df1['Time'],unit='s')

In [None]:
# "has_rain definition"

# If it has precipitation -> it has rain
def yprecipitations(preciptation):
    """Tell whether a precipitation reading indicates rain.

    Args:
        preciptation: precipitation amount of one measurement.

    Returns:
        True when the amount is strictly positive, False otherwise.
    """
    is_raining = preciptation > 0
    return is_raining

# If it has "Drizzle", "Rain", "Shower" or "Storm" in its Condition -> it has rain
def yconditions(condition):
    """Classify a textual weather condition as rainy (1) or not (0).

    Args:
        condition: condition description string of one measurement.

    Returns:
        1 when the text contains "Drizzle", "Rain", "Shower" or "Storm"
        (case-sensitive), 0 otherwise.
    """
    import re
    rainy = re.match(".*(Drizzle|Rain|Shower|Storm).*", condition)
    return 1 if rainy else 0

# Share of rows that report a strictly positive precipitation amount
wet_rows = (df1["Precip."] > 0).sum()
precip_share = wet_rows / df1['Condition'].count()

# With more than 1% of the precipitation field filled, it is used as the label source
if precip_share > 0.01:
    target_field, threshold_func = 'Precip.', yprecipitations
    print("Using Preciptation feature")
# Otherwise, we need to inspect the conditions
else:
    target_field, threshold_func = 'Condition', yconditions
    print("Using Condition feature")

# Apply the chosen rule element-wise and store the result as 'has_rain'
labelfunc = np.vectorize(threshold_func)
rain_class = labelfunc(df1[[target_field]])
df1[['has_rain']] = rain_class

In [None]:
# Fraction of the measurements that are labelled as rainy
rainy_rows = (df1['has_rain'] == True).sum()
rain_ratio = rainy_rows / df1['has_rain'].count()
print('It rains %.2f%% of the time' %(100*rain_ratio))

In [None]:
# Index the frame by timestamp (the 'TimeAgg' column is consumed) so that it
# can be resampled by time afterwards
df1 = df1.set_index('TimeAgg', drop=True)
print(df1)

In [None]:
# Aggregate the measurements into 6-hour windows, computing ten statistics per
# feature. The statistics are passed as a *list* (not a set) so their order is
# fixed: pandas numbers the lambdas in list order, which makes the generated
# `<lambda_N>` column names deterministic from run to run.
df6hp = df1[['Temperature',
             'Humidity',
             'Pressure',
             'Precip.',
             'has_rain']].resample('6h').agg([
                 'min',
                 'max',
                 'mean',
                 lambda x: x.quantile(0.7),        # <lambda_0>: 70th percentile
                 lambda y: y.quantile(0.9),        # <lambda_1>: 90th percentile
                 lambda z: z.quantile(0.3),        # <lambda_2>: 30th percentile
                 lambda o: o.quantile(0.1),        # <lambda_3>: 10th percentile
                 lambda a: (a > a.shift()).sum(),  # <lambda_4>: number of rises
                 lambda b: (b < b.shift()).sum(),  # <lambda_5>: number of falls
                 lambda c: (c == c.shift()).sum(), # <lambda_6>: number of steady steps
             ])

# MIN, MAX, MEAN, 70PERCENTILE, 90PERCENTILE, 30PERCENTILE, 10PERCENTILE, RISE, FALL, STEADY

In [None]:
# Flatten the (feature, statistic) column MultiIndex into "feature_statistic"
# names. Iterating a MultiIndex already yields one tuple per column, so the
# deprecated `Index.ravel()` call is not needed.
df6hp.columns = ["_".join(column) for column in df6hp.columns]
# Selecting has_rain_max as target variable
df6hp['has_rain'] = df6hp['has_rain_max']
df6hp.dtypes

In [None]:
# Data enhancing
# Every aggregated statistic gets 8 lagged copies: suffix _N holds the value
# from N windows earlier (shifting by N equals shifting N times by one).
for feature in ['Pressure', 'Temperature', 'Humidity', 'Precip.']:
    # mean / min / max history
    for lag in range(1, 9):
        df6hp['%s_steady_%d' % (feature, lag)] = df6hp['%s_mean' % (feature)].shift(lag)
        df6hp['%s_min_steady_%d' % (feature, lag)] = df6hp['%s_min' % (feature)].shift(lag)
        df6hp['%s_max_steady_%d' % (feature, lag)] = df6hp['%s_max' % (feature)].shift(lag)
    # history of the seven "lambda" statistics
    for j in range(7):
        current = df6hp['%s_<lambda_%d>' % (feature, j)]
        for lag in range(1, 9):
            df6hp['%s_<lambda_%d>_%d' % (feature, j, lag)] = current.shift(lag)



In [None]:
# List all columns now present, including the newly added lagged ones
df6hp.keys()

In [None]:
# Snapshot taken before cleaning, used below to measure how many rows are lost
df6hp_old = df6hp.copy(deep=True)

# 2nd Data clean, removing NaNs generated from grouping
df6hp = df6hp.dropna()

# 'TimeAgg' is not a column at this point (it was dropped at df6hp
# definition), so there is nothing to remove for it here

# Features that have not been grouped are removed: for every feature the
# current-window statistics (mean, min, max and the seven lambdas) go away
stat_suffixes = ["mean", "min", "max"] + ["<lambda_%d>" % i for i in range(7)]
non_agg_feat_list = [
    "%s_%s" % (feature, suffix)
    for feature in ['Pressure', 'Temperature', 'Humidity', 'Precip.', 'has_rain']
    for suffix in stat_suffixes
]
df6hp.drop(columns=non_agg_feat_list, inplace=True)

rows_after = df6hp['has_rain'].count()
rows_before = df6hp_old['has_rain'].count()
dropped_data_ratio = 1 - rows_after / rows_before
print("Dropped datapoints:", dropped_data_ratio * 100, "%")

In [None]:
# Export the enhanced dataset; the index is written along as the first csv column
df6hp.to_csv(r'dfFinal.csv')
# files.download("dfFinal.csv")

## Model data input prepare

In [None]:
# Prerequisites
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
# from sklearn.externals import joblib
import pandas as pd

### Load Saved data

In [None]:
# Reload the exported dataset. The former index comes back as an ordinary
# 'TimeAgg' column; it is an unnecessary index for the model, so it is
# discarded right away.
df6hp = pd.read_csv("dfFinal.csv").drop(columns=['TimeAgg'])

In [None]:
# Splitting the data in train and test data
# NOTE(review): test_size=.65 leaves only 35% of the rows for training, and no
# random_state is passed, so the split differs on every run -- confirm both
# are intended.
main_data, test_data = train_test_split(df6hp, test_size=.65) 

In [None]:
# Split each of the train and test data into inputs (X) and outputs (y).
# The target is addressed by name rather than by position, so the split keeps
# working if the column order of the exported dataset ever changes.
target_column = 'has_rain'

X = main_data.drop(columns=[target_column]).values
y = main_data[target_column].values

X_test = test_data.drop(columns=[target_column]).values
y_test = test_data[target_column].values

In [None]:
# Standardize the features (zero mean, unit variance per column).
# The scaler is fitted on the training data only, and the very same statistics
# are then applied to the test data: both sets end up in the same feature
# space and nothing from the test set influences the preprocessing.
scaler = StandardScaler()
# Kept so that anything still referring to `scaler_test` keeps working; it is
# the scaler fitted on the training data, not a separately fitted one.
scaler_test = scaler

X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

## Train Models

In [None]:
# Prerequisites:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout 

### Definitions

In [None]:
# Number of input features, i.e. columns of the training matrix X
input_dimensions = X.shape[1] 
# Optimizer name handed to model.compile()
optimizer = 'rmsprop'
# Rate used by each Dropout layer
dropout = 0.05
# Name of the weight initializer meant for the Dense layers
kernel_init = 'uniform'
# Samples per gradient update, and number of passes over the training data
batch_size = 100
epochs = 100 

### Model construction

In [None]:
# from https://create.arduino.cc/projecthub/danionescu/diy-rain-prediction-using-arduino-python-and-keras-abdec6
# Two equally sized hidden ReLU layers, each followed by dropout, feeding a
# single sigmoid unit.
model = Sequential()
# Half as many hidden units as input features, but never fewer than one
inner_nodes = max(1, input_dimensions // 2)

# The initializer comes from the `kernel_init` hyperparameter instead of being
# hard-coded, so changing that definition actually takes effect here.
model.add(Dense(inner_nodes, kernel_initializer=kernel_init, activation='relu', input_dim=input_dimensions))
model.add(Dropout(rate=dropout))

model.add(Dense(inner_nodes, kernel_initializer=kernel_init, activation='relu'))
model.add(Dropout(rate=dropout))

model.add(Dense(1, kernel_initializer=kernel_init, activation='sigmoid'))
# NOTE(review): 'binary_crossentropy' is the usual loss for a sigmoid binary
# classifier -- confirm that mean absolute error is intended.
model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['accuracy'])

### Model Training

In [None]:
# Train on the scaled training split. The test split is passed as validation
# data, so the val_* metrics of each epoch are measured on the test rows.
model.fit(X,
          y,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test, y_test))

### Save Model

In [None]:
# save the trained model to 'Model1.h5' for later use
model.save('Model1.h5')
# files.download("Model1.h5")

## Evaluate Model

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded on the model's history object
training_log = model.history.history

plt.plot(training_log['accuracy'], 'r-')      # training accuracy, solid red
plt.plot(training_log['val_accuracy'], 'g:')  # validation accuracy, dotted green
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')

In [None]:
# Evaluate the model on the test data using `evaluate`.
# `results` holds the loss followed by the metrics given to compile() (accuracy).
print("Evaluate on test data")
results = model.evaluate(X_test, y_test, batch_size=128)
print("test loss, test acc:", results)

In [None]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`.
# The announced sample count is derived from the same number used for the
# slice, so the message always matches what is actually predicted.
sample_count = 20
print("Generate predictions for %d samples" % sample_count)
predictions = model.predict(X_test[:sample_count])
print("predictions shape:", predictions.shape)

In [None]:
# Predicted probabilities next to the true labels of the same first 20 test rows
print(predictions)
print(y_test[:20])

## References:


 - [DIY Rain Prediction Using Arduino, Python and Keras](https://create.arduino.cc/projecthub/danionescu/diy-rain-prediction-using-arduino-python-and-keras-abdec6)
 - [Keras Timeseries forecasting for weather prediction](https://keras.io/examples/timeseries/timeseries_weather_forecasting/)

 - [TensorFlow Keras Tutorial](https://www.tensorflow.org/tutorials/keras/classification)