In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Read the two station files (Belém and Curitiba) from the ../input dataset folder.
belem_csv = "../input/temperature-timeseries-for-some-brazilian-cities/station_belem.csv"
curitiba_csv = "../input/temperature-timeseries-for-some-brazilian-cities/station_curitiba.csv"
df_belem = pd.read_csv(belem_csv)
df_curitiba = pd.read_csv(curitiba_csv)

In [None]:
# Peek at the first two rows of the Belém frame.
df_belem.head(2)

In [None]:
# Peek at the first two rows of the Curitiba frame.
df_curitiba.head(2)

In [None]:
# Column dtypes and non-null counts for Belém.
df_belem.info()

In [None]:
# Column dtypes and non-null counts for Curitiba.
df_curitiba.info()

In [None]:
# Summary statistics for Belém. This runs before 999.90 is replaced with NaN
# (done in a later cell), so any such values still count towards these numbers.
df_belem.describe()

In [None]:
# Summary statistics for Curitiba. This runs before 999.90 is replaced with NaN
# (done in a later cell), so any such values still count towards these numbers.
df_curitiba.describe()

In [None]:
# One histogram per numeric column of the Belém frame (still the raw values at this point).
df_belem.hist()

In [None]:
# One histogram per numeric column of the Curitiba frame (still the raw values at this point).
df_curitiba.hist()

In [None]:
# 999.90 looks like the dataset's placeholder for a missing reading (TODO confirm
# against the data description). Turn every occurrence, in any column, into NaN
# so that it can be imputed afterwards. Both frames are modified in place.
for station_df in (df_belem, df_curitiba):
    station_df.replace(999.90, np.nan, inplace=True)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Imputer that fills missing values (NaN) with the median of each column it is fitted on.
imputer = SimpleImputer(strategy='median')

In [None]:
# Median-impute the twelve monthly columns of each city. fit_transform re-fits the
# imputer on every call, so each city is filled with its own per-month medians.
# Only the columns listed here are imputed; the results are written back to the
# frames in the next cell.
month_cols = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
num_cols_median_belem = imputer.fit_transform(df_belem[month_cols])
num_cols_median_curitiba = imputer.fit_transform(df_curitiba[month_cols])

In [None]:
# Overwrite the monthly columns of each frame with their median-imputed values.
month_cols = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
for station_df, filled_months in (
    (df_belem, num_cols_median_belem),
    (df_curitiba, num_cols_median_curitiba),
):
    station_df[month_cols] = filled_months

In [None]:
# Independent working copies, so the column drops below leave the full frames intact.
df_belem_aux, df_curitiba_aux = df_belem.copy(), df_curitiba.copy()


In [None]:
# Remove every monthly column except JUL, together with the D-J-F / M-A-M / J-J-A /
# S-O-N / metANN columns. JUL is deliberately absent from this list: the July series
# (plotted against YEAR below) is what the auxiliary frames are kept for.
not_july = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC','D-J-F','M-A-M','J-J-A','S-O-N','metANN']
df_belem_aux = df_belem_aux.drop(not_july, axis=1)
df_curitiba_aux = df_curitiba_aux.drop(not_july, axis=1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# July values over the years for both cities, drawn on one shared axes:
# Belém in red first, then Curitiba in blue (the legend entries follow that order).
ax = plt.gca()
for city_df, line_color in ((df_belem_aux, 'red'), (df_curitiba_aux, 'blue')):
    city_df.plot(x='YEAR', y='JUL', kind='line', color=line_color, ax=ax)
ax.legend(["Belém - Julho", "Curitiba - Julho"])
plt.show()

In [None]:
import scipy.stats as stats

In [None]:
# Compare the July series of the two cities statistically: show the summary
# statistics of each (Curitiba first, then Belém), then run a one-way ANOVA on
# the two samples. The ANOVA result is the cell's output.
for july_series in (df_curitiba['JUL'], df_belem['JUL']):
    display(july_series.describe())
stats.f_oneway(df_belem['JUL'], df_curitiba['JUL'])

In [None]:
# Two-column frame (YEAR, JAN) taken from Curitiba; it is the series used by the
# forecasting cells that follow.
# (The note originally placed here, "comparing the two temperatures statistically",
# describes the ANOVA in the previous cell rather than this one.)
frame = dict(YEAR=df_curitiba['YEAR'], JAN=df_curitiba['JAN'])
df_prev = pd.DataFrame(data=frame)

In [None]:
# First five rows of the Curitiba YEAR/JAN frame.
df_prev.head(5)

In [None]:
import math
from sklearn import preprocessing, model_selection
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [None]:
# Work on a copy so the lag columns added in the next cell do not end up in df_prev,
# which is reused later on.
df_prev_regressor = df_prev.copy()

In [None]:
# Lag features: A1, A2 and A3 hold the JAN value of the 1st, 2nd and 3rd preceding
# row. The first three rows have no complete history, so dropna() removes them.
jan_values = df_prev_regressor['JAN']
df_prev_regressor = df_prev_regressor.assign(
    A1=jan_values.shift(1),
    A2=jan_values.shift(2),
    A3=jan_values.shift(3),
).dropna()

display(df_prev_regressor.head(5))

# Features are everything except the target (so YEAR, A1, A2, A3); the target is JAN.
# NOTE(review): train_test_split shuffles rows by default, so test years are
# interleaved with training years; for a time series a chronological split may be
# more appropriate - confirm this is intended.
features = df_prev_regressor.drop(columns=['JAN'])
target = df_prev_regressor['JAN']
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features, target, test_size=0.25, random_state=33
)

In [None]:
# Gradient-boosted regressor with default hyper-parameters, fitted on the training
# split and then used to predict both splits.
# Note: the name `model` is rebound to the Keras network further down the notebook.
model = xgb.XGBRegressor()
model.fit(X_train, y_train)
p_train, p_test = (model.predict(X=split) for split in (X_train, X_test))

In [None]:
# Root-mean-squared error on both splits. Predictions are passed first and the true
# values second; the squared error is symmetric, so the result is unaffected.
train_mse = mean_squared_error(p_train, y_train)
trainScore = math.sqrt(train_mse)
print('Pontuação para o treinamento: %.2f RMSE' % trainScore)
test_mse = mean_squared_error(p_test, y_test)
testScore = math.sqrt(test_mse)
print('Pontuação para o teste: %.2f RMSE' % testScore)

In [None]:
# Table and scatter plot of predicted vs. real January values on the test split.
#
# Fixes compared with the previous version of this cell:
#   * YEAR is now taken from the YEAR column of X_test. X_test.index only holds the
#     row positions of the source frame, so the axis labelled "Ano" was not showing
#     years.
#   * A single figure is created with the intended size. Previously
#     plt.figure(figsize=(10,10)) was immediately superseded by a second, default-sized
#     plt.figure(), leaving an empty extra figure behind.
df_plot = pd.DataFrame({
    'YEAR': X_test['YEAR'].to_numpy(),
    'PRED': p_test,
    'REAL': y_test.to_numpy(),
})
display(df_plot.sort_values(['YEAR']).set_index('YEAR'))

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df_plot['YEAR'], df_plot['PRED'], color='r')
ax.scatter(df_plot['YEAR'], df_plot['REAL'], color='b')
ax.set_xlabel('Ano')
ax.set_ylabel('Temperatura (ºC)')
# Legend entries follow the order of the scatter calls above: predicted, then real.
ax.legend(["Curitiba - Janeiro - Previsto", "Curitiba - Janeiro - Real"])
ax.set_title('scatter plot')
plt.show()

In [None]:
# Reminder of what the Curitiba YEAR/JAN frame looks like before the LSTM section.
df_prev.head()

In [None]:
# Curitiba's JAN column as a plain 1-D NumPy array; despite the "df_" prefix this is
# not a DataFrame. It is the input series of the LSTM pipeline below.
df_prev_test=df_prev["JAN"].values

In [None]:
# Show the raw January array.
df_prev_test

In [None]:
# Normalise the data: rescale the January series to the [0, 1] range.
# The scaler expects a 2-D input, hence the reshape to a single column.
# NOTE(review): the scaler is fitted on the whole series, i.e. before the train/test
# split made in a later cell, so the test segment influences the scaling - confirm
# whether that is acceptable here.
from sklearn.preprocessing import MinMaxScaler
jan_column = df_prev_test.reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0, 1))
serie_prev = scaler.fit_transform(jan_column)

In [None]:
# Chronological split: the first 70% of the scaled series is used for training,
# the remaining rows for testing.
split_at = int(len(serie_prev) * 0.7)
train, test = serie_prev[:split_at], serie_prev[split_at:]

In [None]:
# Number of rows in the training and test segments of the scaled series.
print(len(train), len(test))

In [None]:
def create_dataset(dataset, look_back=1, drop_last=True):
    """Turn a series into supervised (window, next value) samples.

    Sample ``i`` uses ``dataset[i:i + look_back, 0]`` as input and
    ``dataset[i + look_back, 0]`` as target.

    Parameters
    ----------
    dataset : 2-D array
        Series stored in column 0 (indexed as ``dataset[rows, 0]``).
    look_back : int, default 1
        Number of consecutive values in each input window.
    drop_last : bool, default True
        With ``True`` (the historical behaviour of this function, which later cells
        of the notebook rely on for their plot offsets) the final valid window is
        left out, giving ``len(dataset) - look_back - 1`` samples. Pass ``False``
        to keep it and obtain ``len(dataset) - look_back`` samples.

    Returns
    -------
    (dataX, dataY) : tuple of numpy.ndarray
        ``dataX`` has shape ``(n_samples, look_back)`` and ``dataY`` has shape
        ``(n_samples,)``. When the series is too short to build any sample, empty
        arrays of shape ``(0, look_back)`` and ``(0,)`` are returned.
    """
    n_samples = len(dataset) - look_back - (1 if drop_last else 0)
    if n_samples <= 0:
        # Too short for a single window: return consistently shaped empty arrays.
        return np.empty((0, look_back)), np.empty((0,))
    dataX = np.array([dataset[i:i + look_back, 0] for i in range(n_samples)])
    dataY = np.array([dataset[i + look_back, 0] for i in range(n_samples)])
    return dataX, dataY

In [None]:
# Build (window, next value) samples from each segment, using windows of three
# consecutive values.
look_back = 3
(trainX, trainY), (testX, testY) = (
    create_dataset(segment, look_back) for segment in (train, test)
)

In [None]:
# Put the data in the input layout expected by the LSTM network:
# [samples, time steps, features], here look_back time steps of a single feature.
trainX = trainX.reshape(trainX.shape[0], look_back, 1)
testX = testX.reshape(testX.shape[0], look_back, 1)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten, LSTM

In [None]:
# Small recurrent network: one LSTM layer with 4 units reading windows of
# look_back steps x 1 feature, followed by a single linear output unit.
# Trained with mean squared error and the Adam optimiser.
# This rebinds `model`, which previously referred to the XGBoost regressor.
# NOTE(review): no random seed is set in this notebook for the network, so results
# are expected to vary between runs.
model = Sequential([
    LSTM(4, input_shape=(look_back, 1)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Print the layer-by-layer overview of the network defined above.
model.summary()

In [None]:
# Train for 5 epochs with one sample per weight update (batch_size=1);
# verbose=2 logs one line per epoch.
model.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

In [None]:
# Network predictions (still on the scaled 0-1 range) for the training and test windows.
trainPredict, testPredict = model.predict(trainX), model.predict(testX)

In [None]:
# Undo the 0-1 scaling so predictions and targets are back in the original units.
# The 1-D target vectors are wrapped in a list to give the scaler a 2-D input; the
# resulting arrays therefore have a single row (read back as trainY[0] / testY[0]).
# Note: this cell overwrites its own inputs, so it must be run exactly once after
# the prediction cell.
trainPredict, testPredict = (
    scaler.inverse_transform(trainPredict),
    scaler.inverse_transform(testPredict),
)
trainY, testY = (
    scaler.inverse_transform([trainY]),
    scaler.inverse_transform([testY]),
)

In [None]:
import math
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error of the LSTM on both segments, in original units.
# Targets are stored as a single row and predictions as a single column, hence the
# [0] and [:, 0] selections.
train_mse = mean_squared_error(trainY[0], trainPredict[:, 0])
trainScore = math.sqrt(train_mse)
print('Pontuação para o treinamento: %.2f RMSE' % trainScore)
test_mse = mean_squared_error(testY[0], testPredict[:, 0])
testScore = math.sqrt(test_mse)
print('Pontuação para o teste: %.2f RMSE' % testScore)

In [None]:
# NaN-filled array with the shape of the full series; the training predictions are
# written starting at row look_back, since the first look_back values only serve
# as input to the first window. NaN rows are simply not drawn.
trainPredictPlot = np.full_like(serie_prev, np.nan)
train_stop = look_back + len(trainPredict)
trainPredictPlot[look_back:train_stop, :] = trainPredict

In [None]:
# Same idea for the test predictions. The start row skips the training predictions
# plus 2 * look_back + 1 rows, and the slice stops one row before the end of the
# series. These offsets match the number of samples create_dataset produces with
# its default settings.
testPredictPlot = np.full_like(serie_prev, np.nan)
test_start = len(trainPredict) + (look_back * 2) + 1
test_stop = len(serie_prev) - 1
testPredictPlot[test_start:test_stop, :] = testPredict

In [None]:
# Overlay, in this order: the real series (scaled back to original units), the
# training predictions and the test predictions. The legend follows the same order.
curves = (
    scaler.inverse_transform(serie_prev),
    trainPredictPlot,
    testPredictPlot,
)
for curve in curves:
    plt.plot(curve)
plt.legend(["Real","Previsao Treinamento","Previsão Teste"])
plt.show()