***
# <font color=indianred size=10>PREDICTING DOLLAR AGAINST REAL</font>
***

# <font color=lightcoral>1) INTRODUCTION</font>
***

This dataset intend to predict the price of DOLLAR against REAL.

Translating the columns, we have:
    - Data = Date
    - Price = último
    - Open = Abertura
    - High = Máxima
    - Low = Mínima
    - Change% = Var%

The source of data is investing.com

Never use this for taking any decisions or put some money. It's just for study.

# <font color=lightcoral>2) IMPORTING DATA AND LIBRARIES</font>
***

In [62]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [63]:
df_dollar = pd.read_csv('df_dolar.csv')
df_dollar = df_dollar.drop(columns=['Var%'])
df_dollar['Data'] = pd.to_datetime(df_dolar['Data'], format='%d.%m.%Y')
df_dollar['Último'] = df_dollar['Último'].str.replace(',', '.').astype('float64')
df_dollar['Abertura'] = df_dollar['Abertura'].str.replace(',', '.').astype('float64')
df_dollar['Máxima'] = df_dollar['Máxima'].str.replace(',', '.').astype('float64')
df_dollar['Mínima'] = df_dollar['Mínima'].str.replace(',', '.').astype('float64')

df_dollar.head(5)

Unnamed: 0,Data,Último,Abertura,Máxima,Mínima
0,2021-04-06,5.5904,5.6642,5.6698,5.5742
1,2021-04-05,5.6632,5.71,5.7101,5.6323
2,2021-04-02,5.7075,5.708,5.708,5.708
3,2021-04-01,5.7071,5.633,5.7276,5.608
4,2021-03-31,5.6315,5.774,5.7768,5.6214


# <font color=lightcoral>3) TRANSLATING THE DATA FROM PORTUGUESE TO ENGLISH</font>
***

In [64]:
df_dollar= df_dollar.rename(columns={'Data': 'date'})
df_dollar= df_dollar.rename(columns={'Último': 'price'})
df_dollar= df_dollar.rename(columns={'Abertura': 'open'})
df_dollar= df_dollar.rename(columns={'Máxima': 'high'})
df_dollar= df_dollar.rename(columns={'Mínima': 'low'})

In [65]:
df_dollar = df_dollar.sort_values(by='date')
df_dollar.head(10)

Unnamed: 0,date,price,open,high,low
4192,2005-03-07,2.6815,2.654,2.689,2.648
4191,2005-03-08,2.697,2.677,2.7035,2.671
4190,2005-03-09,2.7145,2.692,2.731,2.69
4189,2005-03-10,2.7165,2.717,2.741,2.707
4188,2005-03-11,2.7168,2.712,2.723,2.691
4187,2005-03-14,2.7513,2.718,2.773,2.7168
4186,2005-03-15,2.7644,2.748,2.781,2.738
4185,2005-03-16,2.7623,2.7638,2.772,2.737
4184,2005-03-17,2.7175,2.762,2.7858,2.7095
4183,2005-03-18,2.716,2.718,2.7275,2.706


In [66]:
df_dollar.dtypes

date     datetime64[ns]
price           float64
open            float64
high            float64
low             float64
dtype: object

# <font color=lightcoral>4) CREATING MOVING AVERAGE</font>
***

Moving Average is one of the most principal variables used in this kind of business.

In [67]:
df_dollar['rol5'] = df_dollar['price'].rolling(5).mean().shift(-1)
df_dollar['rol17'] = df_dollar['price'].rolling(17).mean().shift(-1)
df_dollar.head(10)

Unnamed: 0,date,price,open,high,low,rol5,rol17
4192,2005-03-07,2.6815,2.654,2.689,2.648,,
4191,2005-03-08,2.697,2.677,2.7035,2.671,,
4190,2005-03-09,2.7145,2.692,2.731,2.69,,
4189,2005-03-10,2.7165,2.717,2.741,2.707,2.70526,
4188,2005-03-11,2.7168,2.712,2.723,2.691,2.71922,
4187,2005-03-14,2.7513,2.718,2.773,2.7168,2.7327,
4186,2005-03-15,2.7644,2.748,2.781,2.738,2.74226,
4185,2005-03-16,2.7623,2.7638,2.772,2.737,2.74246,
4184,2005-03-17,2.7175,2.762,2.7858,2.7095,2.7423,
4183,2005-03-18,2.716,2.718,2.7275,2.706,2.73734,


In [68]:
# Dropping NA values

df_dollar.dropna(inplace=True)
df_dollar

Unnamed: 0,date,price,open,high,low,rol5,rol17
4177,2005-03-28,2.7255,2.7392,2.7515,2.7160,2.72970,2.726782
4176,2005-03-29,2.6980,2.7255,2.7271,2.6935,2.71550,2.726429
4175,2005-03-30,2.6755,2.6960,2.7085,2.6705,2.70350,2.725429
4174,2005-03-31,2.6800,2.6760,2.6885,2.6520,2.68756,2.722153
4173,2005-04-01,2.6588,2.6660,2.6775,2.6420,2.67066,2.717712
...,...,...,...,...,...,...,...
5,2021-03-30,5.7745,5.7824,5.8034,5.7200,5.71784,5.627971
4,2021-03-31,5.6315,5.7740,5.7768,5.6214,5.72976,5.622359
3,2021-04-01,5.7071,5.6330,5.7276,5.6080,5.72010,5.624429
2,2021-04-02,5.7075,5.7080,5.7080,5.7080,5.69676,5.631929


# <font color=lightcoral>5) MACHINE LEARNING</font>
***

In [69]:
lines = len(df_dollar)

train_lines= round(.70 * lines)
test_lines= lines - train_lines  
Validation_lines = lines -1

info = (
    f" Train Lines = 0:{train_lines}"
    f" Test Lines = {train_lines}:{train_lines + train_lines -1}"
    f" Validation Lines = {Validation_lines}"
)

info

' Train Lines = 0:2924 Test Lines = 2924:5847 Validation Lines = 4176'

In [71]:
# Discovering the best features and labels
features = df_dollar.drop(['date', 'price'], 1)
labels = df_dollar['price']

In [72]:
features_list = ('open','high','low','rol5','rol17')

k_best_features = SelectKBest(k='all')
k_best_features.fit_transform(features, labels)
k_best_features_scores = k_best_features.scores_
raw_pairs = zip(features_list[1:], k_best_features_scores)
ordered_pairs = list(reversed(sorted(raw_pairs, key=lambda x: x[1])))

k_best_features_final = dict(ordered_pairs[:15])
best_features = k_best_features_final.keys()
print ('')
print ("Best features:")
print (k_best_features_final)


Best features:
{'low': 4956.672980190905, 'rol5': 4845.319578909642, 'rol17': 4483.666963131367, 'high': 2179.3835211480814}


In [73]:
features = df_dollar.loc[:,['low','rol5','rol17','high']]
features.head(3)

Unnamed: 0,low,rol5,rol17,high
4177,2.716,2.7297,2.726782,2.7515
4176,2.6935,2.7155,2.726429,2.7271
4175,2.6705,2.7035,2.725429,2.7085


In [74]:
X_train = features[:train_lines]
X_test = features[train_lines:train_lines + test_lines -20]

y_train = labels[:train_lines]
y_test = labels[train_lines:train_lines + test_lines -20]

print( len(X_train), len(y_train))

print( len(X_test), len(y_test))

2924 2924
1233 1233


In [75]:
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train) 
X_test_scale  = scaler.transform(X_test)  

lr = linear_model.LinearRegression()
lr.fit(X_train_scale, y_train)
pred= lr.predict(X_test_scale)
cd =r2_score(y_test, pred)

f'R = {cd * 100:.2f}'

'R = 99.92'

# <font color=lightcoral>5.1) TESTING MODEL</font>
***

In [78]:
testing_model = features.tail(20)
testing_model

Unnamed: 0,low,rol5,rol17,high
20,5.7675,5.74208,5.569406,5.8787
19,5.6452,5.71536,5.579182,5.8134
18,5.5257,5.68756,5.587494,5.6215
17,5.5324,5.63558,5.598588,5.5873
16,5.531,5.60004,5.612853,5.6568
15,5.5578,5.58252,5.619841,5.6319
14,5.5695,5.58746,5.626853,5.681
13,5.4763,5.57544,5.631882,5.586
12,5.4482,5.55322,5.630335,5.564
11,5.4936,5.53254,5.625794,5.5483


In [79]:
prediction=scaler.transform(testing_model)


pred=lr.predict(prediction)

pred

array([5.81952226, 5.729281  , 5.58787312, 5.56755656, 5.58888213,
       5.58798554, 5.61429794, 5.52834116, 5.50223538, 5.5141285 ,
       5.502409  , 5.55556894, 5.64108423, 5.69475951, 5.76615725,
       5.75832723, 5.70273743, 5.67426125, 5.70937199, 5.66793441])

In [82]:
df = df_dollar

date_full=df['date']
date=date_full.tail(20)

res_full=df['price']
res=res_full.tail(20)

df=pd.DataFrame({'date':date, 'real':res, 'prediction':pred})


df.set_index('date', inplace=True)

print(df)

              real  prediction
date                          
2021-03-09  5.8025    5.819522
2021-03-10  5.6723    5.729281
2021-03-11  5.5357    5.587873
2021-03-12  5.5518    5.567557
2021-03-15  5.6156    5.588882
2021-03-16  5.6248    5.587986
2021-03-17  5.5847    5.614298
2021-03-18  5.5604    5.528341
2021-03-19  5.4917    5.502235
2021-03-22  5.5045    5.514128
2021-03-23  5.5214    5.502409
2021-03-24  5.6209    5.555569
2021-03-25  5.6475    5.641084
2021-03-26  5.7558    5.694760
2021-03-29  5.7799    5.766157
2021-03-30  5.7745    5.758327
2021-03-31  5.6315    5.702737
2021-04-01  5.7071    5.674261
2021-04-02  5.7075    5.709372
2021-04-05  5.6632    5.667934
