In [198]:
import pandas as pd

In [199]:
# Load the cleaned USD/VND table and use its 'Ngày' column as the row index.
df = pd.read_csv('../usdvnd/usdvnd_cleaned.csv').set_index('Ngày')
# Convert the index labels to timestamps so the frame has a DatetimeIndex.
df.index = pd.to_datetime(df.index)

In [200]:
# Feature engineering: lag features.
# Each new column is a copy of its source column shifted down by `lag` rows,
# so the first `lag` rows of that column are NaN.
lagged_columns = {
    'Lần cuối 1': ('Lần cuối', 1),      # 1-row lag of 'Lần cuối'
    '% Thay đổi 1': ('% Thay đổi', 1),  # 1-row lag of '% Thay đổi'
    'Lần cuối 3': ('Lần cuối', 3),      # 3-row lag of 'Lần cuối'
    'Lần cuối 7': ('Lần cuối', 7),      # 7-row lag of 'Lần cuối'
}
for lagged_name, (source_name, lag) in lagged_columns.items():
    df[lagged_name] = df[source_name].shift(lag)


In [201]:
# Count the missing (NaN) values in each column and print the totals.
nan_counts = df.isna().sum()
print(nan_counts)

Lần cuối        0
Mở              0
Cao             0
Thấp            0
% Thay đổi      0
Lần cuối 1      1
% Thay đổi 1    1
Lần cuối 3      3
Lần cuối 7      7
dtype: int64


In [202]:
# Sliding-window features computed on the 'Lần cuối' column.
# With an integer window, a row stays NaN until a full window of
# observations is available.
close_roll_7 = df['Lần cuối'].rolling(window=7)

# Moving averages over 7 and 30 rows.
df['MA 7'] = close_roll_7.mean()
df['MA 30'] = df['Lần cuối'].rolling(window=30).mean()

# Volatility: standard deviation over the same 7-row window.
df['Std dev 7'] = close_roll_7.std()

# relative strength index 14 days
def calculate_rsi(data, window=14):
    """Compute a Relative Strength Index using simple moving averages.

    The average gain and average loss are plain rolling means of the positive
    and negative price changes (not Wilder's exponential smoothing).

    Parameters
    ----------
    data : pandas.Series
        Price series; assumed to be in chronological order.
    window : int, default 14
        Number of consecutive price changes averaged for gains and losses.

    Returns
    -------
    pandas.Series
        RSI on a 0-100 scale, aligned to ``data``'s index. The first
        ``window`` entries are NaN because fewer than ``window`` price changes
        exist there; they are deliberately not filled, so a later ``dropna()``
        can remove them instead of training on a made-up value.
    """
    delta = data.diff()
    # clip() keeps the leading NaN from diff(), so the undefined first change
    # is not counted as a real "no change" observation.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = delta.clip(upper=0).abs().rolling(window=window).mean()
    # loss == 0 with gain > 0 gives rs = inf, which maps to RSI = 100.
    rs = gain / loss
    rsi = 100 - 100 / (1 + rs)
    # A completely flat window gives 0/0 = NaN; report the neutral value 50
    # there, and only there (warm-up NaNs are left untouched).
    flat = (gain == 0) & (loss == 0)
    return rsi.mask(flat, 50.0)
    
# 14-row RSI of the 'Lần cuối' column.
df['RSI 14'] = calculate_rsi(df['Lần cuối'], window=14)

In [203]:
# Show the first 20 rows of the slice that starts at the 1994-09-01 label.
preview_start = '1994-09-01'
df.loc[preview_start:].head(20)

Unnamed: 0_level_0,Lần cuối,Mở,Cao,Thấp,% Thay đổi,Lần cuối 1,% Thay đổi 1,Lần cuối 3,Lần cuối 7,MA 7,MA 30,Std dev 7,RSI 14
Ngày,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1994-09-01,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10985.0,,3.741657,100.0
1994-09-02,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10986.0,,3.41565,100.0
1994-09-03,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10987.0,10982.4,2.645751,100.0
1994-09-04,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10988.0,10982.633333,0.0,100.0
1994-09-05,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10982.866667,0.0,100.0
1994-09-06,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.1,0.0,100.0
1994-09-07,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.333333,0.0,100.0
1994-09-08,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.566667,0.0,100.0
1994-09-09,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.8,0.0,100.0
1994-09-10,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10984.033333,0.0,100.0


In [204]:
# Calendar features read from the datetime index.
dates = df.index
df['Ngày thứ'] = dates.dayofweek  # day of week, 0 is Monday
df['Tháng thứ'] = dates.month
df['Quý thứ'] = dates.quarter
df['Năm thứ'] = dates.year

In [205]:
# Show the first 20 rows of the slice that starts at the 1994-09-01 label.
preview_start = '1994-09-01'
df.loc[preview_start:].head(20)

Unnamed: 0_level_0,Lần cuối,Mở,Cao,Thấp,% Thay đổi,Lần cuối 1,% Thay đổi 1,Lần cuối 3,Lần cuối 7,MA 7,MA 30,Std dev 7,RSI 14,Ngày thứ,Tháng thứ,Quý thứ,Năm thứ
Ngày,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1994-09-01,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10985.0,,3.741657,100.0,3,9,3,1994
1994-09-02,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10986.0,,3.41565,100.0,4,9,3,1994
1994-09-03,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10987.0,10982.4,2.645751,100.0,5,9,3,1994
1994-09-04,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10988.0,10982.633333,0.0,100.0,6,9,3,1994
1994-09-05,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10982.866667,0.0,100.0,0,9,3,1994
1994-09-06,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.1,0.0,100.0,1,9,3,1994
1994-09-07,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.333333,0.0,100.0,2,9,3,1994
1994-09-08,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.566667,0.0,100.0,3,9,3,1994
1994-09-09,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.8,0.0,100.0,4,9,3,1994
1994-09-10,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10984.033333,0.0,100.0,5,9,3,1994


In [206]:
# Report the row count before filtering.
print(f"Number of rows before dropna: {len(df)}")

# Keep only the rows in which every column has a value.
df = df.dropna(how='any')

# Report the row count after filtering and show the first rows that remain.
print(f"Number of rows after dropna: {len(df)}")
print("\nPreview the new features:")
print(df.head())

Number of rows before dropna: 11418
Number of rows after dropna: 11389

Preview the new features:
            Lần cuối       Mở      Cao     Thấp  % Thay đổi  Lần cuối 1  \
Ngày                                                                      
1994-09-03   10988.0  10988.0  10988.0  10988.0         0.0     10988.0   
1994-09-04   10988.0  10988.0  10988.0  10988.0         0.0     10988.0   
1994-09-05   10988.0  10988.0  10988.0  10988.0         0.0     10988.0   
1994-09-06   10988.0  10988.0  10988.0  10988.0         0.0     10988.0   
1994-09-07   10988.0  10988.0  10988.0  10988.0         0.0     10988.0   

            % Thay đổi 1  Lần cuối 3  Lần cuối 7     MA 7         MA 30  \
Ngày                                                                      
1994-09-03           0.0     10988.0     10981.0  10987.0  10982.400000   
1994-09-04           0.0     10988.0     10981.0  10988.0  10982.633333   
1994-09-05           0.0     10988.0     10988.0  10988.0  10982.866667   
1

In [None]:
# Save the engineered features to CSV.
# The dates live in the index and index=False leaves the index out of the
# file, so copy them into a regular 'Ngày' column before writing.
output_path = '../usdvnd/processed/feature_engineered.csv'
df['Ngày'] = df.index
df.to_csv(output_path, index=False)

In [208]:
# Show the first 20 rows of the slice that starts at the 1994-09-01 label.
preview_start = '1994-09-01'
df.loc[preview_start:].head(20)

Unnamed: 0_level_0,Lần cuối,Mở,Cao,Thấp,% Thay đổi,Lần cuối 1,% Thay đổi 1,Lần cuối 3,Lần cuối 7,MA 7,MA 30,Std dev 7,RSI 14,Ngày thứ,Tháng thứ,Quý thứ,Năm thứ,Ngày
Ngày,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1994-09-03,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10987.0,10982.4,2.645751,100.0,5,9,3,1994,1994-09-03
1994-09-04,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10981.0,10988.0,10982.633333,0.0,100.0,6,9,3,1994,1994-09-04
1994-09-05,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10982.866667,0.0,100.0,0,9,3,1994,1994-09-05
1994-09-06,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.1,0.0,100.0,1,9,3,1994,1994-09-06
1994-09-07,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.333333,0.0,100.0,2,9,3,1994,1994-09-07
1994-09-08,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.566667,0.0,100.0,3,9,3,1994,1994-09-08
1994-09-09,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10983.8,0.0,100.0,4,9,3,1994,1994-09-09
1994-09-10,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10984.033333,0.0,100.0,5,9,3,1994,1994-09-10
1994-09-11,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10984.266667,0.0,100.0,6,9,3,1994,1994-09-11
1994-09-12,10988.0,10988.0,10988.0,10988.0,0.0,10988.0,0.0,10988.0,10988.0,10988.0,10984.5,0.0,50.0,0,9,3,1994,1994-09-12
