In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [47]:
# Step 1: Data Preprocessing
# Read each CSV, convert its 'date' column to datetimes, and make that
# column the index of the resulting frame.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index(['date'])
    for csv_path in ("./data/set1.csv", "./data/set2.csv", "./data/set3.csv")
)

                 open       high        low      close     volume   adjclose
date                                                                        
2012-07-23  21.228571  21.639286  20.989643  21.565357  487975600  18.257217
2012-07-24  21.692142  21.774286  21.375357  21.461430  565132400  18.169228
2012-07-27  20.536072  20.922501  20.413929  20.898571  403936400  17.692717
2012-07-30  21.104286  21.408571  20.993570  21.251072  379142400  17.991146
2012-07-31  21.543928  21.846430  21.525715  21.812857  462327600  18.466757
...               ...        ...        ...        ...        ...        ...
2020-01-17  79.067497  79.684998  78.750000  79.682503  137816400  77.530296
2020-01-21  79.297501  79.754997  79.000000  79.142502  110843200  77.004898
2020-01-22  79.644997  79.997498  79.327499  79.425003  101832400  77.279770
2020-01-23  79.480003  79.889999  78.912498  79.807503  104472000  77.651932
2020-01-24  80.062500  80.832497  79.379997  79.577499  146537600  77.428162

In [42]:
def preprocess(df):
    """Add a next-row ``target`` column to ``df`` and return the same frame.

    ``target`` for each row is the ``close`` of the row that follows it. The
    final row has no successor, so its target is set to its own ``close``
    (the same value the previous row's target holds); this keeps the column
    free of a trailing NaN without dropping a row.

    The final row is patched by position rather than by index label, so a
    repeated label in the index cannot overwrite the targets of other rows,
    and frames with fewer than two rows are handled instead of raising
    ``IndexError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``close`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``target`` column.
    """
    close = df["close"]
    target = close.shift(-1)

    if len(target) > 0:
        # Positional write: only the physically last row is touched, even
        # when its index label also appears earlier in the frame.
        target.iloc[-1] = close.iloc[-1]

    df["target"] = target
    return df

def add_techincal(df):
    """Add moving-average, RSI and MACD indicator columns to ``df`` and return it.

    Columns written (all derived from ``df['close']``):

    * ``EMA_9`` - exponentially weighted mean with ``com=9``, shifted one row.
    * ``SMA_5`` / ``SMA_10`` / ``SMA_15`` / ``SMA_30`` - simple rolling means,
      shifted one row so each row only sees closes from the rows before it.
    * ``RSI`` - 14-row relative strength index. Rows without a full window,
      and windows in which the close never moved, are reported as 0.
    * ``MACD`` - span-12 EMA minus span-26 EMA (not shifted).
    * ``MACD_signal`` - span-9 EMA of ``MACD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``close`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the indicator columns added, so the
        function can be used as ``df = add_techincal(df)``.
    """
    def relative_strength_idx(df, n=14):
        """Return the ``n``-row RSI of ``df['close']`` on the same index as ``df``."""
        delta = df['close'].diff()
        # Split each change into its gain and its loss magnitude. The first
        # row's NaN change is kept (not sliced off) so the result has one
        # value per row of ``df`` and a later fillna covers every row.
        gains = delta.clip(lower=0)
        losses = delta.clip(upper=0).abs()
        rs = gains.rolling(n).mean() / losses.rolling(n).mean()
        return 100.0 - (100.0 / (1.0 + rs))

    close = df['close']

    # NOTE(review): the first positional argument of ``ewm`` is ``com``, so
    # this is com=9 (alpha = 1/10), not a span-9 EMA as the column name may
    # suggest. Kept as-is to avoid changing feature values; confirm intent.
    df['EMA_9'] = close.ewm(com=9).mean().shift()
    df['SMA_5'] = close.rolling(5).mean().shift()
    df['SMA_10'] = close.rolling(10).mean().shift()
    df['SMA_15'] = close.rolling(15).mean().shift()
    df['SMA_30'] = close.rolling(30).mean().shift()

    df['RSI'] = relative_strength_idx(df).fillna(0)

    ema_12 = close.ewm(span=12, min_periods=12).mean()
    ema_26 = close.ewm(span=26, min_periods=26).mean()
    df['MACD'] = ema_12 - ema_26
    df['MACD_signal'] = df['MACD'].ewm(span=9, min_periods=9).mean()

    return df
# Attach the next-row 'target' column to each of the three price frames.
df1, df2, df3 = (preprocess(frame) for frame in (df1, df2, df3))



In [43]:
def train(df, test_size=0.2, random_state=42, shuffle=True, verbose=None):
    """Fit a CatBoostRegressor on ``df`` and evaluate it on a held-out split.

    Every column except ``target`` is used as a feature. The test-set mean
    squared error is printed once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``target`` column plus the feature columns.
    test_size : float, default 0.2
        Fraction of rows held out for testing (passed to ``train_test_split``).
    random_state : int, default 42
        Seed for the shuffled split (passed to ``train_test_split``).
    shuffle : bool, default True
        Whether rows are shuffled before splitting. The default keeps the
        previous behaviour. NOTE(review): for date-ordered rows a shuffled
        split lets the model train on rows dated after some test rows, which
        tends to make the reported error optimistic; pass ``shuffle=False``
        to hold out the last ``test_size`` of the rows instead.
    verbose : bool or int, optional
        Forwarded to ``CatBoostRegressor``; ``None`` keeps CatBoost's default
        logging, ``False`` silences the per-iteration log.

    Returns
    -------
    tuple
        ``(y_test, predictions)``: the held-out targets (indexed by row
        position in ``df``) and the model's predictions for those rows.
    """
    # Step 1: Split data into training and testing sets
    # y carries a positional 0..n-1 index, so the returned y_test is labelled
    # by row position rather than by df's own index.
    y = df['target'].reset_index(drop=True)
    X = df.drop(columns=['target'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )

    # Step 2: Train a CatBoostRegressor model
    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6, verbose=verbose)
    model.fit(X_train, y_train)

    # Step 3: Make predictions on the test data
    predictions = model.predict(X_test)

    # Step 4: Calculate Mean Squared Error (MSE) on the held-out rows
    mse_test = mean_squared_error(y_test, predictions)
    print("Mean Squared Error (Test):", mse_test)

    return y_test, predictions


In [44]:
def plot(y_test, predictions):
    """Show a scatter of predicted against actual values with a y = x reference line.

    Parameters
    ----------
    y_test : array-like
        Actual target values (x axis).
    predictions : array-like
        Model predictions for the same rows (y axis).
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # Red points: one per held-out row. Blue line: where a perfect model's
    # points would fall (actual plotted against itself).
    ax.scatter(y_test, predictions, color='red', label='Predicted')
    ax.plot(y_test, y_test, color='blue', linewidth=2, label='Perfect Prediction Line')

    ax.set_xlabel('Actual Close Price')
    ax.set_ylabel('Predicted Close Price')
    ax.set_title('Actual vs. Predicted Close Price')
    ax.legend()

    plt.show()

In [45]:
# Fit and evaluate one model per dataset; each train() call returns
# (held-out targets, predictions).
(y_test1, predictions1), (y_test2, predictions2), (y_test3, predictions3) = (
    train(frame) for frame in (df1, df2, df3)
)

0:	learn: 11.8562241	total: 3.07ms	remaining: 151ms
1:	learn: 10.8346379	total: 5.41ms	remaining: 130ms
2:	learn: 9.8502798	total: 8.13ms	remaining: 127ms
3:	learn: 8.9800968	total: 9.81ms	remaining: 113ms
4:	learn: 8.1812222	total: 11.8ms	remaining: 107ms
5:	learn: 7.4543866	total: 14.1ms	remaining: 104ms
6:	learn: 6.7958819	total: 15.6ms	remaining: 95.9ms
7:	learn: 6.1957126	total: 17.1ms	remaining: 89.8ms
8:	learn: 5.6644778	total: 18.8ms	remaining: 85.6ms
9:	learn: 5.1818674	total: 20.3ms	remaining: 81ms
10:	learn: 4.7239381	total: 23.9ms	remaining: 84.9ms
11:	learn: 4.3307754	total: 26.8ms	remaining: 85ms
12:	learn: 3.9802609	total: 28.7ms	remaining: 81.7ms
13:	learn: 3.6560523	total: 30.7ms	remaining: 78.9ms
14:	learn: 3.3537747	total: 32.9ms	remaining: 76.8ms
15:	learn: 3.0911001	total: 35ms	remaining: 74.3ms
16:	learn: 2.8450909	total: 37.2ms	remaining: 72.2ms
17:	learn: 2.6353463	total: 40.2ms	remaining: 71.5ms
18:	learn: 2.4339993	total: 41.9ms	remaining: 68.3ms
19:	learn: 2.

In [46]:
# Test-set MSE for each of the three datasets, one value per line.
print(
    *(
        mean_squared_error(actual, predicted)
        for actual, predicted in (
            (y_test1, predictions1),
            (y_test2, predictions2),
            (y_test3, predictions3),
        )
    ),
    sep="\n",
)

0.7903373313164244
0.8736231885161355
0.8268111027951676


In [None]:
# Actual-vs-predicted scatter for the first dataset (values returned by train(df1)).
plot(y_test1, predictions1)

XGBoostError: [16:24:22] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\data\data.cc:501: Check failed: this->labels.Size() % this->num_row_ == 0 (1 vs. 0) : Incorrect size for labels.

In [None]:
# Actual-vs-predicted scatter for the second dataset (values returned by train(df2)).
plot(y_test2, predictions2)

In [None]:
# Actual-vs-predicted scatter for the third dataset (values returned by train(df3)).
plot(y_test3, predictions3)