In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load your data
data = pd.read_csv('/content/train.csv')  # Replace 'your_data.csv' with your dataset
dt = pd.read_csv('/content/test.csv')

In [3]:
dt.shape

(100, 4)

In [4]:
data.head()

Unnamed: 0,id,Date,Open,Close,Volume,Strategy
0,0,2015-06-01,66.208486,75.609978,298506300,Hold
1,1,2015-06-08,64.116235,74.443331,227974800,Hold
2,2,2015-06-15,47.701942,71.140831,250670900,Buy
3,3,2015-06-22,54.754816,72.881344,223614300,Hold
4,4,2015-06-29,48.031899,66.284718,406814900,Buy


In [5]:
ffg = pd.DataFrame()
ffg['Opn'] = data.Open
ffg

Unnamed: 0,Opn
0,66.208486
1,64.116235
2,47.701942
3,54.754816
4,48.031899
...,...
295,81.284821
296,71.970249
297,71.784627
298,69.372333


In [8]:
import pandas as pd

def feature_eng(data):
    result = pd.DataFrame()
    result['Open'] = data.Open

    num_lags = 5  # Choose the number of lag features
    for i in range(1, num_lags + 1):
        result[f'Open_Lag_{i}'] = data['Open'].shift(i)

    # Short-term Moving Average (e.g., 5 days)
    result['Open_Short_Moving_Avg'] = data['Open'].rolling(window=5).mean()

    # Long-term Moving Average (e.g., 20 days)
    result['Open_Long_Moving_Avg'] = data['Open'].rolling(window=20).mean()

    # Exponential Moving Averages (EMAs) with different alpha values
    for alpha in [0.1, 0.2, 0.5]:  # You can choose different smoothing factors (alpha)
        short_term_ema_col = f'Open_Short_Term_EMA_{alpha}'
        medium_term_ema_col = f'Open_Medium_Term_EMA_{alpha}'
        long_term_ema_col = f'Open_Long_Term_EMA_{alpha}'

        result[short_term_ema_col] = data['Open'].ewm(alpha=alpha, adjust=False).mean()
        result[medium_term_ema_col] = data['Open'].ewm(alpha=alpha, adjust=False).mean()
        result[long_term_ema_col] = data['Open'].ewm(alpha=alpha, adjust=False).mean()

    return result


In [9]:
# # Short-term Moving Average (e.g., 5 days)
#       data['Open_Short_Moving_Avg'] = data['Open'].rolling(window=5).mean()

# # Long-term Moving Average (e.g., 20 days)
#       data['Open_Long_Moving_Avg'] = data['Open'].rolling(window=20).mean()

In [10]:
# def calculate_rsi(data, window=14):
#     delta = data['Close'].diff()

#     gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
#     loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()

#     rs = gain / loss
#     data['RSI'] = 100 - (100 / (1 + rs))

# data['Open_rsi'] = calculate_rsi(data)

In [11]:
#       data['Volume_Change'] = data['Volume'].diff()

# # Percentage change in volume
#       data['Volume_Percent_Change'] = data['Volume'].pct_change() * 100
#       result = data

In [12]:
result_data = feature_eng(data)
result_data

Unnamed: 0,Open,Open_Lag_1,Open_Lag_2,Open_Lag_3,Open_Lag_4,Open_Lag_5,Open_Short_Moving_Avg,Open_Long_Moving_Avg,Open_Short_Term_EMA_0.1,Open_Medium_Term_EMA_0.1,Open_Long_Term_EMA_0.1,Open_Short_Term_EMA_0.2,Open_Medium_Term_EMA_0.2,Open_Long_Term_EMA_0.2,Open_Short_Term_EMA_0.5,Open_Medium_Term_EMA_0.5,Open_Long_Term_EMA_0.5
0,66.208486,,,,,,,,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486
1,64.116235,66.208486,,,,,,,65.999261,65.999261,65.999261,65.790036,65.790036,65.790036,65.162361,65.162361,65.162361
2,47.701942,64.116235,66.208486,,,,,,64.169529,64.169529,64.169529,62.172417,62.172417,62.172417,56.432151,56.432151,56.432151
3,54.754816,47.701942,64.116235,66.208486,,,,,63.228058,63.228058,63.228058,60.688897,60.688897,60.688897,55.593484,55.593484,55.593484
4,48.031899,54.754816,47.701942,64.116235,66.208486,,56.162676,,61.708442,61.708442,61.708442,58.157497,58.157497,58.157497,51.812692,51.812692,51.812692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,81.284821,80.663736,79.840215,80.490487,81.278395,80.361799,80.711531,78.167510,78.870477,78.870477,78.870477,80.030761,80.030761,80.030761,80.850407,80.850407,80.850407
296,71.970249,81.284821,80.663736,79.840215,80.490487,81.278395,78.849902,78.145700,78.180454,78.180454,78.180454,78.418659,78.418659,78.418659,76.410328,76.410328,76.410328
297,71.784627,71.970249,81.284821,80.663736,79.840215,80.490487,77.108730,77.932409,77.540872,77.540872,77.540872,77.091852,77.091852,77.091852,74.097478,74.097478,74.097478
298,69.372333,71.784627,71.970249,81.284821,80.663736,79.840215,75.015154,77.871039,76.724018,76.724018,76.724018,75.547949,75.547949,75.547949,71.734906,71.734906,71.734906


In [13]:
X = result_data[['Open', 'Open_Lag_1', 'Open_Lag_2', 'Open_Lag_3', 'Open_Lag_4', 'Open_Lag_5', 'Open_Short_Moving_Avg', 'Open_Long_Moving_Avg', 'Open_Short_Term_EMA_0.1', 'Open_Medium_Term_EMA_0.1', 'Open_Long_Term_EMA_0.1', 'Open_Short_Term_EMA_0.2', 'Open_Medium_Term_EMA_0.2', 'Open_Long_Term_EMA_0.2', 'Open_Short_Term_EMA_0.5', 'Open_Medium_Term_EMA_0.5', 'Open_Long_Term_EMA_0.5']]

X_cls = result_data[['Open', 'Open_Lag_1', 'Open_Lag_2', 'Open_Lag_3', 'Open_Lag_4', 'Open_Lag_5', 'Open_Short_Moving_Avg', 'Open_Long_Moving_Avg', 'Open_Short_Term_EMA_0.1', 'Open_Medium_Term_EMA_0.1', 'Open_Long_Term_EMA_0.1', 'Open_Short_Term_EMA_0.2', 'Open_Medium_Term_EMA_0.2', 'Open_Long_Term_EMA_0.2', 'Open_Short_Term_EMA_0.5', 'Open_Medium_Term_EMA_0.5', 'Open_Long_Term_EMA_0.5']]
X_cls['Close'] = data['Close']

y_reg = data['Close']
y_cls = data['Strategy']  # Target variable for classification (Buy, Sell, Hold)


In [14]:
X.head()

Unnamed: 0,Open,Open_Lag_1,Open_Lag_2,Open_Lag_3,Open_Lag_4,Open_Lag_5,Open_Short_Moving_Avg,Open_Long_Moving_Avg,Open_Short_Term_EMA_0.1,Open_Medium_Term_EMA_0.1,Open_Long_Term_EMA_0.1,Open_Short_Term_EMA_0.2,Open_Medium_Term_EMA_0.2,Open_Long_Term_EMA_0.2,Open_Short_Term_EMA_0.5,Open_Medium_Term_EMA_0.5,Open_Long_Term_EMA_0.5
0,66.208486,,,,,,,,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486,66.208486
1,64.116235,66.208486,,,,,,,65.999261,65.999261,65.999261,65.790036,65.790036,65.790036,65.162361,65.162361,65.162361
2,47.701942,64.116235,66.208486,,,,,,64.169529,64.169529,64.169529,62.172417,62.172417,62.172417,56.432151,56.432151,56.432151
3,54.754816,47.701942,64.116235,66.208486,,,,,63.228058,63.228058,63.228058,60.688897,60.688897,60.688897,55.593484,55.593484,55.593484
4,48.031899,54.754816,47.701942,64.116235,66.208486,,56.162676,,61.708442,61.708442,61.708442,58.157497,58.157497,58.157497,51.812692,51.812692,51.812692


In [15]:
# Splitting the data into training and testing sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, shuffle=False)

# Splitting the data into training and testing sets for classification
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls, test_size=0.2, shuffle=False)

In [16]:
from xgboost import XGBRegressor

# Create the XGBoost regression model with specified hyperparameters
reg_model = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100)

# Fit the model with training data
reg_model.fit(X_train_reg, y_train_reg)


In [17]:
predictionss_df = pd.DataFrame(index=X_test_reg.index)
# predictionss_df['Close', 'predicted']
predictionss_df['Close'] = y_test_reg
predictionss_df['predicted'] = reg_model.predict(X_test_reg)
predictionss_df


Unnamed: 0,Close,predicted
240,71.365903,73.834534
241,74.212029,74.115425
242,75.110591,76.982407
243,79.059896,78.912079
244,86.154937,82.048805
245,82.494413,82.386948
246,84.26427,83.548492
247,81.916434,82.962639
248,81.487762,82.890823
249,78.500217,82.402428


In [18]:
# reg_predictions = reg_model.predict(X_test_reg)
# reg_predictions
# predictions = ['Actual', 'Predicted']
# predictions['Actual'] =

In [19]:
smape = np.mean(np.abs(predictionss_df['predicted'] - predictionss_df['Close']) / (np.abs(predictionss_df['predicted']) + np.abs(predictionss_df['Close'])))

In [20]:
smape

0.011270186842427506

In [21]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the string labels to numerical labels
y_train_cls_encoded = label_encoder.fit_transform(y_train_cls)

# Create an XGBoost classifier
cls_model = XGBClassifier()

# Define a parameter grid to search for the best hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],  # You can adjust the number of trees
    'max_depth': [3, 4, 5],  # You can adjust the maximum depth of the trees
    'learning_rate': [0.1, 0.01, 0.001],  # You can adjust the learning rate
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=cls_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train_cls, y_train_cls_encoded)

# Get the best parameters
best_params = grid_search.best_params_
best_cls_model = grid_search.best_estimator_

# Now, best_cls_model is the XGBoost Classifier with the best hyperparameters


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [22]:
# Initialize LabelEncoder
# Fit the XGBoost Classifier with the best hyperparameters
best_cls_model.fit(X_train_cls, y_train_cls_encoded)

# Now, best_cls_model is the XGBoost Classifier with the best hyperparameters, and it's trained on your training data


In [23]:
X_test_cls.head()

Unnamed: 0,Open,Open_Lag_1,Open_Lag_2,Open_Lag_3,Open_Lag_4,Open_Lag_5,Open_Short_Moving_Avg,Open_Long_Moving_Avg,Open_Short_Term_EMA_0.1,Open_Medium_Term_EMA_0.1,Open_Long_Term_EMA_0.1,Open_Short_Term_EMA_0.2,Open_Medium_Term_EMA_0.2,Open_Long_Term_EMA_0.2,Open_Short_Term_EMA_0.5,Open_Medium_Term_EMA_0.5,Open_Long_Term_EMA_0.5,Close
240,74.811495,71.10802,67.967818,66.278431,71.841477,70.289227,70.401448,79.180577,76.280135,76.280135,76.280135,72.857034,72.857034,72.857034,72.302289,72.302289,72.302289,71.365903
241,80.812144,74.811495,71.10802,67.967818,66.278431,71.841477,72.195581,78.716735,76.733336,76.733336,76.733336,74.448056,74.448056,74.448056,76.557216,76.557216,76.557216,74.212029
242,82.002162,80.812144,74.811495,71.10802,67.967818,66.278431,75.340328,78.095141,77.260218,77.260218,77.260218,75.958877,75.958877,75.958877,79.279689,79.279689,79.279689,75.110591
243,81.211918,82.002162,80.812144,74.811495,71.10802,67.967818,77.989148,77.730771,77.655388,77.655388,77.655388,77.009485,77.009485,77.009485,80.245803,80.245803,80.245803,79.059896
244,79.468382,81.211918,82.002162,80.812144,74.811495,71.10802,79.66122,77.359884,77.836687,77.836687,77.836687,77.501265,77.501265,77.501265,79.857093,79.857093,79.857093,86.154937


In [24]:
cls_predictions = best_cls_model.predict(X_test_cls)
res = []
for i in cls_predictions:
    res.append(i)

for i in range(len(res)):
    if res[i] == 0:
        res[i] = 'Hold'

    elif res[i] == 1:
        res[i] = 'Buy'

    elif res[i] == 2:
        res[i] = 'Sell'
cls_predictions = res
cls_predictions

['Buy',
 'Buy',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Buy',
 'Hold',
 'Hold',
 'Hold',
 'Hold',
 'Hold',
 'Hold',
 'Buy',
 'Hold',
 'Hold',
 'Hold',
 'Hold',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Buy',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell',
 'Sell']

In [25]:
# Evaluate the classification model
accuracy = accuracy_score(y_test_cls, cls_predictions)
print(f'Classification Accuracy: {accuracy}')


Classification Accuracy: 0.3333333333333333


In [26]:
dt.head()

Unnamed: 0,id,Date,Open,Volume
0,0,2021-03-01,72.708331,205897600
1,1,2021-03-08,73.310886,214095600
2,2,2021-03-15,70.610119,151601700
3,3,2021-03-22,70.256017,193982500
4,4,2021-03-29,68.258324,220238400


In [27]:
result_dt = feature_eng(dt)
result_dt

Unnamed: 0,Open,Open_Lag_1,Open_Lag_2,Open_Lag_3,Open_Lag_4,Open_Lag_5,Open_Short_Moving_Avg,Open_Long_Moving_Avg,Open_Short_Term_EMA_0.1,Open_Medium_Term_EMA_0.1,Open_Long_Term_EMA_0.1,Open_Short_Term_EMA_0.2,Open_Medium_Term_EMA_0.2,Open_Long_Term_EMA_0.2,Open_Short_Term_EMA_0.5,Open_Medium_Term_EMA_0.5,Open_Long_Term_EMA_0.5
0,72.708331,,,,,,,,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331
1,73.310886,72.708331,,,,,,,72.768587,72.768587,72.768587,72.828842,72.828842,72.828842,73.009609,73.009609,73.009609
2,70.610119,73.310886,72.708331,,,,,,72.552740,72.552740,72.552740,72.385097,72.385097,72.385097,71.809864,71.809864,71.809864
3,70.256017,70.610119,73.310886,72.708331,,,,,72.323068,72.323068,72.323068,71.959281,71.959281,71.959281,71.032940,71.032940,71.032940
4,68.258324,70.256017,70.610119,73.310886,72.708331,,71.028735,,71.916593,71.916593,71.916593,71.219090,71.219090,71.219090,69.645632,69.645632,69.645632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,57.048622,55.184092,58.417757,59.428165,57.967017,60.016358,57.609131,57.873293,58.740949,58.740949,58.740949,57.191260,57.191260,57.191260,56.910196,56.910196,56.910196
96,60.569661,57.048622,55.184092,58.417757,59.428165,57.967017,58.129659,57.985505,58.923820,58.923820,58.923820,57.866940,57.866940,57.866940,58.739928,58.739928,58.739928
97,61.446777,60.569661,57.048622,55.184092,58.417757,59.428165,58.533382,58.322605,59.176116,59.176116,59.176116,58.582908,58.582908,58.582908,60.093353,60.093353,60.093353
98,55.447711,61.446777,60.569661,57.048622,55.184092,58.417757,57.939373,58.457118,58.803275,58.803275,58.803275,57.955868,57.955868,57.955868,57.770532,57.770532,57.770532


Training Predictions

In [28]:
result_dt.head()

Unnamed: 0,Open,Open_Lag_1,Open_Lag_2,Open_Lag_3,Open_Lag_4,Open_Lag_5,Open_Short_Moving_Avg,Open_Long_Moving_Avg,Open_Short_Term_EMA_0.1,Open_Medium_Term_EMA_0.1,Open_Long_Term_EMA_0.1,Open_Short_Term_EMA_0.2,Open_Medium_Term_EMA_0.2,Open_Long_Term_EMA_0.2,Open_Short_Term_EMA_0.5,Open_Medium_Term_EMA_0.5,Open_Long_Term_EMA_0.5
0,72.708331,,,,,,,,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331,72.708331
1,73.310886,72.708331,,,,,,,72.768587,72.768587,72.768587,72.828842,72.828842,72.828842,73.009609,73.009609,73.009609
2,70.610119,73.310886,72.708331,,,,,,72.55274,72.55274,72.55274,72.385097,72.385097,72.385097,71.809864,71.809864,71.809864
3,70.256017,70.610119,73.310886,72.708331,,,,,72.323068,72.323068,72.323068,71.959281,71.959281,71.959281,71.03294,71.03294,71.03294
4,68.258324,70.256017,70.610119,73.310886,72.708331,,71.028735,,71.916593,71.916593,71.916593,71.21909,71.21909,71.21909,69.645632,69.645632,69.645632


In [29]:
# X_dt = dt[['Open', 'Volume']]
reg_predictions_2 = reg_model.predict(result_dt)
reg_predictions_2

array([76.90455 , 76.00187 , 73.48294 , 74.266365, 73.95649 , 73.72741 ,
       73.84626 , 73.725365, 73.725365, 73.23008 , 74.222694, 73.725365,
       73.84626 , 73.84626 , 74.33781 , 73.725365, 73.72741 , 73.72741 ,
       73.72741 , 74.206726, 73.45047 , 74.28511 , 75.399284, 76.71096 ,
       78.5508  , 79.48032 , 81.74814 , 82.89082 , 82.89082 , 82.89082 ,
       83.04124 , 82.55268 , 82.40581 , 82.89082 , 83.52963 , 85.32492 ,
       85.185905, 86.07239 , 88.92992 , 84.9624  , 89.4721  , 89.19978 ,
       83.920364, 75.763084, 74.612755, 73.996574, 74.115425, 74.115425,
       73.99453 , 74.60697 , 74.384476, 75.72693 , 74.87602 , 75.721146,
       76.98819 , 77.95665 , 79.48032 , 81.74814 , 82.89082 , 82.40243 ,
       82.55268 , 82.38695 , 85.76952 , 84.633896, 83.0102  , 77.41651 ,
       75.121284, 73.996574, 73.996574, 73.499245, 71.43705 , 71.97361 ,
       69.279434, 68.98072 , 69.077324, 69.077324, 68.75794 , 68.403   ,
       68.3788  , 66.514786, 64.23883 , 63.552032, 

In [30]:
result_dt['Close'] = reg_predictions_2
cls_predictions_2 = best_cls_model.predict(result_dt)
res_2 = []
for i in cls_predictions_2:
    res_2.append(i)

for i in range(len(res_2)):
    if res_2[i] == 0:
        res_2[i] = 'Hold'

    elif res_2[i] == 1:
        res_2[i] = 'Buy'

    elif res_2[i] == 2:
        res_2[i] = 'Sell'
cls_predictions_2 = res_2

In [31]:
# Assuming 'X_test_reg' contains the testing data for regression and 'X_test_cls' contains the testing data for classification.

# Create a DataFrame for the submission file
submission = pd.DataFrame()
submission['id'] = dt['id']
submission['Date'] = dt['Date']  # Assuming the index contains the dates in the test set
submission['Close'] = reg_predictions_2  # Predicted close prices
submission['Strategy'] = cls_predictions_2  # Predicted strategies
submission
# Convert numerical strategy predictions back to their original labels (Buy, Sell, Hold)
# Replace the encoding based on how it was originally encoded
# Example (modify according to your encoding):


Unnamed: 0,id,Date,Close,Strategy
0,0,2021-03-01,76.904549,Buy
1,1,2021-03-08,76.001869,Buy
2,2,2021-03-15,73.482941,Hold
3,3,2021-03-22,74.266365,Buy
4,4,2021-03-29,73.956490,Hold
...,...,...,...,...
95,95,2022-12-26,64.508522,Sell
96,96,2023-01-02,64.089035,Sell
97,97,2023-01-09,63.435783,Sell
98,98,2023-01-16,64.089035,Buy


In [32]:
# Save the DataFrame to a CSV file
output_filename = "submission5.csv"

# Save the selected data to a new CSV file
submission.to_csv(output_filename, index=False)

final_csv = pd.read_csv('submission5.csv')
display(final_csv)

Unnamed: 0,id,Date,Close,Strategy
0,0,2021-03-01,76.904550,Buy
1,1,2021-03-08,76.001870,Buy
2,2,2021-03-15,73.482940,Hold
3,3,2021-03-22,74.266365,Buy
4,4,2021-03-29,73.956490,Hold
...,...,...,...,...
95,95,2022-12-26,64.508520,Sell
96,96,2023-01-02,64.089035,Sell
97,97,2023-01-09,63.435783,Sell
98,98,2023-01-16,64.089035,Buy
