### Loading the data


In [1]:
import pandas as pd

In [2]:
path = '/content/drive/MyDrive/praksa_manu/distilroberta_model_training/predicted.csv'

data = pd.read_csv(path)

data.head(10)

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Close,Predictions
0,9/29/2022,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",268.209992,neutral
1,9/29/2022,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",268.209992,neutral
2,9/29/2022,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",268.209992,neutral
3,9/29/2022,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",268.209992,neutral
4,9/29/2022,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",268.209992,neutral
5,9/29/2022,@RealDanODowd @Tesla This is you https://t.co/...,TSLA,"Tesla, Inc.",268.209992,neutral
6,9/29/2022,For years @WholeMarsBlog viciously silenced @T...,TSLA,"Tesla, Inc.",268.209992,neutral
7,9/29/2022,$NIO just because I'm down money doesn't mean ...,TSLA,"Tesla, Inc.",268.209992,positive
8,9/29/2022,50 likes for some $SPY $TSLA charts to study!\...,TSLA,"Tesla, Inc.",268.209992,neutral
9,9/29/2022,@MrJames__321 @KellyRoofing @TeslaSolar @elonm...,TSLA,"Tesla, Inc.",268.209992,neutral


In [3]:
data.rename(columns={"Predictions": "Category"}, inplace=True)

### Exapnding feature domain
We will implement lags for the Close column for previous 5 days.

In [5]:
for i in range(1,6):
  data[f'Close_lag_{i}'] = data['Close'].shift(i)

data.head(10)

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Close,Category,Close_lag_1,Close_lag_2,Close_lag_3,Close_lag_4,Close_lag_5
0,9/29/2022,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",268.209992,neutral,,,,,
1,9/29/2022,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,,,,
2,9/29/2022,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,268.209992,,,
3,9/29/2022,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,268.209992,268.209992,,
4,9/29/2022,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,268.209992,268.209992,268.209992,
5,9/29/2022,@RealDanODowd @Tesla This is you https://t.co/...,TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,268.209992,268.209992,268.209992,268.209992
6,9/29/2022,For years @WholeMarsBlog viciously silenced @T...,TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,268.209992,268.209992,268.209992,268.209992
7,9/29/2022,$NIO just because I'm down money doesn't mean ...,TSLA,"Tesla, Inc.",268.209992,positive,268.209992,268.209992,268.209992,268.209992,268.209992
8,9/29/2022,50 likes for some $SPY $TSLA charts to study!\...,TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,268.209992,268.209992,268.209992,268.209992
9,9/29/2022,@MrJames__321 @KellyRoofing @TeslaSolar @elonm...,TSLA,"Tesla, Inc.",268.209992,neutral,268.209992,268.209992,268.209992,268.209992,268.209992


In [6]:
data = data.dropna()

In [7]:
data.isnull().sum()

Unnamed: 0,0
Date,0
Tweet,0
Stock Name,0
Company Name,0
Close,0
Category,0
Close_lag_1,0
Close_lag_2,0
Close_lag_3,0
Close_lag_4,0


### Applying date transformation

We will split the values of the 'Date' column into month and year to have better approach. Then we will apply sine and cosine transformation to this values.

In [None]:
import numpy as np

In [None]:
def split_date(row):
    month, day, year = row['Date'].split('/')

    return pd.Series({'Month': int(month), 'Year': int(year)})

In [None]:
data[['Month', 'Year']] = data.apply(split_date, axis=1)

In [None]:
data.head(10)

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Close,Predictions,Month,Year
0,9/29/2022,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
1,9/29/2022,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
2,9/29/2022,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
3,9/29/2022,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
4,9/29/2022,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
5,9/29/2022,@RealDanODowd @Tesla This is you https://t.co/...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
6,9/29/2022,For years @WholeMarsBlog viciously silenced @T...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
7,9/29/2022,$NIO just because I'm down money doesn't mean ...,TSLA,"Tesla, Inc.",268.209992,positive,9,2022
8,9/29/2022,50 likes for some $SPY $TSLA charts to study!\...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022
9,9/29/2022,@MrJames__321 @KellyRoofing @TeslaSolar @elonm...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022


In [None]:
def sine_transform(value, max_value):

    return np.sin(2 * np.pi * value / max_value)

In [None]:
def cosine_transform(value, max_value):

    return np.cos(2 * np.pi * value / max_value)

In [None]:
data['Month_Sin'] = data['Month'].apply(lambda x: sine_transform(x, 12))

In [None]:
data['Month_Cos'] = data['Month'].apply(lambda x: cosine_transform(x, 12))

In [None]:
max_year = data['Year'].max()

data['Year_Sin'] = data['Year'].apply(lambda x: sine_transform(x, max_year))

In [None]:
data['Year_Cos'] = data['Year'].apply(lambda x: cosine_transform(x, max_year))

In [None]:
data.head(10)

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Close,Predictions,Month,Year,Month_Sin,Month_Cos,Year_Sin,Year_Cos
0,9/29/2022,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
1,9/29/2022,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
2,9/29/2022,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
3,9/29/2022,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
4,9/29/2022,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
5,9/29/2022,@RealDanODowd @Tesla This is you https://t.co/...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
6,9/29/2022,For years @WholeMarsBlog viciously silenced @T...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
7,9/29/2022,$NIO just because I'm down money doesn't mean ...,TSLA,"Tesla, Inc.",268.209992,positive,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
8,9/29/2022,50 likes for some $SPY $TSLA charts to study!\...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0
9,9/29/2022,@MrJames__321 @KellyRoofing @TeslaSolar @elonm...,TSLA,"Tesla, Inc.",268.209992,neutral,9,2022,-1.0,-1.83697e-16,-2.449294e-16,1.0


To avoid redundancy we will drop columns 'Date', 'Month' and 'Year'.

In [None]:
data.drop(columns=['Date', 'Year', 'Month'], inplace=True)

data.head(10)

Unnamed: 0,Tweet,Stock Name,Company Name,Close,Predictions,Month_Sin,Month_Cos,Year_Sin,Year_Cos
0,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
1,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
2,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
3,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
4,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
5,@RealDanODowd @Tesla This is you https://t.co/...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
6,For years @WholeMarsBlog viciously silenced @T...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
7,$NIO just because I'm down money doesn't mean ...,TSLA,"Tesla, Inc.",268.209992,positive,-1.0,-1.83697e-16,-2.449294e-16,1.0
8,50 likes for some $SPY $TSLA charts to study!\...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0
9,@MrJames__321 @KellyRoofing @TeslaSolar @elonm...,TSLA,"Tesla, Inc.",268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0


### Converting features

Because in the previous tasks we added sentiment column while preprocessing the 'Tweet' column and applying the sentiment to the datset in this task we do not need it.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
data.drop(columns=['Tweet'], inplace=True)

In [None]:
len(data['Stock Name'].unique())

25

In [None]:
sn_encoder = LabelEncoder()

In [None]:
data['Stock Name'] = sn_encoder.fit_transform(data['Stock Name'])

In [None]:
sn_scaler = MinMaxScaler()

data['Stock Name'] = sn_scaler.fit_transform(data[['Stock Name']])

In [None]:
data.head(10)

Unnamed: 0,Stock Name,Company Name,Close,Category,Month_Sin,Month_Cos,Year_Sin,Year_Cos,Category_encoded
0,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
1,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
2,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
3,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
4,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
5,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
6,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
7,0.833333,18,268.209992,positive,-1.0,-1.83697e-16,-2.449294e-16,1.0,1
8,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
9,0.833333,18,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0


In [None]:
len(data['Company Name'].unique())

25

In [None]:
cn_encoder = LabelEncoder()

In [None]:
data['Company Name'] = cn_encoder.fit_transform(data['Company Name'])

In [None]:
cn_scaler = MinMaxScaler()

data['Company Name'] = cn_scaler.fit_transform(data[['Company Name']])

In [None]:
data.head(10)

Unnamed: 0,Stock Name,Company Name,Close,Category,Month_Sin,Month_Cos,Year_Sin,Year_Cos,Category_encoded
0,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
1,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
2,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
3,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
4,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
5,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
6,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
7,0.833333,0.75,268.209992,positive,-1.0,-1.83697e-16,-2.449294e-16,1.0,1
8,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0
9,0.833333,0.75,268.209992,neutral,-1.0,-1.83697e-16,-2.449294e-16,1.0,0


In [None]:
close_scaler = MinMaxScaler()

data['Close'] = close_scaler.fit_transform(data[['Close']])

In [None]:
def encode_sentiment(row):
    if row['Category'] == 'positive':
        return 1
    elif row['Category'] == 'negative':
        return -1
    else:
        return 0


data['Category'] = data.apply(encode_sentiment, axis=1)

In [None]:
data.head(10)

Unnamed: 0,Stock Name,Company Name,Close,Category,Month_Sin,Month_Cos,Year_Sin,Year_Cos
0,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
1,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
2,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
3,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
4,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
5,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
6,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
7,0.833333,0.75,0.377748,1,-1.0,-1.83697e-16,-2.449294e-16,1.0
8,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0
9,0.833333,0.75,0.377748,0,-1.0,-1.83697e-16,-2.449294e-16,1.0


### Dividing dataset into subsets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X, y = data.drop(columns=['Close']), data['Close']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Finding best model

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [None]:
model = xgb.XGBRegressor(objective='reg:squarederror')

In [None]:
params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 6,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'alpha': 0.1,
    'lambda': 1,
    'min_child_weight': 1,
    'gamma': 0,
}

In [None]:
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3)

### Training version: 1
In this version of the training we will train the dataset with splitting the dataset into subsets, with disabled shuffle on train_test_split object. After training the models both way we will compare the results.

### Training version: 2
We will enable the suffling on the trianing and test subsets.