**DATA PRE-PROCESSING**

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
# Install a blanket "ignore" filter so warnings raised by later cells are not
# shown in the cell output.
# NOTE(review): this also hides deprecation notices that may matter when
# library versions change — consider narrowing the filter to a category.
warnings.filterwarnings("ignore")

In [5]:
import yfinance as yf

In [6]:
# Ticker and date window for the daily price history used by the rest of the notebook.
symbol = 'AMD'
start = '2007-01-01'
end = '2024-12-31'

# Keyword arguments make the call self-describing. auto_adjust=True pins the
# adjusted-price behaviour explicitly, so the result does not depend on the
# installed yfinance version's default and the "auto_adjust default changed"
# notice is no longer emitted.
dataset = yf.download(symbol, start=start, end=end, auto_adjust=True)

# Fail fast when nothing came back (bad ticker, network problem, empty date
# window); otherwise every later cell would silently operate on zero rows.
if dataset.empty:
    raise RuntimeError(
        f"No price data returned for {symbol} between {start} and {end}"
    )
dataset.head()

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AMD,AMD,AMD,AMD,AMD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2007-01-03,19.52,20.4,19.35,20.08,28350300
2007-01-04,19.790001,19.860001,19.32,19.66,23652500
2007-01-05,19.709999,19.91,19.540001,19.540001,15902400
2007-01-08,19.469999,19.860001,19.370001,19.709999,15814800
2007-01-09,19.65,19.709999,19.370001,19.450001,14494200


In [7]:
# Each flag column is 1 when the *next* row's value of the source column is
# higher than the current row's value, else 0. shift(-1) pulls the next row up
# by one position; the final row has no successor, so its comparison is False
# and it is labelled 0.
next_day_flags = {
    'Increase_Decrease': 'Volume',
    'Buy_Sell_on_Open': 'Open',
    'Buy_Sell': 'Close',
}
for flag_column, source_column in next_day_flags.items():
    today = dataset[source_column]
    tomorrow = today.shift(-1)
    dataset[flag_column] = np.where(tomorrow > today, 1, 0)

# Day-over-day fractional change of the close; the first row has no
# predecessor and is therefore NaN, which the dropna below removes.
close_prices = dataset['Close']
dataset['Returns'] = close_prices.pct_change()
dataset = dataset.dropna()
dataset.head()

Price,Close,High,Low,Open,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Ticker,AMD,AMD,AMD,AMD,AMD,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2007-01-04,19.790001,19.860001,19.32,19.66,23652500,0,0,0,0.013832
2007-01-05,19.709999,19.91,19.540001,19.540001,15902400,0,1,0,-0.004043
2007-01-08,19.469999,19.860001,19.370001,19.709999,15814800,0,0,1,-0.012177
2007-01-09,19.65,19.709999,19.370001,19.450001,14494200,1,1,1,0.009245
2007-01-10,20.01,20.02,19.5,19.639999,19783200,1,1,1,0.018321


In [9]:
# Feature matrix: the four raw market columns; target: the closing price.
# Both are extracted as plain NumPy arrays for the scikit-learn transformers below.
feature_frame = dataset[['Open', 'High', 'Low', 'Volume']]
target_frame = dataset['Close']
X, y = feature_frame.values, target_frame.values

**RESCALING DATA**

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Show arrays with three significant decimals from here on.
np.set_printoptions(precision=3)

# Rescale every feature column independently into the [0, 1] interval.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaled_x = scaler.transform(X)
rescaled_x[:5]

array([[0.085, 0.081, 0.086, 0.073],
       [0.085, 0.081, 0.087, 0.049],
       [0.085, 0.081, 0.086, 0.049],
       [0.084, 0.08 , 0.086, 0.045],
       [0.085, 0.081, 0.087, 0.061]])

***NORMALIZING THE DATA***

In [12]:
from sklearn.preprocessing import Normalizer

In [13]:
# Normalizer works per sample (row), not per feature: each row is rescaled to
# unit norm. Because Volume is many orders of magnitude larger than the price
# columns, it dominates every row and ends up close to 1.
scaler = Normalizer()
normalized_x = scaler.fit_transform(X)
normalized_x[:5]



array([[8.312e-07, 8.397e-07, 8.168e-07, 1.000e+00],
       [1.229e-06, 1.252e-06, 1.229e-06, 1.000e+00],
       [1.246e-06, 1.256e-06, 1.225e-06, 1.000e+00],
       [1.342e-06, 1.360e-06, 1.336e-06, 1.000e+00],
       [9.928e-07, 1.012e-06, 9.857e-07, 1.000e+00]])

***BINARIZING THE DATA***

In [14]:
from sklearn.preprocessing import Binarizer

In [15]:
# Keep the fitted transformer under its own lowercase name. Binding it to
# `Binarizer` would overwrite the imported class, so running this cell a
# second time would fail by trying to call an instance as if it were the class.
binarizer = Binarizer(threshold=0.0).fit(X)

# Values strictly greater than the threshold map to 1, all others to 0.
# With a threshold of 0.0 and strictly positive prices/volumes every entry is 1.
binary_x = binarizer.transform(X)
binary_x[0:5, :]

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

***MEAN REMOVAL***

In [17]:
from sklearn.preprocessing import StandardScaler

In [19]:
# Centre each feature column on zero without touching its spread:
# with_std=False turns off the division by the standard deviation.
scaler = StandardScaler(with_std=False, with_mean=True)
scaler.fit(X)
scaled_x = scaler.transform(X)
scaled_x[:5]

array([[-1.567e+01, -1.616e+01, -1.527e+01, -1.916e+07],
       [-1.579e+01, -1.611e+01, -1.505e+01, -2.691e+07],
       [-1.562e+01, -1.616e+01, -1.522e+01, -2.700e+07],
       [-1.588e+01, -1.631e+01, -1.522e+01, -2.832e+07],
       [-1.569e+01, -1.600e+01, -1.509e+01, -2.303e+07]])

***ONE HOT ENCODING***

In [20]:
from sklearn.preprocessing import OneHotEncoder


In [25]:
# Fit a one-hot encoder on the feature matrix and display the fitted estimator.
# NOTE(review): X holds continuous prices and volumes, so the encoder will
# treat every distinct value in a column as its own category — confirm this is
# intended as an API demonstration rather than a modelling step.
encoder = OneHotEncoder().fit(X)
encoder


***LABEL ENCODING***

In [26]:
from sklearn.preprocessing import LabelEncoder

In [28]:
# Company names serve as the example class labels.
input_classes = ['Apple', 'Intel', 'Microsoft', 'Google', 'Tesla']

# Learn the label -> integer mapping, then encode the same list with it.
label_encoder = LabelEncoder()
label_encoder.fit(input_classes)
label_encoder.transform(input_classes)

array([0, 2, 3, 1, 4])

In [29]:
# List the learned mapping: the position of a class in classes_ is its code.
for code, company in enumerate(label_encoder.classes_):
    print(company, '-->', code)

Apple --> 0
Google --> 1
Intel --> 2
Microsoft --> 3
Tesla --> 4


In [31]:
# Encode a subset of the known labels with the already-fitted encoder.
labels = ['Apple', 'Intel', 'Microsoft']
encoded_labels = label_encoder.transform(labels)
encoded_labels

array([0, 2, 3])

In [32]:
# Round trip: encode the labels to integer codes, then map the codes back to names.
round_trip_codes = label_encoder.transform(labels)
label_encoder.inverse_transform(round_trip_codes)

array(['Apple', 'Intel', 'Microsoft'], dtype='<U9')

***DictVectorizer***

In [33]:
from sklearn.feature_extraction import DictVectorizer

In [38]:
# One record whose keys are company names; each key becomes a feature column.
companies = [
    {
        'Apple': 180.25,
        'Intel': 45.30,
        'Microsoft': 30.26,
        'Google': 203.75,
        'Tesla': 302.18,
    }
]
vec = DictVectorizer()
company_matrix = vec.fit_transform(companies)
# Convert the vectorizer's result to a dense array for display.
company_matrix.toarray()

array([[180.25, 203.75,  45.3 ,  30.26, 302.18]])

In [39]:
# Column order of the vectorized matrix, as assigned by the fitted vectorizer.
feature_names = vec.get_feature_names_out()
feature_names

array(['Apple', 'Google', 'Intel', 'Microsoft', 'Tesla'], dtype=object)

***POLYNOMIAL FEATURES***

In [40]:
from sklearn.preprocessing import PolynomialFeatures

In [41]:
# Expand the four features with a bias column, the original terms, and all
# degree-2 products (squares and pairwise interactions).
poly = PolynomialFeatures(degree=2)
poly_x = poly.fit_transform(X)
poly_x

array([[1.000e+00, 1.966e+01, 1.986e+01, ..., 3.733e+02, 4.570e+08,
        5.594e+14],
       [1.000e+00, 1.954e+01, 1.991e+01, ..., 3.818e+02, 3.107e+08,
        2.529e+14],
       [1.000e+00, 1.971e+01, 1.986e+01, ..., 3.752e+02, 3.063e+08,
        2.501e+14],
       ...,
       [1.000e+00, 1.255e+02, 1.273e+02, ..., 1.564e+04, 3.126e+09,
        6.249e+14],
       [1.000e+00, 1.244e+02, 1.262e+02, ..., 1.495e+04, 4.025e+09,
        1.084e+15],
       [1.000e+00, 1.236e+02, 1.241e+02, ..., 1.497e+04, 3.732e+09,
        9.303e+14]], shape=(4528, 15))

In [42]:
# Same degree-2 expansion, but keep only products of distinct features
# (no squared terms).
poly = PolynomialFeatures(degree=2, interaction_only=True)
interaction_x = poly.fit_transform(X)
interaction_x

array([[1.000e+00, 1.966e+01, 1.986e+01, ..., 3.837e+02, 4.697e+08,
        4.570e+08],
       [1.000e+00, 1.954e+01, 1.991e+01, ..., 3.890e+02, 3.166e+08,
        3.107e+08],
       [1.000e+00, 1.971e+01, 1.986e+01, ..., 3.847e+02, 3.141e+08,
        3.063e+08],
       ...,
       [1.000e+00, 1.255e+02, 1.273e+02, ..., 1.592e+04, 3.182e+09,
        3.126e+09],
       [1.000e+00, 1.244e+02, 1.262e+02, ..., 1.543e+04, 4.154e+09,
        4.025e+09],
       [1.000e+00, 1.236e+02, 1.241e+02, ..., 1.518e+04, 3.785e+09,
        3.732e+09]], shape=(4528, 11))

***IMPUTER***

In [45]:
from sklearn.impute import SimpleImputer

In [47]:
# SimpleImputer with default settings fills missing entries using a per-column
# statistic learned in fit.
# NOTE(review): the printed rows match the original feature values, so X
# appears to contain no missing entries at this point — confirm.
imputer = SimpleImputer()
imputed_x = imputer.fit_transform(X, y)
print(imputed_x)

[[1.966e+01 1.986e+01 1.932e+01 2.365e+07]
 [1.954e+01 1.991e+01 1.954e+01 1.590e+07]
 [1.971e+01 1.986e+01 1.937e+01 1.581e+07]
 ...
 [1.255e+02 1.273e+02 1.251e+02 2.500e+07]
 [1.244e+02 1.262e+02 1.223e+02 3.292e+07]
 [1.236e+02 1.241e+02 1.223e+02 3.050e+07]]
