Import all needed libraries and packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt   
from sklearn.model_selection import train_test_split, TimeSeriesSplit  # scikit-learn utilities for splitting the dataset into train and test sets
from sklearn.ensemble import RandomForestRegressor # random forest machine learning model
from sklearn.linear_model import LinearRegression # linear regression machine learning model
from sklearn.tree import DecisionTreeRegressor  # decision tree regressor machine learning model
from sklearn.preprocessing import LabelEncoder # maps the company names from strings to integers
from sklearn.metrics import mean_squared_error, r2_score # metrics for evaluating the models


Import the dataset, preprocess it, and prepare the input and output data for the models.
Note: the dataset was collected manually from a web API for the 10 most important KSA companies, covering the years 2023 to 2024.

In [2]:
# Read the semicolon-separated BankDS CSV file into a pandas DataFrame
csv_path = './BankDS.csv'
df = pd.read_csv(csv_path, sep=';')

In [3]:
# Show the dataset summary: column names, non-null counts, dtypes and row count
heading = "\nColumns in the dataset:"
print(heading)
df.info()



Columns in the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3228 entries, 0 to 3227
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   company  3228 non-null   object 
 1   date     3228 non-null   object 
 2   price    3228 non-null   float64
dtypes: float64(1), object(2)
memory usage: 75.8+ KB


In [4]:
# Separate the raw columns into model inputs (X) and the target (Y)
input_columns = ['company', 'date']
X = df[input_columns]
Y = df['price']

In [5]:
# Parse the 'date' strings into pandas datetimes. No explicit format is given,
# so pandas infers it from the values.
df['date'] = pd.to_datetime(df['date'])

# Replace the single date value with three numeric model inputs
# (year, month, day), each read from the datetime accessor.
date_accessor = df['date'].dt
for part in ('year', 'month', 'day'):
    df[part] = getattr(date_accessor, part)




In [7]:
# Report the dimensions of the original inputs (company, date) and of the target,
# i.e. the X and Y selected before the date was expanded into separate columns
for label, data in (("\nFeatures shape:", X), ("Target shape:", Y)):
    print(label, data.shape)


Features shape: (3228, 2)
Target shape: (3228,)


In [8]:

# Map each company name to an integer label and display the resulting column
le = LabelEncoder()
company_codes = le.fit_transform(df['company'])
df['company_encoded'] = company_codes
df['company_encoded']

0       11
1       11
2       11
3       11
4       11
        ..
3223     6
3224     6
3225     6
3226     6
3227     6
Name: company_encoded, Length: 3228, dtype: int32

In [9]:
# Assemble the model inputs (encoded company plus the date parts) and the price target
features = ['company_encoded', 'year', 'month', 'day']
X, y = df[features], df['price']
print(X)

      company_encoded  year  month  day
0                  11  2023     10   22
1                  11  2023     10   23
2                  11  2023     10   24
3                  11  2023     10   25
4                  11  2023     10   26
...               ...   ...    ...  ...
3223                6  2024     10   14
3224                6  2024     10   15
3225                6  2024     10   16
3226                6  2024     10   17
3227                6  2024     10   20

[3228 rows x 4 columns]


In [10]:
# Split the data into training and testing sets with 80% for training and 20% for testing.
# random_state pins the shuffle so that Restart & Run All reproduces the same split
# (and therefore the same scores); 42 matches the seed used for the random forest.
# NOTE(review): rows are shuffled before splitting, so test dates are interleaved with
# training dates of the same company. If the goal is to forecast future prices, a
# chronological hold-out (e.g. the already-imported TimeSeriesSplit) is probably the
# fairer evaluation -- confirm the intended use.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)


Create the Models and train them 

In [26]:
# Build the three regressors: random forest, linear regression and decision tree
rf_model = RandomForestRegressor(n_estimators=1000, random_state=42)
LR_model = LinearRegression()
DS_model = DecisionTreeRegressor(random_state=0)

# Fit every model on the same training split
for estimator in (rf_model, LR_model, DS_model):
    estimator.fit(X_train, y_train)

# Keep the fitted decision tree as the cell's displayed value
DS_model


Test the models and compare the results

In [27]:
# Predict the test-set prices with each fitted model, in the order
# random forest, linear regression, decision tree
RF_y_pred, LR_y_pred, DS_y_pred = (
    fitted.predict(X_test) for fitted in (rf_model, LR_model, DS_model)
)

In [28]:
def _mse_and_r2(predictions):
    """Return (mean squared error, R-squared score) of `predictions` against y_test."""
    return mean_squared_error(y_test, predictions), r2_score(y_test, predictions)


# Random Forest
mse_RF, r2_RF = _mse_and_r2(RF_y_pred)

# Linear regression
mse_LR, r2_LR = _mse_and_r2(LR_y_pred)

# Decision tree
mse_DS, r2_DS = _mse_and_r2(DS_y_pred)

In [29]:
# Print the mean squared error and the R-squared score of every model
report_lines = [
    f"Mean Squared Error for Random Forest: {mse_RF}",
    f"R-squared Score for Random Forest: {r2_RF}",
    f"Mean Squared Error for linear Regressor: {mse_LR}",
    f"R-squared Score for linear Regressor: {r2_LR}",
    f"Mean Squared Error for decission tree Regressor: {mse_DS}",
    f"R-squared Score for decission tree Regressor: {r2_DS}",
]
print("\n".join(report_lines))

Mean Squared Error for Random Forest: 0.5704197098627883
R-squared Score for Random Forest: 0.9990793298521712
Mean Squared Error for linear Regressor: 612.5721541978577
R-squared Score for linear Regressor: 0.011294865850953295
Mean Squared Error for decission tree Regressor: 0.6228578463622291
R-squared Score for decission tree Regressor: 0.9989946935290427
