## Problem Statement

      
   1) The goal of this project is to compare and evaluate three different AI models for Bitcoin price prediction using distinct algorithms.
   
   2) By training and testing these models on historical Bitcoin price data, we aim to assess their performance and identify the most effective model for predicting future Bitcoin prices.
   
   3) The recommended model will be selected based on its accuracy, robustness, and ability to capture the complex patterns and dynamics of Bitcoin price movements.


## Step 1: Data Preprocessing

1)  Import the necessary libraries

2)  Load the Bitcoin price data into a pandas DataFrame. Assuming you have a CSV file named "bitcoin_prices.csv" containing the     price data with columns "Date" and "Price"

3)  Preprocess the data by performing any necessary transformations. 

4)  Perform additional data transformations as needed, such as feature scaling or encoding

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from yahoofinancials import YahooFinancials
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
import pickle
from datetime import datetime

In [2]:
tickerSymbol = 'BTC-USD'
tickerData = yf.Ticker(tickerSymbol)
current_date = datetime.today().strftime('%Y-%m-%d')
tickerDf = tickerData.history(period='1d', start='2010-1-1', end=current_date)
df1 = pd.DataFrame(tickerDf)
df1.reset_index(inplace=True)
df1.to_csv('output1.csv')
df1

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2014-09-17 00:00:00+00:00,465.864014,468.174011,452.421997,457.334015,21056800,0.0,0.0
1,2014-09-18 00:00:00+00:00,456.859985,456.859985,413.104004,424.440002,34483200,0.0,0.0
2,2014-09-19 00:00:00+00:00,424.102997,427.834991,384.532013,394.795990,37919700,0.0,0.0
3,2014-09-20 00:00:00+00:00,394.673004,423.295990,389.882996,408.903992,36863600,0.0,0.0
4,2014-09-21 00:00:00+00:00,408.084991,412.425995,393.181000,398.821014,26580100,0.0,0.0
...,...,...,...,...,...,...,...,...
3198,2023-06-20 00:00:00+00:00,26841.664062,28388.968750,26668.791016,28327.488281,22211859147,0.0,0.0
3199,2023-06-21 00:00:00+00:00,28311.310547,30737.330078,28283.410156,30027.296875,33346760979,0.0,0.0
3200,2023-06-22 00:00:00+00:00,29995.935547,30495.998047,29679.158203,29912.281250,20653160491,0.0,0.0
3201,2023-06-23 00:00:00+00:00,29896.382812,31389.539062,29845.214844,30695.468750,24115570085,0.0,0.0


### Read from csv

In [4]:
df1 = pd.read_csv("output1.csv")
df1=pd.DataFrame(df1)
df1 = df1.drop('Unnamed: 0', axis=1)

df1


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2014-09-17 00:00:00+00:00,465.864014,468.174011,452.421997,457.334015,21056800,0.0,0.0
1,2014-09-18 00:00:00+00:00,456.859985,456.859985,413.104004,424.440002,34483200,0.0,0.0
2,2014-09-19 00:00:00+00:00,424.102997,427.834991,384.532013,394.795990,37919700,0.0,0.0
3,2014-09-20 00:00:00+00:00,394.673004,423.295990,389.882996,408.903992,36863600,0.0,0.0
4,2014-09-21 00:00:00+00:00,408.084991,412.425995,393.181000,398.821014,26580100,0.0,0.0
...,...,...,...,...,...,...,...,...
3198,2023-06-20 00:00:00+00:00,26841.664062,28388.968750,26668.791016,28327.488281,22211859147,0.0,0.0
3199,2023-06-21 00:00:00+00:00,28311.310547,30737.330078,28283.410156,30027.296875,33346760979,0.0,0.0
3200,2023-06-22 00:00:00+00:00,29995.935547,30495.998047,29679.158203,29912.281250,20653160491,0.0,0.0
3201,2023-06-23 00:00:00+00:00,29896.382812,31389.539062,29845.214844,30695.468750,24115570085,0.0,0.0


In [5]:
def clean_data(df):
    df = df.drop(['Dividends', 'Stock Splits'], axis=1)
    df = df.dropna()
    df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
    df['Date'] = pd.to_datetime(df['Date'])
    df['date'] = (df['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

#     df = df.drop('Date',axis=1 )
    return df

df1 = clean_data(df1)

print(df1)

           Date          Open          High           Low         Close   
0    2014-09-17    465.864014    468.174011    452.421997    457.334015  \
1    2014-09-18    456.859985    456.859985    413.104004    424.440002   
2    2014-09-19    424.102997    427.834991    384.532013    394.795990   
3    2014-09-20    394.673004    423.295990    389.882996    408.903992   
4    2014-09-21    408.084991    412.425995    393.181000    398.821014   
...         ...           ...           ...           ...           ...   
3198 2023-06-20  26841.664062  28388.968750  26668.791016  28327.488281   
3199 2023-06-21  28311.310547  30737.330078  28283.410156  30027.296875   
3200 2023-06-22  29995.935547  30495.998047  29679.158203  29912.281250   
3201 2023-06-23  29896.382812  31389.539062  29845.214844  30695.468750   
3202 2023-06-24  30708.738281  30804.148438  30290.146484  30548.695312   

           Volume        date  
0        21056800  1410912000  
1        34483200  1410998400  
2  

In [226]:


df1.sort_values(by="Date", inplace=True)

f = ['Open', 'High', 'Low', 'Close', 'Volume']
# Perform feature scaling using MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df1[f])

# Create a new DataFrame with scaled data
sdf = pd.DataFrame(scaled_data, columns=f, index=df1.index)
sdf['date']=df1['date']
sdf['Date1'] = df1['Date']

sdf
sdf.to_csv('sdf.csv')

In [227]:
sdf.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'date', 'Date1'], dtype='object')

##  Step 2: Build the Linear Regression Model

In [204]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score

In [205]:
sdf

Unnamed: 0,Open,High,Low,Close,Volume,date,Date1
0,0.004289,0.003739,0.004243,0.004144,0.000043,1410912000,2014-09-17
1,0.004155,0.003574,0.003649,0.003655,0.000081,1410998400,2014-09-18
2,0.003669,0.003151,0.003217,0.003216,0.000091,1411084800,2014-09-19
3,0.003232,0.003085,0.003298,0.003425,0.000088,1411171200,2014-09-20
4,0.003431,0.002927,0.003348,0.003275,0.000059,1411257600,2014-09-21
...,...,...,...,...,...,...,...
3157,0.407846,0.409913,0.403443,0.407244,0.058839,1683676800,2023-05-10
3158,0.407348,0.399695,0.401904,0.398029,0.047636,1683763200,2023-05-11
3159,0.397946,0.391437,0.388260,0.395124,0.055014,1683849600,2023-05-12
3160,0.395276,0.391070,0.400833,0.394813,0.028474,1683936000,2023-05-13


#### Split the data into input features (X) and target variable (y)

In [206]:
X = sdf.drop(["Close",'Date1'], axis=1)
y = sdf["Close"]

X

Unnamed: 0,Open,High,Low,Volume,date
0,0.004289,0.003739,0.004243,0.000043,1410912000
1,0.004155,0.003574,0.003649,0.000081,1410998400
2,0.003669,0.003151,0.003217,0.000091,1411084800
3,0.003232,0.003085,0.003298,0.000088,1411171200
4,0.003431,0.002927,0.003348,0.000059,1411257600
...,...,...,...,...,...
3157,0.407846,0.409913,0.403443,0.058839,1683676800
3158,0.407348,0.399695,0.401904,0.047636,1683763200
3159,0.397946,0.391437,0.388260,0.055014,1683849600
3160,0.395276,0.391070,0.400833,0.028474,1683936000


In [207]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Step 3: Build the Model

In [208]:
model = LinearRegression()
model.fit(X_train, y_train)

In [105]:
with open("modeltest.pkl", "wb") as f:
    pickle.dump(model, f)

In [209]:
X_test

Unnamed: 0,Open,High,Low,Volume,date
139,0.000899,0.000499,0.000800,0.000099,1422921600
864,0.011027,0.010385,0.011302,0.000180,1485561600
291,0.001245,0.000915,0.001317,0.000109,1436054400
1242,0.126805,0.129937,0.122699,0.022154,1518220800
599,0.004179,0.003612,0.004296,0.000098,1462665600
...,...,...,...,...,...
2590,0.951536,0.972889,0.958143,0.116204,1634688000
1192,0.233345,0.229399,0.176127,0.063232,1513900800
2748,0.657957,0.679750,0.668561,0.080222,1648339200
2525,0.695281,0.696819,0.687637,0.093375,1629072000


## Step 4: Make Predictions

In [210]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


Mean Squared Error: 1.9840388624600917e-05
R-squared Score: 0.9996422826059349


In [115]:
sdf

Unnamed: 0,date,Open,High,Low,Close,Volume,Date1
0,0.000000,0.004289,0.003739,0.004243,0.004144,0.000043,2014-09-17
1,0.000316,0.004155,0.003574,0.003649,0.003655,0.000081,2014-09-18
2,0.000633,0.003669,0.003151,0.003217,0.003216,0.000091,2014-09-19
3,0.000949,0.003232,0.003085,0.003298,0.003425,0.000088,2014-09-20
4,0.001265,0.003431,0.002927,0.003348,0.003275,0.000059,2014-09-21
...,...,...,...,...,...,...,...
3157,0.998735,0.407846,0.409913,0.403443,0.407244,0.058839,2023-05-10
3158,0.999051,0.407348,0.399695,0.401904,0.398029,0.047636,2023-05-11
3159,0.999367,0.397946,0.391437,0.388260,0.395124,0.055014,2023-05-12
3160,0.999684,0.395276,0.391070,0.400833,0.394813,0.028474,2023-05-13


In [155]:
d= pd.DataFrame({
    'Date': ['2014-09-17']
})


d['Date'] = pd.to_datetime(d['Date']).dt.strftime('%Y-%m-%d')
d['Date'] = pd.to_datetime(d['Date'])

# Convert the dates to Unix timestamps
d['date'] = (d['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
d= d.drop('Date', axis=1)

d['Open'] = 0.004289
d['High']=0.003739
d['Low']=0.004243
d['Volume']=0.000043
x = model.predict(d)
min_close = df1["Close"].min()
max_close = df1["Close"].max()
pred = x * (max_close - min_close) + min_close
pred


array([4.27822251e+10])

In [178]:
sdf

Unnamed: 0,date,Open,High,Low,Close,Volume,Date1
0,0.000000,0.004289,0.003739,0.004243,0.004144,0.000043,2014-09-17
1,0.000316,0.004155,0.003574,0.003649,0.003655,0.000081,2014-09-18
2,0.000633,0.003669,0.003151,0.003217,0.003216,0.000091,2014-09-19
3,0.000949,0.003232,0.003085,0.003298,0.003425,0.000088,2014-09-20
4,0.001265,0.003431,0.002927,0.003348,0.003275,0.000059,2014-09-21
...,...,...,...,...,...,...,...
3157,0.998735,0.407846,0.409913,0.403443,0.407244,0.058839,2023-05-10
3158,0.999051,0.407348,0.399695,0.401904,0.398029,0.047636,2023-05-11
3159,0.999367,0.397946,0.391437,0.388260,0.395124,0.055014,2023-05-12
3160,0.999684,0.395276,0.391070,0.400833,0.394813,0.028474,2023-05-13


In [211]:
Date = pd.to_datetime("2024-05-14")
print(Date)
same_month_dates = sdf[sdf['Date1'].dt.month == Date.month]['Date1']
previous_dates = same_month_dates[same_month_dates < Date]
previous_date = previous_dates.max()
predata = sdf[(sdf['Date1'] == previous_date)].reset_index(drop=True)
print(predata)

# Create a new DataFrame 'h' with the target date and the previous date's data
h = pd.DataFrame()
h['Open'] = predata['Open']
h['High'] = predata['High']
h['Low'] = predata['Low']
h['Volume'] = predata['Volume']
h.loc[0, 'Date'] = Date

h['Date'] = pd.to_datetime(h['Date']).dt.strftime('%Y-%m-%d')
h['Date'] = pd.to_datetime(h['Date'])

# Convert the dates to Unix timestamps
h['date'] = (h['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')


# Drop the original 'Date' and 'DayOfYear' columns
# h.drop(['Date', 'DayOfYear'], axis=1, inplace=True)


h





2024-05-14 00:00:00
       Open      High       Low     Close    Volume        date      Date1
0  0.394997  0.392827  0.400085  0.396988  0.028519  1684022400 2023-05-14


Unnamed: 0,Open,High,Low,Volume,Date,date
0,0.394997,0.392827,0.400085,0.028519,2024-05-14,1715644800


In [212]:
prediction = model.predict(h[['Open', 'High', 'Low', 'Volume', 'date']])
prediction

array([0.39694426])

In [213]:
min_close = df1["Close"].min()
max_close = df1["Close"].max()
pred = prediction * (max_close - min_close) + min_close
pred


array([26927.67051051])

In [108]:
df["Close"].max()

67566.828125

In [214]:
min_close = df1["Close"].min()
max_close = df1["Close"].max()
pre= model.predict(X)
pred = pre * (max_close - min_close) + min_close
df["Predicted_Close"] = pred
print(df[["Close", "Predicted_Close"]])


             Close  Predicted_Close
0       457.334015       450.793945
1       424.440002       420.446273
2       394.795990       393.903011
3       408.903992       409.308993
4       398.821014       394.244749
...            ...              ...
3157  27621.755859     27649.735518
3158  27000.789062     26970.037639
3159  26804.990234     26227.842734
3160  26784.078125     26836.320672
3161  26930.638672     26924.159795

[3162 rows x 2 columns]


In [215]:
y_pred_train = model.predict(X_train)
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

y_pred_test = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

print("Train Mean Squared Error:", mse_train)
print("Train R-squared Score:", r2_train)
print("Test Mean Squared Error:", mse_test)
print("Test R-squared Score:", r2_test)


Train Mean Squared Error: 2.4649962348429817e-05
Train R-squared Score: 0.99956670662197
Test Mean Squared Error: 1.9840388624600917e-05
Test R-squared Score: 0.9996422826059349


## Model 2 - Decision Tree Regressor

## Import the required libraries

In [239]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

## Step 1: Data Preprocessing

In [240]:
btc_data = pd.read_csv('sdf.csv')


# Preprocess the data
X = btc_data.drop(["Close",'Date1'], axis=1)  # Input features
y = btc_data['Close']  # Target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Step 2: Build the Model

In [241]:
model2 = DecisionTreeRegressor()
model2.fit(X_train, y_train)

## Step 3: Train the Model

In [242]:
y_pred = model2.predict(X_test)

## Step 4: Make Predictions

In [243]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")
r2 = r2_score(y_test, y_pred)
print("R-squared Score:", r2)

Root Mean Squared Error: 0.008999536690497694
R-squared Score: 0.9985397400053689


## Model 2- Random Forest Regressor

## Import the required libraries

In [225]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


## Step 1: Data Preprocessing

In [228]:
df = pd.read_csv('sdf.csv')

# # Preprocess the data
# df["Date"] = pd.to_datetime(df["Date"])
# df.set_index("Date", inplace=True)

# Split the data into training and testing sets
train_size = int(len(df) * 0.8)
train_data = df[:train_size]
test_data = df[train_size:]

In [229]:
sdf

Unnamed: 0,Open,High,Low,Close,Volume,date,Date1
0,0.004289,0.003739,0.004243,0.004144,0.000043,1410912000,2014-09-17
1,0.004155,0.003574,0.003649,0.003655,0.000081,1410998400,2014-09-18
2,0.003669,0.003151,0.003217,0.003216,0.000091,1411084800,2014-09-19
3,0.003232,0.003085,0.003298,0.003425,0.000088,1411171200,2014-09-20
4,0.003431,0.002927,0.003348,0.003275,0.000059,1411257600,2014-09-21
...,...,...,...,...,...,...,...
3157,0.407846,0.409913,0.403443,0.407244,0.058839,1683676800,2023-05-10
3158,0.407348,0.399695,0.401904,0.398029,0.047636,1683763200,2023-05-11
3159,0.397946,0.391437,0.388260,0.395124,0.055014,1683849600,2023-05-12
3160,0.395276,0.391070,0.400833,0.394813,0.028474,1683936000,2023-05-13


In [233]:
X_train, y_train = train_data.drop(["Close",'Date1'], axis=1), train_data["Close"]
X_test, y_test = test_data.drop(["Close",'Date1'], axis=1), test_data["Close"]

## Step 2: Build the Model


In [234]:
model1 = RandomForestRegressor(n_estimators=100, random_state=42)

model1.fit(X_train, y_train)


## Step 3: Train the Model


In [235]:
y_pred = model1.predict(X_test)

## Step 4: Make Predictions

In [236]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")
r2 = r2_score(y_test, y_pred)
print("R-squared Score:", r2)

Root Mean Squared Error: 0.03450308870056785
R-squared Score: 0.9707809597720849


In [237]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
btc_data = pd.read_csv('sdf.csv')


# Preprocess the data
X = btc_data.drop(["Close",'Date1'], axis=1)  # Input features
y = btc_data['Close']  # Target variable

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model 1: Linear Regression
model1 = LinearRegression()
model1.fit(X_train_scaled, y_train)

# Model 2: Decision Tree Regression
model2 = DecisionTreeRegressor()
model2.fit(X_train_scaled, y_train)

# Model 3: Random Forest Regression
model3 = RandomForestRegressor()
model3.fit(X_train_scaled, y_train)

# Function to evaluate a model's performance
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return mae, mse, r2

# Evaluate the models
mae1, mse1, r2_1 = evaluate_model(model1, X_test_scaled, y_test)
mae2, mse2, r2_2 = evaluate_model(model2, X_test_scaled, y_test)
mae3, mse3, r2_3 = evaluate_model(model3, X_test_scaled, y_test)

# Print evaluation results
print("Model 1 - Linear Regression:")
print("MAE:", mae1)
print("MSE:", mse1)
print("R2 Score:", r2_1)
print()
print("Model 2 - Decision Tree Regression:")
print("MAE:", mae2)
print("MSE:", mse2)
print("R2 Score:", r2_2)
print()
print("Model 3 - Random Forest Regression:")
print("MAE:", mae3)
print("MSE:", mse3)
print("R2 Score:", r2_3)

# Make a final model recommendation based on the evaluation results
if r2_1 > r2_2 and r2_1 > r2_3:
    final_model = model1
    print("Final Model Recommendation: Linear Regression")
elif r2_2 > r2_1 and r2_2 > r2_3:
    final_model = model2
    print("Final Model Recommendation: Decision Tree Regression")
else:
    final_model = model3
    print("Final Model Recommendation: Random Forest Regression")


Model 1 - Linear Regression:
MAE: 0.0021284990625795816
MSE: 1.9853710397420103e-05
R2 Score: 0.9996420424176026

Model 2 - Decision Tree Regression:
MAE: 0.0038997971566094103
MSE: 7.76165719213627e-05
R2 Score: 0.9986005920363098

Model 3 - Random Forest Regression:
MAE: 0.0031102557035612983
MSE: 4.232434120035859e-05
R2 Score: 0.9992369023951002
Final Model Recommendation: Linear Regression
