In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Data Loading

In [2]:
# Location of the raw daily BTC OHLCV CSV. The path is relative, so it is
# resolved against the kernel's current working directory.
FILE_PATH = r"../../data/raw/btc_daily_ohlcv.csv"

print("Loading data from :", FILE_PATH)

try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError as exc:
    # Fail fast: falling back to `df = None` would only defer the failure to
    # the next cell, where it shows up as an unrelated-looking AttributeError.
    # The absolute path is included to show where the lookup actually happened.
    raise FileNotFoundError(
        f"Error: {FILE_PATH} not found (resolved to {os.path.abspath(FILE_PATH)}). "
        "Please ensure the file exists."
    ) from exc
else:
    # Only reached when the read succeeded.
    print("Data loaded successfully")

Loading data from : ../../data/raw/btc_daily_ohlcv.csv
Data loaded successfully


### Data Profiling

In [3]:
# Structural overview of the loaded frame: its (rows, columns) shape and
# the column labels.
print("Data Profile")

for profile_label, profile_value in (
    ("data shape:", df.shape),
    ("Columns names:", df.columns),
):
    print(profile_label, profile_value)

Data Profile
data shape: (4017, 8)
Columns names: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits'],
      dtype='object')


*   **Date**: This column represents the specific calendar day to which the data in that row pertains. In financial data, it's crucial for time-series analysis.
*   **Open**: This is the price of the asset (Bitcoin, in this case) at the very **beginning** of the trading period (which is a day for daily data). It's the first traded price of that day.
*   **High**: This is the **highest price** that the asset reached at any point during that specific trading day.
*   **Low**: This is the **lowest price** that the asset reached at any point during that specific trading day.
*   **Close**: This is the price of the asset at the **end** of the trading period. It's often considered the most important price of the day, as it reflects the final market sentiment and value at the close of trading.
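*   **Volume**: This indicates the **total amount** of the asset that was traded during that specific day. Note that the values in this dataset (e.g., 21,056,800 and 34,483,200 on the first two days) exceed Bitcoin's 21 million maximum supply, so the column cannot be a count of Bitcoins; it is most likely denominated in the quote currency (USD) — confirm against the data source. Higher volume often suggests stronger market interest or conviction behind price movements.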
*   **Dividends**: For cryptocurrencies like Bitcoin, this column will almost always be **zero**. Dividends are distributions of a company's profits to its shareholders, which is not applicable to how cryptocurrencies operate.
*   **Stock Splits**: Similar to dividends, this column will also almost always be **zero** for cryptocurrencies. A stock split is an action by a company to increase the number of its outstanding shares, which is a corporate action not relevant to Bitcoin.

In [4]:
# Eyeball both ends of the frame: the first and last five rows.
print("\n--- Initial Data Inspection ---")

for section_title, section_rows in (
    ("\nFirst 5 rows:", df.head()),
    ("\nLast 5 rows:", df.tail()),
):
    # Title and rows each end up on their own line, as with two separate prints.
    print(section_title, section_rows, sep="\n")




--- Initial Data Inspection ---

First 5 rows:
                        Date        Open        High         Low       Close  \
0  2014-09-17 00:00:00+00:00  465.864014  468.174011  452.421997  457.334015   
1  2014-09-18 00:00:00+00:00  456.859985  456.859985  413.104004  424.440002   
2  2014-09-19 00:00:00+00:00  424.102997  427.834991  384.532013  394.795990   
3  2014-09-20 00:00:00+00:00  394.673004  423.295990  389.882996  408.903992   
4  2014-09-21 00:00:00+00:00  408.084991  412.425995  393.181000  398.821014   

     Volume  Dividends  Stock Splits  
0  21056800        0.0           0.0  
1  34483200        0.0           0.0  
2  37919700        0.0           0.0  
3  36863600        0.0           0.0  
4  26580100        0.0           0.0  

Last 5 rows:
                           Date           Open           High            Low  \
4012  2025-09-11 00:00:00+00:00  113961.429688  115522.546875  113453.835938   
4013  2025-09-12 00:00:00+00:00  115507.789062  116769.382812  

In [5]:
# Schema overview: per-column dtype, non-null count and memory usage.
# `info()` writes its report directly, so it is not wrapped in print().
print("\nDataFrame Info:")
df.info()

# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
numeric_summary = df.describe()
print("\nDescriptive Statistics:", numeric_summary, sep="\n")


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4017 entries, 0 to 4016
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          4017 non-null   object 
 1   Open          4017 non-null   float64
 2   High          4017 non-null   float64
 3   Low           4017 non-null   float64
 4   Close         4017 non-null   float64
 5   Volume        4017 non-null   int64  
 6   Dividends     4017 non-null   float64
 7   Stock Splits  4017 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 251.2+ KB

Descriptive Statistics:
                Open           High            Low          Close  \
count    4017.000000    4017.000000    4017.000000    4017.000000   
mean    24826.040893   25339.492531   24291.176263   24853.617069   
std     29483.447014   30010.084760   28941.810973   29514.768075   
min       176.897003     211.731003     171.509995     178.102997   
25%      2493

In [6]:
# Summarize the two corporate-action columns on their own to see whether
# they carry any non-zero values.
corporate_action_cols = ["Dividends", "Stock Splits"]
df.loc[:, corporate_action_cols].describe()

Unnamed: 0,Dividends,Stock Splits
count,4017.0,4017.0
mean,0.0,0.0
std,0.0,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,0.0,0.0


### Checking for Null Values

In [7]:
# Number of missing (NaN/None) entries in each column.
missing_values = df.isnull().sum()

# Put the header on its own line via `sep`. With print's default separator a
# space is emitted after an embedded "\n", which pushes the first row of the
# table out of alignment with the rest.
print("Missing Values", missing_values, sep="\n")

Missing Values
 Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
dtype: int64


In [8]:
# Normalize the column labels by trimming leading/trailing whitespace.
columns_name = []
for raw_label in df.columns:
    columns_name.append(raw_label.strip())

# Write the cleaned labels back onto the frame (stripping twice is harmless,
# so re-running this cell gives the same result).
df.columns = columns_name

# Show the resulting labels as the cell output.
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits'],
      dtype='object')