<a href="https://colab.research.google.com/github/raphaelassoun23/Projet-Python/blob/main/notebooks/MonteCarloCrypto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#-----Installation des packages et importation des modules------
!pip install yfinance pandas

import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime




In [6]:
#-----Importation des données via API YahooFinance------

def get_crypto_yf(ticker, crypto_name):

    df_init = yf.download(ticker, period="max")
    df_init = df_init.reset_index()  # remettre la date comme colonne

    df_init.columns = [col[0] if isinstance(col, tuple) else col for col in df_init.columns]

    # Renommer les colonnes pour avoir un dataset propre
    df_init = df_init.rename(columns={
        "Date": "timestamp",
        "Open": f"{crypto_name}_open",
        "High": f"{crypto_name}_high",
        "Low": f"{crypto_name}_low",
        "Close": f"{crypto_name}_price",
        "Volume": f"{crypto_name}_volume"
    })

    # On garde uniquement les colonnes utiles
    df_init = df_init[["timestamp",
             f"{crypto_name}_price",
             f"{crypto_name}_volume",
             f"{crypto_name}_open",
             f"{crypto_name}_high",
             f"{crypto_name}_low"]]

    return df_init

# Récupération BTC & ETH

btc_df = get_crypto_yf("BTC-USD", "BTC")
eth_df = get_crypto_yf("ETH-USD", "ETH")

# Fusionner les deux cryptos dans un dataset sur la base temporelle commune

crypto_df = pd.merge(btc_df, eth_df, on='timestamp', how='inner')

#Taille du dataset
print(eth_df.shape)
print(eth_df.shape)
print(crypto_df.shape)

#Premiere et dernière date de chaque datset, bitcoin commence en 2014 et eth en 2017 > on reg a partir de 2017
print(btc_df['timestamp'].min(), btc_df['timestamp'].max())
print(eth_df['timestamp'].min(), eth_df['timestamp'].max())
print(crypto_df['timestamp'].min(), crypto_df['timestamp'].max())

#Afficher le dataset
print(crypto_df.head())


  df_init = yf.download(ticker, period="max")
[*********************100%***********************]  1 of 1 completed
  df_init = yf.download(ticker, period="max")
[*********************100%***********************]  1 of 1 completed

(2951, 6)
(2951, 6)
(2951, 11)
2014-09-17 00:00:00 2025-12-07 00:00:00
2017-11-09 00:00:00 2025-12-07 00:00:00
2017-11-09 00:00:00 2025-12-07 00:00:00
   timestamp    BTC_price  BTC_volume     BTC_open     BTC_high      BTC_low  \
0 2017-11-09  7143.580078  3226249984  7446.830078  7446.830078  7101.520020   
1 2017-11-10  6618.140137  5208249856  7173.729980  7312.000000  6436.870117   
2 2017-11-11  6357.600098  4908680192  6618.609863  6873.149902  6204.220215   
3 2017-11-12  5950.069824  8957349888  6295.450195  6625.049805  5519.009766   
4 2017-11-13  6559.490234  6263249920  5938.250000  6811.189941  5844.290039   

    ETH_price  ETH_volume    ETH_open    ETH_high     ETH_low  
0  320.884003   893249984  308.644989  329.451996  307.056000  
1  299.252991   885985984  320.670990  324.717987  294.541992  
2  314.681000   842300992  298.585999  319.453003  298.191986  
3  307.907990  1613479936  314.690002  319.153015  298.513000  
4  316.716003  1041889984  307.024994  328.41500




In [7]:
#------Nettoyage des données------

df = crypto_df.copy()

# Vérification et suppression des doublons
print("Doublons avant nettoyage :", df.duplicated(subset='timestamp').sum())
df = df.drop_duplicates(subset='timestamp')

# Vérification des valeurs manquantes
print("Valeurs manquantes avant interpolation :\n", df.isna().sum())

# Choix de l'interpolation linéaire pour prix et volume (et OHLC) (ici pas vrmt besoin car 0 valeurs manquantes)
cols_to_interpolate = [
    'BTC_price', 'BTC_volume', 'BTC_open', 'BTC_high', 'BTC_low',
    'ETH_price', 'ETH_volume', 'ETH_open', 'ETH_high', 'ETH_low'
]
df[cols_to_interpolate] = df[cols_to_interpolate].interpolate(method='linear')

# Conversion du timestamp en datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Vérification finale
print("Valeurs manquantes après interpolation :\n", df.isna().sum())
print(df.info())
print(df.head())

Doublons avant nettoyage : 0
Valeurs manquantes avant interpolation :
 timestamp     0
BTC_price     0
BTC_volume    0
BTC_open      0
BTC_high      0
BTC_low       0
ETH_price     0
ETH_volume    0
ETH_open      0
ETH_high      0
ETH_low       0
dtype: int64
Valeurs manquantes après interpolation :
 timestamp     0
BTC_price     0
BTC_volume    0
BTC_open      0
BTC_high      0
BTC_low       0
ETH_price     0
ETH_volume    0
ETH_open      0
ETH_high      0
ETH_low       0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2951 entries, 0 to 2950
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   timestamp   2951 non-null   datetime64[ns]
 1   BTC_price   2951 non-null   float64       
 2   BTC_volume  2951 non-null   int64         
 3   BTC_open    2951 non-null   float64       
 4   BTC_high    2951 non-null   float64       
 5   BTC_low     2951 non-null   float64       
 6   ETH_price   2

In [8]:
#-----Création de nouvelles varaibles pour enrichir notre dataset et répondre à la problématique-----


cryptos = ['BTC', 'ETH']

for crypto in cryptos:
    # Rendement journalier logarithmique
    df[f'{crypto}_return_daily'] = np.log(df[f'{crypto}_price'] / df[f'{crypto}_price'].shift(1))

    # Volatilité rolling 7 jours
    df[f'{crypto}_volatility_7d'] = df[f'{crypto}_return_daily'].rolling(window=7).std()

    # Volatilité rolling 30 jours
    df[f'{crypto}_volatility_30d'] = df[f'{crypto}_return_daily'].rolling(window=30).std()

    # Moyenne mobile 7 jours
    df[f'{crypto}_moving_avg_7d'] = df[f'{crypto}_price'].rolling(window=7).mean()

    # Moyenne mobile 30 jours
    df[f'{crypto}_moving_avg_30d'] = df[f'{crypto}_price'].rolling(window=30).mean()

    # Range journalier (High - Low)
    df[f'{crypto}_range_daily'] = df[f'{crypto}_high'] - df[f'{crypto}_low']

    # Variation journalière du volume
    df[f'{crypto}_volume_change'] = df[f'{crypto}_volume'].pct_change()


# Vérification des valeurs manquantes
print("Valeurs manquantes sur df:\n", df.isna().sum())

# Gérer tous les NaN du nouveau dataset
df_final = df.dropna().reset_index(drop=True)

# Vérification rapide des infos du dataset
print("Valeurs manquantes sur df_final :\n", df_final.isna().sum())
print(df_final.info())
df_final

Valeurs manquantes sur df:
 timestamp              0
BTC_price              0
BTC_volume             0
BTC_open               0
BTC_high               0
BTC_low                0
ETH_price              0
ETH_volume             0
ETH_open               0
ETH_high               0
ETH_low                0
BTC_return_daily       1
BTC_volatility_7d      7
BTC_volatility_30d    30
BTC_moving_avg_7d      6
BTC_moving_avg_30d    29
BTC_range_daily        0
BTC_volume_change      1
ETH_return_daily       1
ETH_volatility_7d      7
ETH_volatility_30d    30
ETH_moving_avg_7d      6
ETH_moving_avg_30d    29
ETH_range_daily        0
ETH_volume_change      1
dtype: int64
Valeurs manquantes sur df_final :
 timestamp             0
BTC_price             0
BTC_volume            0
BTC_open              0
BTC_high              0
BTC_low               0
ETH_price             0
ETH_volume            0
ETH_open              0
ETH_high              0
ETH_low               0
BTC_return_daily      0
BTC_volatil

Unnamed: 0,timestamp,BTC_price,BTC_volume,BTC_open,BTC_high,BTC_low,ETH_price,ETH_volume,ETH_open,ETH_high,...,BTC_moving_avg_30d,BTC_range_daily,BTC_volume_change,ETH_return_daily,ETH_volatility_7d,ETH_volatility_30d,ETH_moving_avg_7d,ETH_moving_avg_30d,ETH_range_daily,ETH_volume_change
0,2017-12-09,15178.200195,13911300096,16523.300781,16783.000000,13674.900391,473.502014,2003849984,457.343994,504.147003,...,9688.958301,3108.099609,-0.341820,0.037595,0.041395,0.048168,455.981005,405.169270,47.894012,-0.142327
1,2017-12-10,15455.400391,13433299968,15168.400391,15850.599609,13226.599609,441.721008,1404179968,472.789001,472.789001,...,9983.533643,2624.000000,-0.034361,-0.069478,0.049570,0.048150,452.533578,409.918204,43.274994,-0.299259
2,2017-12-11,16936.800781,12153900032,15427.400391,17513.900391,15404.799805,515.135986,1771440000,440.358002,516.968994,...,10336.173665,2109.100586,-0.095241,0.153752,0.079068,0.054236,458.952432,416.600037,77.864990,0.261548
3,2017-12-12,17415.400391,14603799552,16919.800781,17781.800781,16571.599609,651.431030,5179829760,522.286011,657.317993,...,10718.351351,1210.201172,0.201573,0.234741,0.113276,0.066776,485.831007,428.050805,152.824005,1.924079
4,2017-12-13,16408.199219,12976900096,17500.000000,17653.099609,16039.700195,702.767029,4524539904,644.906006,747.992981,...,11046.641650,1613.399414,-0.111402,0.075854,0.098609,0.067419,524.999438,440.919172,150.195007,-0.126508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2916,2025-12-03,93527.804688,77650204986,91345.093750,94060.773438,91056.390625,3191.571777,29949301036,2997.801514,3212.559814,...,94403.412500,3004.382812,-0.011415,0.062588,0.046202,0.039833,3002.906424,3124.577173,224.417725,0.126183
2917,2025-12-04,92141.625000,64538402681,93454.257812,94038.242188,90976.101562,3134.316406,27434991113,3188.343506,3238.555420,...,94088.449219,3062.140625,-0.168857,-0.018102,0.047074,0.036514,3020.017020,3119.301904,167.245361,-0.083952
2918,2025-12-05,89387.757812,63256398633,92133.648438,92702.640625,88152.140625,3024.432861,28000268228,3134.357422,3192.457031,...,93604.979948,4550.500000,-0.019864,-0.035688,0.049583,0.036172,3018.892508,3105.943962,202.625488,0.020604
2919,2025-12-06,89272.375000,37994042405,89389.359375,90267.460938,88951.664062,3040.207764,10962819760,3024.487549,3067.661377,...,93204.016146,1315.796875,-0.399364,0.005202,0.049261,0.035777,3025.824289,3096.875627,53.679443,-0.608474


Les valeurs manquantes présentes dans le dataset proviennent uniquement des transformations réalisées (retours, volatilités, moyennes mobiles). Ces NaN correspondent aux premières observations où les fenêtres de calcul ne sont pas encore complètes.
Ils ne représentent donc pas de véritables trous dans les données.
Leur suppression est justifiée, car elle ne retire qu’un très faible nombre d’observations et évite d’introduire un biais par une imputation artificielle
(par la moyenne,par la médiane ou par régression)