In [71]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime, timedelta
import requests
from pycoingecko import CoinGeckoAPI
import time



In [72]:
from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()
print("Coins",cg)

Coins <pycoingecko.api.CoinGeckoAPI object at 0x3140c4410>


In [73]:
# Get top 5 coins
top_coins = cg.get_coins_markets(vs_currency='usd', order='market_cap_desc', per_page=5, page=1)
# print("Top_coins",top_coins)
##extract top 5 coins ids
top_coins_id=[coin['id'] for coin in top_coins]
print("top coins ID",top_coins_id)

top coins ID ['bitcoin', 'ethereum', 'ripple', 'tether', 'binancecoin']


In [74]:
##function to  extract data of bitcoins
def fetch_ohlcv(coin_id,days=90):
    data=cg.data = cg.get_coin_market_chart_by_id(
        id=coin_id, 
        vs_currency='usd', 
        days=days, 
        interval='daily'
    )
    prices=data['prices']
    df = pd.DataFrame(prices, columns=['timestamp', 'price'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df.set_index('timestamp', inplace=True)
    return df
    

In [75]:
#loop through coins
ohlcv_data={}
for coin_id in top_coins_id:
    df=fetch_ohlcv(coin_id)
    ohlcv_data[coin_id]=df
    time.sleep(1.2)


    

In [76]:
ohlcv_data['bitcoin'].head()

Unnamed: 0_level_0,price
timestamp,Unnamed: 1_level_1
2025-04-14,83600.820101
2025-04-15,84523.452491
2025-04-16,83656.492489
2025-04-17,84105.779422
2025-04-18,84930.908576


In [77]:
##feature engineering 
import ta
def add_price_features(df):
    df=df.copy()
    df['returns']=df['price'].pct_change()
    ##moving averages
    df['ma7']=df['price'].rolling(window=7).mean()
    df['ma14']=df['price'].rolling(window=14).mean()
    #volatility (3 day rolling std deviation of returns)
    df['volatility']=df['returns'].rolling(window=3).std()
     # RSI using `ta` lib (relative strength index)
    df['rsi'] = ta.momentum.RSIIndicator(close=df['price'], window=14).rsi()

    # Drop missing values after rolling windows
    df = df.dropna()

    return df



In [78]:
##apply to all coins in ohlcv_data
price_feature_data={}
for coin_id,df in ohlcv_data.items():
    price_feature_data[coin_id]=add_price_features(df)

print("price features",price_feature_data['bitcoin'])

price features                              price   returns            ma7           ma14  \
timestamp                                                                    
2025-04-27 00:00:00   94644.066371 -0.001362   91856.689034   88098.192067   
2025-04-28 00:00:00   93809.337820 -0.008820   93104.713659   88827.371904   
2025-04-29 00:00:00   95030.606455  0.013019   94187.365011   89577.882901   
2025-04-30 00:00:00   94256.359463 -0.008147   94284.535522   90335.016256   
2025-05-01 00:00:00   94235.753310 -0.000219   94374.578522   91058.585820   
...                            ...       ...            ...            ...   
2025-07-09 00:00:00  108953.191877  0.006025  108736.302266  107923.569650   
2025-07-10 00:00:00  111327.530542  0.021792  109093.886025  108215.641086   
2025-07-11 00:00:00  115879.650301  0.040889  109990.663948  108851.043783   
2025-07-12 00:00:00  117571.025100  0.014596  111352.111542  109600.480175   
2025-07-12 11:32:14  118050.024003  0.004074  112

In [79]:
##fetch crypto news data from newsAPI.org
# STEP 3: News Sentiment Feature Engineering
# ==========================================

# 1. Install Required Libraries (if not already)
# !pip install newsapi-python vaderSentiment

from newsapi import NewsApiClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from datetime import datetime, timedelta

# 2. Initialize NewsAPI and VADER
newsapi = NewsApiClient(api_key='cc568be6adde437fb3a5ecf239779a87')  # Replace with your key
analyzer = SentimentIntensityAnalyzer()

# 3. Define Function to Fetch and Analyze Sentiment
def fetch_news_sentiment(query, from_date, to_date):
    print(f"📰 Fetching news from {from_date} to {to_date}...")
    try:
        articles = newsapi.get_everything(
            q=query,
            from_param=from_date,
            to=to_date,
            language='en',
            sort_by='relevancy',
            page_size=100,
        )
    except Exception as e:
        print("❌ Error fetching news:", e)
        return pd.DataFrame(columns=['sentiment'])

    scores = []

    for article in articles['articles']:
        title = article['title']
        date = pd.to_datetime(article['publishedAt']).date()
        
        sentiment = analyzer.polarity_scores(title)['compound']
        scores.append((date, sentiment))

    df = pd.DataFrame(scores, columns=['date', 'sentiment'])

    if not df.empty:
        return df.groupby('date').mean()  # Daily average sentiment
    else:
        return pd.DataFrame(columns=['sentiment'])

# 4. Set Date Range (last 90 days)
to_date = datetime.utcnow().date()
from_date = to_date - timedelta(days=25)

# 5. Run Sentiment Fetcher
sentiment_df = fetch_news_sentiment(
    query="crypto OR bitcoin OR ethereum",
    from_date=from_date.isoformat(),
    to_date=to_date.isoformat()
)

# 6. View Sample Output
print("✅ Sample sentiment data:")
print(sentiment_df.tail())



  to_date = datetime.utcnow().date()


📰 Fetching news from 2025-06-17 to 2025-07-12...
✅ Sample sentiment data:
            sentiment
date                 
2025-07-07   0.090050
2025-07-08   0.000450
2025-07-09   0.100657
2025-07-10   0.080380
2025-07-11  -0.121950


In [80]:
def merge_features(price_df,sentiment_df):
    df=price_df.copy()
    df['date']=df.index.date
    merged=df.merge(sentiment_df,left_on='date',right_index=True,how='left')
    merged['sentiment'].fillna(0, inplace=True)
    merged.drop(columns=['date'])
    return merged




#apply to all coins
merged_featured_data={}
for coin_id,df in price_feature_data.items():
     merged_featured_data[coin_id] = merge_features(df, sentiment_df)

merged_featured_data['bitcoin'].tail()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['sentiment'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['sentiment'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

Unnamed: 0_level_0,price,returns,ma7,ma14,volatility,rsi,date,sentiment
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-07-09 00:00:00,108953.191877,0.006025,108736.302266,107923.56965,0.009372,56.781571,2025-07-09,0.100657
2025-07-10 00:00:00,111327.530542,0.021792,109093.886025,108215.641086,0.015088,62.786968,2025-07-10,0.08038
2025-07-11 00:00:00,115879.650301,0.040889,109990.663948,108851.043783,0.017459,71.083166,2025-07-11,-0.12195
2025-07-12 00:00:00,117571.0251,0.014596,111352.111542,109600.480175,0.013588,73.45146,2025-07-12,0.0
2025-07-12 11:32:14,118050.024003,0.004074,112756.762329,110366.082927,0.018962,74.098439,2025-07-12,0.0


In [81]:
##add labels to dataframe
def add_volatility_label(df, threshold='median'):
    df = df.copy()

    # Shift volatility to represent "next day volatility"
    df['volatility_next_day'] = df['volatility'].shift(-1)

    # Calculate threshold
    if threshold == 'median':
        vol_threshold = df['volatility_next_day'].median()
    elif isinstance(threshold, float):
        vol_threshold = threshold
    else:
        raise ValueError("Threshold should be 'median' or a float.")

    # Create binary label
    df['high_volatility'] = (df['volatility_next_day'] > vol_threshold).astype(int)

    df.dropna(inplace=True)  # Remove last row (because of shift)
    return df


In [82]:
##apply to all coins
labelled_data={}
for coin_id,df in merged_featured_data.items():
    labelled_data[coin_id]=add_volatility_label(df)
    print(f"✅ Labels added for {coin_id}",labelled_data)

✅ Labels added for bitcoin {'bitcoin':                     price   returns            ma7           ma14  volatility  \
timestamp                                                                       
2025-04-27   94644.066371 -0.001362   91856.689034   88098.192067    0.005524   
2025-04-28   93809.337820 -0.008820   93104.713659   88827.371904    0.009260   
2025-04-29   95030.606455  0.013019   94187.365011   89577.882901    0.011100   
2025-04-30   94256.359463 -0.008147   94284.535522   90335.016256    0.012419   
2025-05-01   94235.753310 -0.000219   94374.578522   91058.585820    0.010693   
...                   ...       ...            ...            ...         ...   
2025-07-08  108300.716758 -0.008373  108259.189104  107710.918038    0.008824   
2025-07-09  108953.191877  0.006025  108736.302266  107923.569650    0.009372   
2025-07-10  111327.530542  0.021792  109093.886025  108215.641086    0.015088   
2025-07-11  115879.650301  0.040889  109990.663948  108851.043783    0

In [83]:
# ##training function
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# def train_volatility_model(df):
#     df=df.copy()
#     features=["returns","ma7","ma14","volatility","rsi","sentiment"]
              
#     X=df[features]
#     y=df['high_volatility']
#     X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,shuffle=False)
    
#     model=RandomForestClassifier(n_estimators=100,random_state=42)
#     model.fit(X_train,y_train)

#     #Evaluate the model
#     y_pred=model.predict(X_test)
#     ##print classification report
#     report=classification_report(y_pred,y_test)
#     print("report",report)
#     ##print accuracy report
#     accuracy=accuracy_score(y_pred,y_test)
#     print("accuracy_score",accuracy)
#     return model

In [84]:
##training function
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

def train_volatility_model_with_grid_search(df):
    df=df.copy()
    features=["returns","ma7","ma14","volatility","rsi","sentiment"]
              
    X=df[features]
    y=df['high_volatility']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,shuffle=False)

    param_grid={
        "n_estimators":[50,100,150],
        "max_depth":[3,5,10,None],
        "min_samples_split": [2, 5],
    }

    
    grid=GridSearchCV(estimator=RandomForestClassifier(random_state=42),param_grid=param_grid,cv=3,verbose=1,n_jobs=-1,scoring="f1",)
    grid.fit(X_train,y_train)

    best_model = grid.best_estimator_
    y_pred=best_model.predict(X_test)
    
    ##print classification report
    report=classification_report(y_test,y_pred)
    print("report",report)
    ##print accuracy report
    accuracy=accuracy_score(y_pred,y_test)
    print("accuracy_score",accuracy)
    return best_model

In [85]:
##train the model for one coin
# btc_model=train_volatility_model(labelled_data['bitcoin'])
# print("btc_model",btc_model)
import joblib

##Train for all top 5 coins
models = {}
for coin_id,df in labelled_data.items():
    models['coin_id']=train_volatility_model_with_grid_search(df)
    joblib.dump(model, f"models/{coin_id}_model.pkl")
    

print("MODELS",models)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
report               precision    recall  f1-score   support

           0       0.67      1.00      0.80         8
           1       1.00      0.50      0.67         8

    accuracy                           0.75        16
   macro avg       0.83      0.75      0.73        16
weighted avg       0.83      0.75      0.73        16

accuracy_score 0.75
Fitting 3 folds for each of 24 candidates, totalling 72 fits
report               precision    recall  f1-score   support

           0       0.67      0.67      0.67         6
           1       0.80      0.80      0.80        10

    accuracy                           0.75        16
   macro avg       0.73      0.73      0.73        16
weighted avg       0.75      0.75      0.75        16

accuracy_score 0.75
Fitting 3 folds for each of 24 candidates, totalling 72 fits
report               precision    recall  f1-score   support

           0       0.75      0.30      0.43    

In [86]:
import os

# Create folder if not exists
os.makedirs("data", exist_ok=True)

# Save each coin’s feature dataframe
for coin_id, df in labelled_data.items():
    df.to_csv(f"data/{coin_id}_features.csv")

print("✅ Feature CSVs saved to /data/")


✅ Feature CSVs saved to /data/


In [87]:
import joblib
os.makedirs("models", exist_ok=True)

for coin_id, model in models.items():
    joblib.dump(model, f"models/{coin_id}_model.pkl")

print("✅ Models saved to /models/")


✅ Models saved to /models/
