In [1]:
import csv
import sys
import time
from itertools import combinations

import numpy as np
import pandas as pd
from pgmpy.estimators import BayesianEstimator
from pgmpy.estimators import BDeuScore
from pgmpy.estimators import K2Score
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from pgmpy.models import BayesianModel
from pgmpy.models import BayesianNetwork

In [2]:
def preprocess_data(ticker):
    """Turn one ticker's price history into a 3-level "next move" label.

    Reads ``ohlc_data_{ticker}.csv`` from the working directory, drops the
    ``Open``, ``High``, ``Low``, ``Volume``, ``Dividends``, ``Stock Splits``
    and ``Date`` columns, and replaces ``Close`` with a single categorical
    column named after ``ticker``.  Each row is labelled from the change
    between that row's close and the following row's close, measured against
    the median close ``m`` (computed without the first two and last two
    remaining rows):

    * ``-1`` -- change <= -0.008 * m
    * ``0``  -- -0.008 * m < change <= 0.003 * m
    * ``1``  -- change > 0.003 * m

    The last row of the file has no following close and is therefore dropped,
    as is any other row containing a missing value.

    Side effects:
        The labelled frame is stored in the notebook-level ``results`` dict
        under ``ticker`` and appended to the notebook-level
        ``all_data_frames`` list.  Both must exist before this is called.

    Args:
        ticker: Ticker symbol; used for the CSV file name and the output
            column name.

    Returns:
        The labelled DataFrame (index named ``day``).
    """
    data = pd.read_csv(f'ohlc_data_{ticker}.csv')  # Read CSV for current ticker

    columns_to_remove = ['Open', 'High', 'Low', 'Volume', 'Dividends', 'Stock Splits', 'Date']
    data = data.drop(columns=columns_to_remove).rename_axis('day')

    # shift(-1) pulls the *following* row's close up onto the current row.
    data['next_close'] = data['Close'].shift(-1)
    # The final row has no following close (NaN), so dropna() removes it
    # together with any other incomplete row.
    data = data.dropna()

    # Median of the 'Close' column (excluding first two and last two values)
    median_close = data['Close'].iloc[2:-2].median()

    # Asymmetric thresholds, expressed as fractions of the median close.
    down_threshold = -median_close * 0.008
    up_threshold = median_close * 0.003
    bins = [float('-inf'), down_threshold, up_threshold, float('inf')]

    # pd.cut uses right-closed intervals by default.
    data[ticker] = pd.cut(
        data['next_close'] - data['Close'],
        bins=bins,
        labels=[-1, 0, 1]
    )
    data = data.drop(columns=['Close', 'next_close'])

    # Keep registering the frame in the notebook-level containers so existing
    # cells that read them continue to work.
    results[ticker] = data
    all_data_frames.append(data)
    return data

In [3]:
# Start from empty containers on every run so re-executing this cell never
# accumulates frames from a previous execution.
results = {}  # Dictionary to store results for each ticker
all_data_frames = []

# List of ticker symbols; preprocess_data reads ohlc_data_<ticker>.csv for each.
ticker_symbols = [
    'AAPL', 'GOOGL', 'MSFT', 'ADBE', 'AMZN', 'BA', 'BRK-B', 'CSCO', 'HD', 'HDFCBANK.NS',
    'IBM', 'INFY.NS', 'INTC', 'JNJ', 'JPM', 'KO', 'MA', 'MCD', 'META', 'NFLX',
    'NVDA', 'PEP', 'PG', 'PYPL', 'RELIANCE.NS', 'T', 'TCS.NS', 'TSLA', 'V', 'WMT',
]

# preprocess_data fills `results` and `all_data_frames` as a side effect.
for ticker in ticker_symbols:
    preprocess_data(ticker)

# Place the per-ticker frames side by side, aligned on the shared 'day' index.
final = pd.concat(all_data_frames, axis=1)
print(final.head())

    AAPL GOOGL MSFT ADBE AMZN BA BRK-B CSCO HD HDFCBANK.NS  ... NVDA PEP PG  \
day                                                         ...               
0      1     1    1    1    0  1     1    1  1          -1  ...    1   1  1   
1      1     1    1    1    1  1     1    1  1           1  ...    1   0  1   
2      1     1    1    1   -1 -1     1    0  0           1  ...   -1   0  0   
3     -1    -1   -1    0    0 -1    -1   -1 -1           0  ...   -1   0 -1   
4      1     1    0    0    1  0     0    1  1           0  ...    1   0  0   

    PYPL RELIANCE.NS  T TCS.NS TSLA  V WMT  
day                                         
0      1           0  0     -1   -1  1   0  
1      1           1  1      1    1  1   0  
2      1           1  0      1   -1  1   0  
3     -1           1  0      1   -1 -1  -1  
4      1          -1  0     -1   -1  0   0  

[5 rows x 30 columns]


In [4]:
# Hand-written toy structure: two directed edges, GOOGL -> AAPL and HD -> AAPL.
data = [('GOOGL', 'AAPL'), ('HD', 'AAPL')]
# Order the edges by their source node before building the network.
sorted_data = sorted(data, key=lambda edge: edge[0])
model = BayesianNetwork(sorted_data)

In [5]:
# Estimate the conditional distribution of AAPL given its parents in `model`
# from the labelled data, using a K2 prior.
cpd1 = BayesianEstimator(model, final).estimate_cpd('AAPL', prior_type="K2")
print(cpd1)

+----------+-----------+-----------+-----+----------+----------+---------------------+
| GOOGL    | GOOGL(-1) | GOOGL(-1) | ... | GOOGL(1) | GOOGL(1) | GOOGL(1)            |
+----------+-----------+-----------+-----+----------+----------+---------------------+
| HD       | HD(-1)    | HD(0)     | ... | HD(-1)   | HD(0)    | HD(1)               |
+----------+-----------+-----------+-----+----------+----------+---------------------+
| AAPL(-1) | 0.5       | 0.3       | ... | 0.125    | 0.0625   | 0.03571428571428571 |
+----------+-----------+-----------+-----+----------+----------+---------------------+
| AAPL(0)  | 0.4       | 0.5       | ... | 0.5      | 0.375    | 0.2857142857142857  |
+----------+-----------+-----------+-----+----------+----------+---------------------+
| AAPL(1)  | 0.1       | 0.2       | ... | 0.375    | 0.5625   | 0.6785714285714286  |
+----------+-----------+-----------+-----+----------+----------+---------------------+


In [6]:
def _candidate_parent_sets(data, max_parents=None):
    """Yield candidate parent lists: non-empty subsets by increasing size, then ``[]``.

    Each entry of ``data`` is itself a list of names; the names of the chosen
    entries are flattened into one list.  ``max_parents`` (optional) caps how
    many entries of ``data`` may be combined.
    """
    limit = len(data) if max_parents is None else min(len(data), max_parents)
    for size in range(1, limit + 1):
        for combo in combinations(data, size):
            yield [name for entry in combo for name in entry]
    # The empty parent set is evaluated last, as before.
    yield []


def _best_parent_set(scorer, data, ticker, max_parents=None):
    """Return ``(parents, score)`` maximising ``scorer.local_score(ticker, parents)``.

    Ties keep the first candidate encountered.  Returns
    ``(None, float('-inf'))`` if no candidate compares greater than minus
    infinity (e.g. every score is NaN).
    """
    best_parents, best_score = None, float('-inf')
    for parents in _candidate_parent_sets(data, max_parents):
        score = scorer.local_score(ticker, parents)
        if score > best_score:
            best_parents, best_score = parents, score
    return best_parents, best_score


def _report_best(label, parents, score):
    """Print the outcome of a parent-set search for the score named ``label``."""
    if parents is not None:
        print(f"Best combination according to {label} Score :", parents)
        print(f"Best Score according to {label} Score : ", score)
    else:
        print("No best combination found.")


def k2_score(data, final, ticker, max_parents=None):
    """Exhaustively search for the parent set of ``ticker`` with the best K2 local score.

    Args:
        data: List of candidate entries, each itself a list of column names
            (e.g. ``[['GOOGL'], ['MSFT']]``).
        final: DataFrame handed to ``K2Score``; must contain ``ticker`` and
            every candidate column.
        ticker: Name of the child variable being scored.
        max_parents: Optional upper bound on how many entries of ``data`` may
            be combined.  ``None`` (default) tries every subset, which is
            2**len(data) score evaluations.

    Returns:
        ``(best_parents, best_score)``.  ``best_parents`` is a flat list of
        names (possibly empty), or ``None`` if no candidate could be ranked.
    """
    combi, result = _best_parent_set(K2Score(final), data, ticker, max_parents)
    _report_best("K2", combi, result)
    return combi, result


def bdeu_score(data, final, ticker, max_parents=None):
    """Exhaustively search for the parent set of ``ticker`` with the best BDeu local score.

    Same contract as ``k2_score`` but scored with ``BDeuScore(final)``.

    Args:
        data: List of candidate entries, each itself a list of column names.
        final: DataFrame handed to ``BDeuScore``.
        ticker: Name of the child variable being scored.
        max_parents: Optional upper bound on how many entries of ``data`` may
            be combined; ``None`` (default) tries every subset.

    Returns:
        ``(best_parents, best_score)``; ``best_parents`` is ``None`` if no
        candidate could be ranked.
    """
    combi, result = _best_parent_set(BDeuScore(final), data, ticker, max_parents)
    _report_best("BDeu", combi, result)
    return combi, result

In [7]:
def create_k2score_df(data, final, ticker):
    """Tabulate which candidates the K2 search picked as parents of ``ticker``.

    Runs ``k2_score(data, final, ticker)`` and returns a DataFrame with a
    ``Ticker`` column holding the first name of every entry in ``data``.
    When the search yields a parent list, a second column named
    ``Parents(K2)<ticker>`` holds 1 for the selected names and 0 otherwise;
    when it yields ``None`` only the ``Ticker`` column is returned.
    """
    best_parents, _best_score = k2_score(data, final, ticker)

    # Only the first name of each candidate entry is listed.
    candidate_names = []
    for entry in data:
        candidate_names.append(entry[0])
    score_df = pd.DataFrame({'Ticker': candidate_names})

    if best_parents is None:
        return score_df

    def is_selected(name):
        return 1 if name in best_parents else 0

    indicator_column = f'Parents(K2){ticker}'
    score_df[indicator_column] = score_df['Ticker'].apply(is_selected)
    return score_df


In [None]:
# perf_counter is a monotonic clock, which is what an elapsed-time
# measurement needs.
start_time = time.perf_counter()

# [13 - tickers]
data = [['GOOGL'], ['MSFT'], ['ADBE'], ['AMZN'], ['BA'], ['BRK-B'], ['CSCO'], ['HD'], ['HDFCBANK.NS'], ['IBM'],
        ['INFY.NS'], ['INTC'], ['JNJ']]

# Collect the per-ticker frames and concatenate once after the loop instead
# of re-copying a growing DataFrame on every iteration.
score_frames = []

# Loop through each ticker in data.
# NOTE: every iteration scores all subsets of the remaining candidates, so the
# run time grows exponentially with len(data).
for ticker in data:
    ticker_name = ticker[0]

    # Remove the current ticker from data
    data_without_current_ticker = [t for t in data if t != ticker]

    # Create score_df excluding the current ticker
    score_df = create_k2score_df(data_without_current_ticker, final, ticker_name)
    print(score_df)
    score_frames.append(score_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
merged_df = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame()

elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Best combination according to K2 Score : ['MSFT', 'AMZN', 'BRK-B']
Best Score according to K2 Score :  -84.75919949748214
         Ticker  Parents(K2)GOOGL
0          MSFT                 1
1          ADBE                 0
2          AMZN                 1
3            BA                 0
4         BRK-B                 1
5          CSCO                 0
6            HD                 0
7   HDFCBANK.NS                 0
8           IBM                 0
9       INFY.NS                 0
10         INTC                 0
11          JNJ                 0
Best combination according to K2 Score : ['GOOGL', 'ADBE', 'BRK-B']
Best Score according to K2 Score :  -83.33231173939703
         Ticker  Parents(K2)MSFT
0         GOOGL                1
1          ADBE                1
2          AMZN                0
3            BA                0
4         BRK-B                1
5          CSCO                0
6            HD                0
7   HDFCBANK.NS                0
8           IBM 