In [1]:
import pandas as pd
import yfinance as yahooFinance
import os
import pandas_ta as ta  # Technical Analysis Library
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle 
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
import csv


In [2]:

symbol_df = pd.read_csv('../data/nifty500list.csv')

In [3]:

for i in range(len(symbol_df)):
    
    GetFacebookInformation = yahooFinance.Ticker(f'{symbol_df["Stock"][i]}.NS')
    df = GetFacebookInformation.history(period="max")
    df.to_csv(f'../data/stock_data/{symbol_df["Stock"][i]}.csv')

In [4]:


# Directory containing CSV files
directory = "../data/stock_data"

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):  # Check if the file is a CSV
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)  # Read CSV into a DataFrame
        df['Date'] = pd.to_datetime(df['Date'])
        df['Day'] = df['Date'].dt.day_name()
        df = df.loc[:, ~df.columns.str.startswith("Unnamed")]
        df.to_csv(f'../data/stock_data/{filename.split(".")[0]}.csv',index=False)


In [5]:
accuracies = {}

# === Load Stock Data ===
directory = "../data/stock_data"

# Loop through all files in the directory
for filename in tqdm(os.listdir(directory)):
    if filename.endswith(".csv"):  # Check if the file is a CSV
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)
        df = df[df['Date'] < '2025-02-07 00:00:00+05:30']

        try:
            # Ensure Date is a proper datetime type
            df['Date'] = pd.to_datetime(df['Date'])
            df.set_index('Date', inplace=True)


            # Trend Indicators
            df['SMA_10'] = df.ta.sma(length=10)  # Simple Moving Average
            df['SMA_20'] = df.ta.sma(length=20)
            df['SMA_50'] = df.ta.sma(length=50)
            df['SMA_100'] = df.ta.sma(length=100)
            # df['SMA_200'] = df.ta.sma(length=200)
            df['EMA_5'] = df.ta.ema(length=5)
            df['EMA_9'] = df.ta.ema(length=9)  # Exponential Moving Average
            df['EMA_20'] = df.ta.ema(length=20)
            df['EMA_50'] = df.ta.ema(length=50)
            df['EMA_100'] = df.ta.ema(length=100)
            # df['EMA_200'] = df.ta.ema(length=200)
            
            df['ADX'] = df.ta.adx(length=14)['ADX_14']
            
            df['A/D'] = df.ta.ad()

            df['RSI'] = df.ta.rsi(length=14)  # Relative Strength Index
            
            df[['StochK_14_3_3_14', 'StochD_14_3_3_14']] = df.ta.stoch()  # Stochastic Oscillator
            df[['StochK_3_3_14_14', 'StochD_3_3_14_14']] = df.ta.stoch(k=3,d=3,smooth_k=14)
            
            df['WilliamsR'] = df.ta.willr()  # Williams %R

            # Volume-Based Indicators
            df['OBV'] = df.ta.obv()  # On-Balance Volume
            df['VWAP'] = df.ta.vwap()  # Volume Weighted Average Price

            # Volatility Indicators
            df['ATR'] = df.ta.atr()  # Average True Range
            df['StdDev'] = df.ta.stdev(length=20)  # Standard Deviation
            
            df = df[df['Day'] == 'Friday']

            # === Define Weekly Trading Signals (Target Variable) ===
            df['Signal'] = np.where(df['Close'].shift(-1) > (df['Close'] + df['Close']*0.03), 1, 0)  # 1 = Buy, 0 = Sell

            # Drop NaN values that may have been introduced
            df.dropna(inplace=True)

            # === Prepare Data for XGBoost ===
            features = df.drop(columns=['Signal','Dividends', 'Stock Splits','Day'])
            target = df['Signal']

            X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
            
            scaler = StandardScaler()
            std_model = scaler.fit(X_train)
            
            pickle.dump(std_model, open(f'../data/standard_scaler/{filename.split(".")[0]}.pkl', "wb"))
            
            X_train_scaled = pd.DataFrame(std_model.transform(X_train), columns=X_train.columns, index=X_train.index)
            X_test_scaled = pd.DataFrame(std_model.transform(X_test), columns=X_test.columns, index=X_test.index)

            # Convert Data to XGBoost DMatrix Format
            dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
            dtest = xgb.DMatrix(X_test_scaled, label=y_test)

            # === Define XGBoost Parameters ===
            params = {
                'objective': 'binary:logistic',  # Binary classification (0 or 1)
                'eval_metric': 'logloss',
                'max_depth': 10,
                'learning_rate': 0.2,
                'n_estimators': 1000,
                'subsample': 0.8,
                'colsample_bytree': 0.8
            }

            # === Train XGBoost Model ===
            model = xgb.train(params, dtrain, num_boost_round=200)

            # === Make Predictions ===
            y_pred = model.predict(dtest)
            y_pred_binary = np.where(y_pred > 0.5, 1, 0)  # Convert probability to 0 (Sell) / 1 (Buy)

            # === Evaluate Accuracy ===
            accuracy = accuracy_score(y_test, y_pred_binary)
            accuracies[f'{filename.split(".")[0]}'] = accuracy

            # === Save Weekly Predictions ===
            df['Predicted_Signal'] = model.predict(xgb.DMatrix(features))
            df['Predicted_Signal'] = np.where(df['Predicted_Signal'] > 0.5, 1, 0)

            df.to_csv(f'../data/pred_data/pred_{filename.split(".")[0]}.csv',index=False)

            # save
            pickle.dump(model, open(f'../data/model/xgb_{filename.split(".")[0]}.pkl', "wb"))
        except Exception as e:
            print(f'Error in {filename}: {e}')

 14%|█▎        | 68/497 [00:22<03:08,  2.27it/s]

Error in BERGEPAINT.csv: Input X contains infinity or a value too large for dtype('float64').


100%|██████████| 497/497 [03:08<00:00,  2.63it/s]


In [6]:

header = ['Stock', 'accuracy']
with open('../data/accuracies.csv', 'w') as csv_file:  
    writer = csv.writer(csv_file)
    writer.writerow(header)
    for key, value in accuracies.items():
       writer.writerow([key, value])

In [7]:

for i in range(len(symbol_df)):
    
    GetFacebookInformation = yahooFinance.Ticker(f'{symbol_df["Stock"][i]}.NS')
    df = GetFacebookInformation.history(period="6mo")
    df.to_csv(f'../data/fut_data/{symbol_df["Stock"][i]}.csv')

In [8]:
directory = "../data/fut_data"
fut_pred = {}

# Loop through all files in the directory
for filename in tqdm(os.listdir(directory)):
    if filename.endswith(".csv"):  # Check if the file is a CSV
        file_path = os.path.join(directory, filename)
        
        input_data = pd.read_csv(file_path)
        # print(filename)
        
        

        # Ensure that 'Date' is a proper datetime type if it's included in the data
        input_data['Date'] = pd.to_datetime(input_data['Date'])
        
        # input_data = input_data[input_data['Date'] < '2025-02-01 00:00:00+05:30']
        # print(input_data)
        input_data.set_index('Date', inplace=True)

        input_data['SMA_10'] = input_data.ta.sma(length=10)  # Simple Moving Average
        input_data['SMA_20'] = input_data.ta.sma(length=20)
        input_data['SMA_50'] = input_data.ta.sma(length=50)
        input_data['SMA_100'] = input_data.ta.sma(length=100)
        # input_data['SMA_200'] = input_data.ta.sma(length=200)
        
        input_data['EMA_5'] = input_data.ta.ema(length=5)
        input_data['EMA_9'] = input_data.ta.ema(length=9)  # Exponential Moving Average
        input_data['EMA_20'] = input_data.ta.ema(length=20)
        input_data['EMA_50'] = input_data.ta.ema(length=50)
        input_data['EMA_100'] = input_data.ta.ema(length=100)
        # input_data['EMA_200'] = input_data.ta.ema(length=200)
        
        input_data['ADX'] = input_data.ta.adx(length=14)['ADX_14']
        
        input_data['A/D'] = input_data.ta.ad()

        input_data['RSI'] = input_data.ta.rsi(length=14)  # Relative Strength Index
        # input_data[['StochK', 'StochD']] = input_data.ta.stoch()  # Stochastic Oscillator
        input_data[['StochK_14_3_3_14', 'StochD_14_3_3_14']] = input_data.ta.stoch()  # Stochastic Oscillator
        input_data[['StochK_3_3_14_14', 'StochD_3_3_14_14']] = input_data.ta.stoch(k=3,d=3,smooth_k=14)
            
        input_data['WilliamsR'] = input_data.ta.willr()  # Williams %R

        # Volume-Based Indicators
        input_data['OBV'] = input_data.ta.obv()  # On-Balance Volume
        input_data['VWAP'] = input_data.ta.vwap()  # Volume Weighted Average Price

        # Volatility Indicators
        input_data['ATR'] = input_data.ta.atr()  # Average True Range
        input_data['StdDev'] = input_data.ta.stdev(length=20)  # Standard Deviation
        # print(input_data.head(5))

        features = input_data[input_data.index == '2025-02-07 00:00:00+05:30']
        features = features.drop(columns=['Dividends', 'Stock Splits'])
        std_model = pickle.load(open(f'../data/standard_scaler/{filename.split(".")[0]}.pkl', 'rb'))
        scaled_feat = pd.DataFrame(std_model.transform(features), columns=features.columns, index=features.index)

        # print(features)
        dinput = xgb.DMatrix(scaled_feat)
        loaded_model = pickle.load(open(f'../data/model/xgb_{filename.split(".")[0]}.pkl', 'rb'))
        predictions = loaded_model.predict(dinput)
        predictions_binary = np.where(predictions > 0.5, 1, 0)
        fut_pred[f'{filename.split(".")[0]}'] = [predictions_binary,predictions[0]]

100%|██████████| 497/497 [00:22<00:00, 22.52it/s]


In [9]:
weekly_prediction = pd.DataFrame(
    [(symbol, values[0][0], values[1]) for symbol, values in fut_pred.items()],
    columns=["Stock", "Signal", "Probability"]
)

In [10]:
acc = pd.read_csv('../data/accuracies.csv')

In [11]:
weekly_prediction = weekly_prediction.merge(acc, on='Stock', how='inner')

In [12]:
weekly_prediction.to_csv('../data/weekly_prediction(10feb-14feb).csv',index=False)

In [20]:
weekly_prediction[(weekly_prediction['Signal'] == 1)&(weekly_prediction['accuracy'] < 0.7)]

Unnamed: 0,Stock,Signal,Probability,accuracy
0,360ONE,1,0.856296,0.632653
5,ABB,1,0.761892,0.686636
32,APLAPOLLO,1,0.937256,0.66129
67,BEML,1,0.598544,0.562212
136,DOMS,1,0.79169,0.5
143,ELECON,1,0.675584,0.68
160,FSL,1,0.661067,0.613333
271,KEI,1,0.667993,0.636872
272,KFINTECH,1,0.792564,0.470588
276,KNRCON,1,0.690969,0.68125


- FSL
- KFINTECH
- METROPOLIS
- PRESTIGE
- TANLA
- TORNTPOWER
- UPL

In [None]:
success_stocks = []
for i in tqdm(range(len(symbol_df))):
    GetFacebookInformation = yahooFinance.Ticker(f'{symbol_df["Stock"][i]}.NS')
    df = GetFacebookInformation.history(period="1mo")
    df.reset_index(level=0,inplace=True)
    target_date = '2025-01-31 00:00:00+05:30'
    target_column = df[df['Date'] == target_date]
    df = df[df['Date'] > target_date].drop(['Open', 'High', 'Low', 'Volume', 'Dividends', 'Stock Splits'],axis=1)
    df['Target Close'] = target_column['Close'].values[0]
    df['Difference'] = df['Close'] - df['Target Close']
    df['Percentage'] = df['Difference'] / df['Target Close'] *100
    prediction_data = weekly_prediction[weekly_prediction['Stock'] == symbol_df["Stock"][i]]
    df['Signal'] = prediction_data['Signal'].values[0]
    df['Probability'] = prediction_data['Probability'].values[0]
    df['accuracy'] = prediction_data['accuracy'].values[0]
    if((df['Percentage'] > 3) &(df['Signal'] == 1)).any():
        success_stocks.append([symbol_df["Stock"][i], prediction_data['Probability'].values[0], prediction_data['accuracy'].values[0]])
    df.to_csv(f'../data/after_week_pred/{symbol_df["Stock"][i]}.csv',index=False)
    

In [None]:
failed_stocks = []
for i in tqdm(range(len(symbol_df))):
    GetFacebookInformation = yahooFinance.Ticker(f'{symbol_df["Stock"][i]}.NS')
    df = GetFacebookInformation.history(period="1mo")
    df.reset_index(level=0,inplace=True)
    target_date = '2025-01-31 00:00:00+05:30'
    target_column = df[df['Date'] == target_date]
    df = df[df['Date'] > target_date].drop(['Open', 'High', 'Low', 'Volume', 'Dividends', 'Stock Splits'],axis=1)
    df['Target Close'] = target_column['Close'].values[0]
    df['Difference'] = df['Close'] - df['Target Close']
    df['Percentage'] = df['Difference'] / df['Target Close'] *100
    prediction_data = weekly_prediction[weekly_prediction['Stock'] == symbol_df["Stock"][i]]
    df['Signal'] = prediction_data['Signal'].values[0]
    df['Probability'] = prediction_data['Probability'].values[0]
    df['accuracy'] = prediction_data['accuracy'].values[0]
    if((df['Percentage'] > 3) &(df['Signal'] == 0)).any():
        failed_stocks.append([symbol_df["Stock"][i], prediction_data['Probability'].values[0], prediction_data['accuracy'].values[0]])
    # df.to_csv(f'../data/after_week_pred/{symbol_df["Stock"][i]}.csv',index=False)