<a href="https://colab.research.google.com/github/praneeth776/Stock-Market-Prediction/blob/main/utils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository into the current working directory.
!git clone https://github.com/praneeth776/Stock-Market-Prediction.git

Cloning into 'Stock-Market-Prediction'...
remote: Enumerating objects: 115, done.[K
remote: Counting objects: 100% (115/115), done.[K
remote: Compressing objects: 100% (101/101), done.[K
remote: Total 115 (delta 19), reused 90 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (115/115), 4.87 MiB | 2.88 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import mean_squared_error, r2_score
from functools import reduce

ModuleNotFoundError: No module named 'pandas'

In [9]:
# Print the current working directory of the notebook's shell.
!pwd


/content


In [4]:
def load_all_test_sets(test_path, merge = True, n_sets = 5):
  '''
  Load the numbered test CSVs ``test_1.csv`` ... ``test_<n_sets>.csv`` from ``test_path``.

  The header of each file is expected to be
  Date--Open--High--Low--Close--Adjusted--Returns--Volume.
  For file ``i`` every non-date column is renamed with an ``_i`` suffix
  (``open_i``, ``high_i``, ``low_i``, ``close_i``, ``adjusted_i``,
  ``return_i``, ``volume_i``) and the rows are sorted by Date.

  Parameters
  ----------
  test_path : folder that contains the ``test_<i>.csv`` files.
  merge     : if True, outer-join all test sets side by side on Date and
              return one DataFrame; if False, return a list with one
              DataFrame per test set.
  n_sets    : number of test files to read (defaults to 5).

  In both modes the result is passed through ``add_indices`` before it is
  returned.

  Raises
  ------
  ValueError if ``n_sets`` is smaller than 1.
  '''
  if n_sets < 1:
    raise ValueError("n_sets must be at least 1")

  dfs = []
  for i in range(1, n_sets + 1):
    file_path = os.path.join(test_path, f"test_{i}.csv")
    df = pd.read_csv(file_path)

    df['Date'] = pd.to_datetime(df['Date'])

    # Suffix every column with the test-set number so the frames can be
    # joined side by side without name clashes.
    df = df.rename(columns={
        "Open": f"open_{i}",
        "High": f"high_{i}",
        "Low": f"low_{i}",
        "Close": f"close_{i}",
        "Adjusted": f"adjusted_{i}",
        "Returns": f"return_{i}",
        "Volume": f"volume_{i}"
    }).sort_values(by='Date')
    dfs.append(df)

  if merge:
    # Outer join keeps every date that appears in at least one test set.
    merged_df = reduce(lambda left, right: pd.merge(left, right, on="Date", how="outer"), dfs)
    return add_indices(merged_df)

  return [add_indices(df) for df in dfs]

def load_indices(data_path):
  '''
  Read the market-index CSVs found in ``data_path``.

  Returns a list with one DataFrame per index, in the order dj, nasdaq,
  SP500. Each frame has exactly two columns: ``Date`` (parsed to datetime)
  and ``returns_<key>``, which is the file's ``Returns`` column renamed.
  '''
  csv_by_index = {
    "dj": "Dow_Jones.csv",
    "nasdaq": "NASDAQ.csv",
    "SP500": "SP500.csv"
  }

  def read_returns(name, csv_name):
    # Load one index file and keep only its date and (renamed) return column.
    returns_col = f"returns_{name}"
    frame = pd.read_csv(os.path.join(data_path, csv_name))
    frame['Date'] = pd.to_datetime(frame['Date'])
    return frame.rename(columns={"Returns": returns_col})[['Date', returns_col]]

  return [read_returns(name, csv_name) for name, csv_name in csv_by_index.items()]

def add_indices(df, idx_path=None, how="outer"):
  '''
  Join the market-index return columns onto ``df`` by ``Date``.

  Parameters
  ----------
  df       : DataFrame with a ``Date`` column; it is not modified.
  idx_path : folder passed to ``load_indices``. Defaults to the project's
             Google Drive location used by this notebook.
  how      : join type forwarded to ``pd.merge``. The default "outer" keeps
             dates that exist only in the index files, which yields rows
             whose ``df`` columns are all NaN (e.g. the 2015-01-02 row in
             the test-set preview). Pass "left" to keep only ``df``'s rows.

  Returns
  -------
  A new DataFrame sorted by ``Date`` with a fresh 0..n-1 index.
  '''
  if idx_path is None:
    # Historical default: the indices folder on the mounted Drive.
    idx_path = "/content/drive/MyDrive/stock-price-prediction-challenge/train/indices"

  idx_df = load_indices(idx_path)
  # Fold the index frames onto df one after another, always joining on Date.
  merged_df = reduce(lambda left, right: pd.merge(left, right, on="Date", how=how), [df] + idx_df)
  return merged_df.sort_values('Date').reset_index(drop=True)



In [44]:
# Load the test sets merged side by side on Date (merge=True) and preview the first rows.
# NOTE(review): the path points into /content/drive — presumably Google Drive must be
# mounted first; no mount cell is visible in this notebook.
test_data = load_all_test_sets("/content/drive/MyDrive/stock-price-prediction-challenge/test", True)
test_data.head()

Unnamed: 0,Date,open_1,high_1,low_1,close_1,adjusted_1,return_1,volume_1,open_2,high_2,...,open_5,high_5,low_5,close_5,adjusted_5,return_5,volume_5,returns_dj,returns_nasdaq,returns_SP500
0,2015-01-02,,,,,,,,,,...,,,,,,,,,,
1,2015-01-05,55.889999,57.52,55.639999,56.990002,56.990002,0.014057,593600.0,136.585373,138.965851,...,50.060001,50.939999,49.57,49.830002,36.134575,-0.013267,182600.0,-0.01858,-0.015706,-0.018278
2,2015-01-06,57.330002,57.939999,56.790001,57.450001,57.450001,0.008072,825000.0,138.048782,138.370728,...,49.830002,50.369999,48.639999,49.110001,35.612473,-0.014449,192000.0,-0.007428,-0.012859,-0.008893
3,2015-01-07,57.970001,59.75,57.450001,59.57,59.57,0.036902,1053500.0,137.482925,140.458542,...,49.48,49.959999,49.119999,49.73,36.062061,0.012624,137300.0,0.012254,0.01257,0.01163
4,2015-01-08,60.0,61.669998,59.740002,61.439999,61.439999,0.031392,1289100.0,141.443909,143.16098,...,50.110001,51.240002,50.110001,50.59,36.685692,0.017293,496600.0,0.018388,0.018432,0.017888


In [46]:
# Root folder of the training data; load_train_data reads its "stocks" subfolder.
train_path = "/content/drive/MyDrive/stock-price-prediction-challenge/train"

def load_train_data(train_path):
  '''
  Load every stock CSV from ``<train_path>/stocks`` into one long DataFrame.

  Each file contributes its rows plus a ``ticker`` column holding the file
  name without the ``.csv`` extension (ticker = company name). The index
  returns are then attached by date through ``add_indices``.

  this seems to be the usual kaggle practice

  Returns
  -------
  DataFrame sorted by ``ticker`` then ``Date`` with a fresh 0..n-1 index.
  Rows that do not belong to any stock (no ticker) are removed.

  Raises
  ------
  ValueError if the stocks folder contains no ``.csv`` file.
  '''
  stocks_path = os.path.join(train_path, "stocks")

  # Load and concatenate all stocks
  stock_dfs = []
  # os.listdir returns entries in arbitrary order; sort for a reproducible load order.
  for filename in sorted(os.listdir(stocks_path)):
    # splitext only strips the final extension, unlike str.replace which
    # would remove every ".csv" occurrence inside the name.
    ticker, extension = os.path.splitext(filename)
    if extension != ".csv":
      continue

    df = pd.read_csv(os.path.join(stocks_path, filename))

    # Ensure Date is datetime and add ticker column
    df['Date'] = pd.to_datetime(df['Date'])
    df['ticker'] = ticker

    stock_dfs.append(df)

  if not stock_dfs:
    raise ValueError(f"No stock CSV files found in {stocks_path}")

  stocks_df = pd.concat(stock_dfs, ignore_index=True)
  stocks_df = add_indices(stocks_df)

  # add_indices outer-joins on Date by default, so dates that exist only in
  # the index files come back as rows with no stock data. Every real stock
  # row has a ticker, so dropping missing tickers removes exactly those rows.
  stocks_df = stocks_df.dropna(subset=['ticker'])

  return stocks_df.sort_values(by=['ticker', 'Date']).reset_index(drop=True)


In [47]:
# Build the combined training frame (all stocks plus index returns) and preview the first rows.
train_data = load_train_data(train_path)
train_data.head()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Adjusted,Returns,Volume,ticker,returns_dj,returns_nasdaq,returns_SP500
0,2015-01-05,AAPL,27.0725,27.1625,26.352501,26.5625,23.635283,-0.028172,257142000.0,AAPL,-0.01858,-0.015706,-0.018278
1,2015-01-06,AAPL,26.635,26.8575,26.157499,26.565001,23.637512,9.4e-05,263188400.0,AAPL,-0.007428,-0.012859,-0.008893
2,2015-01-07,AAPL,26.799999,27.049999,26.674999,26.9375,23.968964,0.014022,160423600.0,AAPL,0.012254,0.01257,0.01163
3,2015-01-08,AAPL,27.307501,28.0375,27.174999,27.9725,24.889904,0.038422,237458000.0,AAPL,0.018388,0.018432,0.017888
4,2015-01-09,AAPL,28.1675,28.3125,27.5525,28.002501,24.916595,0.001072,214798000.0,AAPL,-0.009521,-0.006782,-0.008404


In [None]:
class MinMaxScaler:
    """Min-max scaler for DataFrame columns.

    ``fit`` records the minimum and maximum of the selected columns and
    ``transform`` maps values with ``(x - min) / (max - min)``, so the data
    used for fitting lands in [0, 1]. ``columns`` may be a list of column
    names (statistics are per-column Series) or a single column name
    (statistics are scalars).
    """

    def __init__(self):
        # Statistics learned by fit(); both stay None until fit() is called.
        self.min = None
        self.max = None

    def fit(self, df, columns):
        """Store the minimum and maximum of ``df[columns]``."""
        self.min = df[columns].min()
        self.max = df[columns].max()

    def transform(self, df, columns):
        """Return ``df[columns]`` rescaled with the fitted min and max.

        A column whose fitted max equals its min is divided by 1 instead of
        0, so constant columns map to 0 rather than NaN/inf.
        """
        span = self.max - self.min
        if hasattr(span, "where"):
            # Several columns: per-column Series of ranges.
            span = span.where(span != 0, 1)
        elif span == 0:
            # Single column: scalar range.
            span = 1
        return (df[columns] - self.min) / span

    def fit_transform(self, df, columns):
        """Fit on ``df[columns]`` and return the transformed columns."""
        self.fit(df, columns)
        return self.transform(df, columns)


/content
drive  sample_data  Stock-Market-Prediction


In [None]:
class StandardScaler:
    """Z-score scaler for DataFrame columns.

    ``fit`` records the mean and the population standard deviation
    (``ddof=0``) of the selected columns; ``transform`` returns
    ``(x - mean) / std``. ``columns`` may be a list of column names
    (statistics are per-column Series) or a single column name
    (statistics are scalars).
    """

    def __init__(self):
        # Statistics learned by fit(); both stay None until fit() is called.
        self.means = None
        self.stds = None

    def fit(self, df, columns):
        """Store the mean and population std of ``df[columns]``."""
        self.means = df[columns].mean()
        self.stds = df[columns].std(ddof=0)  # population std

    def transform(self, df, columns):
        """Return ``df[columns]`` standardized with the fitted mean and std.

        A column whose fitted std is 0 is divided by 1 instead, so constant
        columns map to 0 rather than NaN/inf.
        """
        scale = self.stds
        if hasattr(scale, "where"):
            # Several columns: per-column Series of standard deviations.
            scale = scale.where(scale != 0, 1)
        elif scale == 0:
            # Single column: scalar standard deviation.
            scale = 1
        return (df[columns] - self.means) / scale

    def fit_transform(self, df, columns):
        """Fit on ``df[columns]`` and return the transformed columns."""
        self.fit(df, columns)
        return self.transform(df, columns)


drive  sample_data  Stock-Market-Prediction
