In [2]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Data Extraction
def resample_and_get_periods(csv_path, interval='5T'):
   """
   Load data, resample to 5 min OHLC, perform quality checks, and split into periods

   Parameters:
   - csv_path: path to CSV file
   - interval: resampling interval (default '5T' for 5 minutes)

   Returns: Dictionary of quality-checked, resampled dataframes for different periods
   """
   # Load data
   def load_and_clean(csv_path):
       df = pd.read_csv(csv_path)
       df['date'] = pd.to_datetime(df['date'])
       return df.set_index('date').sort_index()

   # Quality checks
   def quality_checks(df, period_name):
       issues = []

       # Check for missing values
       missing = df.isnull().sum()
       if missing.any():
           issues.append(f"Missing values found: {missing[missing > 0]}")

       # Check for duplicates
       duplicates = df.index.duplicated().sum()
       if duplicates:
           issues.append(f"Found {duplicates} duplicate timestamps")

       # Check for price anomalies
       price_std = df['close'].std()
       price_mean = df['close'].mean()
       outliers = df[abs(df['close'] - price_mean) > 3 * price_std]
       if not outliers.empty:
           issues.append(f"Found {len(outliers)} potential price outliers")

       # Check for gaps in time series
       time_diff = df.index.to_series().diff()
       gaps = time_diff[time_diff > pd.Timedelta(minutes=6)]  # More than 6 min gap
       if not gaps.empty:
           issues.append(f"Found {len(gaps)} time gaps > 6 minutes")

       # Print issues for this period
       if issues:
           print(f"\nQuality issues in {period_name}:")
           for issue in issues:
               print(f"- {issue}")

       return len(issues) == 0

   # Resample to 5 min OHLC
   def resample_ohlc(df, interval):
       resampled = df.resample(interval).agg({
           'open': 'first',
           'high': 'max',
           'low': 'min',
           'close': 'last',
           'volume': 'sum'
       })
       return resampled.dropna()  # Remove any incomplete periods

   print("Loading and preprocessing data...")
   df = load_and_clean(csv_path)

   # Resample full dataset
   df_resampled = resample_ohlc(df, interval)

   # Get end date
   end_date = df_resampled.index.max()

   # Define periods and create slices
   periods = {
       'last_6m': (end_date - pd.DateOffset(months=6), end_date),
       'last_to_last_6m': (end_date - pd.DateOffset(months=12),
                          end_date - pd.DateOffset(months=6)),
       'last_1y': (end_date - pd.DateOffset(years=1), end_date),
       'last_to_last_1y': (end_date - pd.DateOffset(years=2),
                          end_date - pd.DateOffset(years=1)),
       'last_2y': (end_date - pd.DateOffset(years=2), end_date),
       'last_to_last_2y': (end_date - pd.DateOffset(years=4),
                          end_date - pd.DateOffset(years=2))
   }

   datasets = {}
   print("\nCreating and validating period datasets...")

   for name, (start, end) in periods.items():
       # Slice data
       period_data = df_resampled[start:end].copy()

       # Perform quality checks
       is_clean = quality_checks(period_data, name)

       # Store data and metadata
       datasets[name] = {
           'data': period_data,
           'metadata': {
               'start_date': period_data.index.min(),
               'end_date': period_data.index.max(),
               'records': len(period_data),
               'trading_days': len(set(period_data.index.date)),
               'quality_passed': is_clean
           }
       }

   # Print summary
   print("\nPeriod Summaries:")
   print("="*70)
   for name, dataset in datasets.items():
       meta = dataset['metadata']
       print(f"\n{name}:")
       print(f"Date Range: {meta['start_date']} to {meta['end_date']}")
       print(f"Records: {meta['records']}, Trading Days: {meta['trading_days']}")
       print(f"Quality Check: {'✓' if meta['quality_passed'] else '✗'}")

   return datasets

# Usage
REPO_PATH = "/content/drive/MyDrive/Colab Notebooks/plusEV-"
csv_path = os.path.join(REPO_PATH, "nifty50_1min_2015_to_2024.csv")
datasets = resample_and_get_periods(csv_path)

# # Access data for a period
# df0 = datasets['last_6m']['data']
# # last_6m_metadata = datasets['last_6m']['metadata']
# df0 = datasets['last_to_last_6m']['data']
# df0 = datasets['last_1y']['data']
# df0 = datasets['last_to_last_1y']['data']
# df0 = datasets['last_2y']['data']
# df0 = datasets['last_to_last_2y']['data']
# df0["Return"] = df0["close"].pct_change()
# df0 = df0["Return"].dropna()
# df2["Return"] = df2["close"].pct_change()
# df2 = df2["Return"].dropna()
# df3["Return"] = df3["close"].pct_change()
# df3 = df3["Return"].dropna()
# df4["Return"] = df4["close"].pct_change()
# df4 = df4["Return"].dropna()
# df5["Return"] = df5["close"].pct_change()
# df5 = df5["Return"].dropna()
# df6["Return"] = df6["close"].pct_change()
# df6 = df6["Return"].dropna()

# print(df1)

df_0 = pd.read_csv(csv_path)
df_0['date'] = pd.to_datetime(df_0['date'])
df_0 = df_0.set_index('date').sort_index()
df_0 = df_0.resample('5T').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum'
}).fillna(method='ffill')
df_0['Return'] = df_0['close'].pct_change()
df0 = df_0["Return"].dropna()
# midpoint = len(df0) // 2
# df0 = df0[midpoint:]
# gc.collect()

Loading and preprocessing data...


  resampled = df.resample(interval).agg({



Creating and validating period datasets...

Quality issues in last_6m:
- Found 124 time gaps > 6 minutes

Quality issues in last_to_last_6m:
- Found 126 time gaps > 6 minutes

Quality issues in last_1y:
- Found 250 time gaps > 6 minutes

Quality issues in last_to_last_1y:
- Found 247 time gaps > 6 minutes

Quality issues in last_2y:
- Found 497 time gaps > 6 minutes

Quality issues in last_to_last_2y:
- Found 496 time gaps > 6 minutes

Period Summaries:

last_6m:
Date Range: 2024-02-28 15:25:00+05:30 to 2024-08-28 15:25:00+05:30
Records: 9043, Trading Days: 123
Quality Check: ✗

last_to_last_6m:
Date Range: 2023-08-28 15:25:00+05:30 to 2024-02-28 15:25:00+05:30
Records: 9388, Trading Days: 127
Quality Check: ✗

last_1y:
Date Range: 2023-08-28 15:25:00+05:30 to 2024-08-28 15:25:00+05:30
Records: 18430, Trading Days: 249
Quality Check: ✗

last_to_last_1y:
Date Range: 2022-08-29 09:15:00+05:30 to 2023-08-28 15:25:00+05:30
Records: 18537, Trading Days: 248
Quality Check: ✗

last_2y:
Date 

  df_0 = df_0.resample('5T').agg({
  df_0 = df_0.resample('5T').agg({


In [5]:
# Add this at the start of your notebook
def setup_wandb():
    try:
        import wandb
        # Check if already logged in
        if wandb.api.api_key is None:
            # Your API key
            WANDB_API_KEY = "641b305133f7d8345e710ecf6c9d83fea7e225f1"
            os.environ["WANDB_API_KEY"] = WANDB_API_KEY

        print("WandB setup complete!")
        return True
    except Exception as e:
        print(f"Error setting up WandB: {str(e)}")
        return False

# Use it in your notebook
setup_wandb()

WandB setup complete!


True

In [6]:
df0

Unnamed: 0_level_0,Return
date,Unnamed: 1_level_1
2015-01-09 09:20:00+05:30,-0.000024
2015-01-09 09:25:00+05:30,-0.000825
2015-01-09 09:30:00+05:30,-0.000681
2015-01-09 09:35:00+05:30,-0.000609
2015-01-09 09:40:00+05:30,0.000254
...,...
2024-08-28 15:05:00+05:30,0.000102
2024-08-28 15:10:00+05:30,-0.000369
2024-08-28 15:15:00+05:30,-0.000471
2024-08-28 15:20:00+05:30,0.000152


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acf, pacf
import wandb
from typing import Dict, List, Tuple
from datetime import datetime

class TimeSeriesAnalysis:
    ACF_KEY: str = 'acf_values'
    PACF_KEY: str = 'pacf_values'
    ACF_LAGS_KEY: str = 'acf_lags'
    PACF_LAGS_KEY: str = 'pacf_lags'

    def __init__(self, returns_series, max_lags=200):
        self.returns = returns_series
        self.max_lags = max_lags

        wandb.init(
            project="Time Series Analysis of Nifty50 5 min ohlc",
            name=f"ACF_PACF_last_half_data_{datetime.now().strftime('%Y%m%d_%H%M')}",
            group="ACF_PACF_analysis",
            config={
                "data_points": len(returns_series),
                "analysis_type": "ACF_PACF",
                "data_range": f"{returns_series.index[0]} to {returns_series.index[-1]}"
            },
            tags=["ACF", "PACF", "ARIMA_parameter_calcualtion"]
        )

    def compute_acf_pacf(self) -> Dict[str, np.ndarray]:
        acf_values = acf(self.returns, nlags=self.max_lags, fft=True)
        pacf_values = pacf(self.returns, nlags=self.max_lags, method='yw')

        # Create a DataFrame for all computed values
        values_df = pd.DataFrame({
            'Lag': range(len(acf_values)),
            'ACF': acf_values,
            'PACF': pacf_values
        })

        # Log as table instead of individual values
        wandb.log({
            'ACF_PACF_Values': wandb.Table(dataframe=values_df.round(4))
        })

        return {
            self.ACF_KEY: acf_values,
            self.PACF_KEY: pacf_values
        }

    def plot_acf_pacf(self, acf_values: np.ndarray, pacf_values: np.ndarray, max_display_lags: int = 50) -> None:
        try:
            plt.figure(figsize=(15, 6))
            confidence_interval = 1.96 / np.sqrt(len(self.returns))

            # ACF Plot
            plt.subplot(1, 2, 1)
            plt.stem(range(min(len(acf_values), max_display_lags)),
                    acf_values[:max_display_lags])
            plt.axhline(y=confidence_interval, color='r', linestyle='--', alpha=0.5)
            plt.axhline(y=-confidence_interval, color='r', linestyle='--', alpha=0.5)
            plt.fill_between(range(max_display_lags),
                            confidence_interval,
                            -confidence_interval,
                            color='gray',
                            alpha=0.2)
            plt.title('Autocorrelation Function (ACF)')
            plt.xlabel('Lag')
            plt.ylabel('ACF')
            plt.grid(True, alpha=0.3)

            # PACF Plot
            plt.subplot(1, 2, 2)
            plt.stem(range(min(len(pacf_values), max_display_lags)),
                    pacf_values[:max_display_lags])
            plt.axhline(y=confidence_interval, color='r', linestyle='--', alpha=0.5)
            plt.axhline(y=-confidence_interval, color='r', linestyle='--', alpha=0.5)
            plt.fill_between(range(max_display_lags),
                            confidence_interval,
                            -confidence_interval,
                            color='gray',
                            alpha=0.2)
            plt.title('Partial Autocorrelation Function (PACF)')
            plt.xlabel('Lag')
            plt.ylabel('PACF')
            plt.grid(True, alpha=0.3)

            plt.tight_layout()
            wandb.log({'ACF_PACF_Plots': wandb.Image(plt)})
            plt.close()
        finally:
            plt.close('all')


    def find_significant_lags(self, acf_values, pacf_values, significance_level=0.05):
      confidence_interval = 1.96 / np.sqrt(len(self.returns))
      significant_acf_lags = [i for i, val in enumerate(acf_values) if abs(val) > confidence_interval]
      significant_pacf_lags = [i for i, val in enumerate(pacf_values) if abs(val) > confidence_interval]

      # Create detailed DataFrame for ACF significant lags
      acf_sig_df = pd.DataFrame({
          'Lag': significant_acf_lags,
          'ACF_Value': [acf_values[i] for i in significant_acf_lags],
          'Normalized_ACF': [abs(acf_values[i]) / confidence_interval for i in significant_acf_lags],
          'Is_Highly_Significant': [abs(acf_values[i]) / confidence_interval > 2 for i in significant_acf_lags]
      }).sort_values('Normalized_ACF', ascending=False)

      # Create detailed DataFrame for PACF significant lags
      pacf_sig_df = pd.DataFrame({
          'Lag': significant_pacf_lags,
          'PACF_Value': [pacf_values[i] for i in significant_pacf_lags],
          'Normalized_PACF': [abs(pacf_values[i]) / confidence_interval for i in significant_pacf_lags],
          'Is_Highly_Significant': [abs(pacf_values[i]) / confidence_interval > 2 for i in significant_pacf_lags]
      }).sort_values('Normalized_PACF', ascending=False)

      # Log detailed tables to wandb
      wandb.log({
          'ACF_Significant_Lags_Detail': wandb.Table(dataframe=acf_sig_df.round(4)),
          'PACF_Significant_Lags_Detail': wandb.Table(dataframe=pacf_sig_df.round(4))
      })

      # Create and log summary table
      summary_df = pd.DataFrame({
          'Type': ['ACF', 'PACF'],
          'Count': [len(significant_acf_lags), len(significant_pacf_lags)],
          'Highly_Significant_Count': [sum(acf_sig_df['Is_Highly_Significant']),
                                    sum(pacf_sig_df['Is_Highly_Significant'])],
          'Max_Normalized_Value': [acf_sig_df['Normalized_ACF'].max() if not acf_sig_df.empty else 0,
                                pacf_sig_df['Normalized_PACF'].max() if not pacf_sig_df.empty else 0],
          'Significant_Lags_List': [str(significant_acf_lags), str(significant_pacf_lags)]
      })

      wandb.log({
          'Significant_Lags_Summary': wandb.Table(dataframe=summary_df)
      })

      return {
          self.ACF_LAGS_KEY: significant_acf_lags,
          self.PACF_LAGS_KEY: significant_pacf_lags
      }



    def suggest_arima_orders(self, acf_values: np.ndarray, pacf_values: np.ndarray,
                        significant_lags: Dict[str, List[int]], d: int = 0) -> Dict[str, Tuple[int, int, int]]:
      try:
          suggestions = {}

          # [Previous validation code remains the same]
          if not isinstance(significant_lags, dict) or self.ACF_LAGS_KEY not in significant_lags or self.PACF_LAGS_KEY not in significant_lags:
              raise ValueError(f"significant_lags must be a dictionary with '{self.ACF_LAGS_KEY}' and '{self.PACF_LAGS_KEY}' keys")

          acf_sig_lags = sorted(significant_lags.get(self.ACF_LAGS_KEY, []))
          pacf_sig_lags = sorted(significant_lags.get(self.PACF_LAGS_KEY, []))

          # Calculate all suggestions as before
          p_conservative = next((lag for lag in pacf_sig_lags if lag > 0), 0)
          q_conservative = next((lag for lag in acf_sig_lags if lag > 0), 0)
          suggestions['conservative'] = (p_conservative, d, q_conservative)

          p_moderate = min(len([lag for lag in pacf_sig_lags if 0 < lag <= 10]), 3)
          q_moderate = min(len([lag for lag in acf_sig_lags if 0 < lag <= 10]), 3)
          suggestions['moderate'] = (p_moderate, d, q_moderate)

          seasonal_p = self._find_seasonal_pattern(pacf_values)
          seasonal_q = self._find_seasonal_pattern(acf_values)

          p_aggressive = min(max(p_moderate + 1, seasonal_p), 5)
          q_aggressive = min(max(q_moderate + 1, seasonal_q), 5)
          suggestions['aggressive'] = (p_aggressive, d, q_aggressive)

          if seasonal_p > 0 or seasonal_q > 0:
              suggestions['seasonal'] = (
                  seasonal_p if seasonal_p > 0 else 1,
                  d,
                  seasonal_q if seasonal_q > 0 else 1
              )

          # Create a single DataFrame for all ARIMA suggestions
          suggestions_df = pd.DataFrame([
            {
                'Approach': k.capitalize(),
                'p': v[0],
                'd': v[1],
                'q': v[2],
                'ARIMA_Order': f'ARIMA({v[0]},{v[1]},{v[2]})',
                'Description': self._get_approach_description(k, v[0], v[1], v[2]),
                'Complexity': self._get_complexity_score(v[0], v[1], v[2])
            }
            for k, v in suggestions.items()
          ])

          # Sort by complexity
          suggestions_df = suggestions_df.sort_values('Complexity')

          # Log enhanced table to wandb
          wandb.log({
              'ARIMA_Suggestions_Detail': wandb.Table(dataframe=suggestions_df)
          })

          return suggestions

      except Exception as e:
          print(f"Error in suggest_arima_orders: {e}")
          return {'conservative': (1, d, 1)}

    def _get_approach_description(self, approach, p, d, q):
      descriptions = {
          'conservative': 'Minimal model using first significant lags',
          'moderate': 'Balanced model using first 10 lags',
          'aggressive': 'Complex model considering more lags',
          'seasonal': 'Model accounting for seasonal patterns'
      }
      return descriptions.get(approach.lower(), 'Custom approach')

    def _get_complexity_score(self, p, d, q):
      return p + d + q  # Simple complexity score


    def _find_pattern_cutoff(self, values: np.ndarray, threshold: float = 0.1) -> int:
        """Find where the ACF/PACF pattern cuts off"""
        for i in range(1, len(values)-1):
            if abs(values[i]) < threshold and abs(values[i+1]) < threshold:
                return i
        return min(5, len(values)-1)  # Default to 5 if no clear cutoff



    def _find_seasonal_pattern(self, values: np.ndarray) -> int:
        """Identify potential seasonal patterns in ACF/PACF"""
        peaks = []
        for i in range(2, len(values)-1):
            if (values[i] > values[i-1] and values[i] > values[i+1] and
                abs(values[i]) > 0.1):
                peaks.append(i)

        if len(peaks) >= 2:
            # Look for regular spacing between peaks
            differences = np.diff(peaks)
            if len(set(differences)) == 1:
                return differences[0]
        return 0



    def run_analysis(self, d=0):
      """
      Run the complete analysis workflow.
      """
      try:
          # Start timing
          start_time = datetime.now()

          # Get system info
          import psutil
          import platform
          import sys

          # Compute ACF and PACF
          acf_pacf_values = self.compute_acf_pacf()

          # Plot ACF and PACF
          self.plot_acf_pacf(acf_pacf_values[self.ACF_KEY], acf_pacf_values[self.PACF_KEY])

          # Find significant lags
          significant_lags = self.find_significant_lags(
              acf_pacf_values[self.ACF_KEY],
              acf_pacf_values[self.PACF_KEY]
          )

          # Get ARIMA suggestions
          arima_suggestions = self.suggest_arima_orders(
              acf_pacf_values[self.ACF_KEY],
              acf_pacf_values[self.PACF_KEY],
              significant_lags,
              d=d
          )

          # Calculate execution time
          execution_time = (datetime.now() - start_time).total_seconds()

          # Get system metrics
          system_metrics = pd.DataFrame([{
              'CPU_Usage_Percent': psutil.cpu_percent(),
              'Memory_Usage_Percent': psutil.virtual_memory().percent,
              'Available_Memory_GB': round(psutil.virtual_memory().available / (1024**3), 2),
              'Total_Memory_GB': round(psutil.virtual_memory().total / (1024**3), 2),
              'Python_Version': platform.python_version(),
              'OS_Platform': platform.platform(),
              'CPU_Cores': psutil.cpu_count(),
              'Execution_Time_Seconds': execution_time
          }])

          # Create comprehensive final summary
          all_stats_summary = pd.DataFrame([{
              'Total_ACF_Lags': len(significant_lags[self.ACF_LAGS_KEY]),
              'Total_PACF_Lags': len(significant_lags[self.PACF_LAGS_KEY]),
              'Number_of_ARIMA_Models': len(arima_suggestions),
              'Input_Series_Length': len(self.returns),
              'Differencing_Order': d,
              'Max_Lag_Analyzed': self.max_lags,
              'Data_Range': f"{self.returns.index[0]} to {self.returns.index[-1]}",
              'Analysis_Timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
              'Execution_Time_Seconds': execution_time
          }])

          # Log summaries to wandb
          wandb.log({
              'Analysis_Complete_Summary': wandb.Table(dataframe=all_stats_summary),
              'System_Performance_Metrics': wandb.Table(dataframe=system_metrics)
          })

          # Log system metrics directly for tracking
          wandb.log({
              'CPU_Usage': psutil.cpu_percent(),
              'Memory_Usage': psutil.virtual_memory().percent,
              'Execution_Time': execution_time
          })

          # Finish wandb run
          wandb.finish()

          return {
              'significant_lags': significant_lags,
              'acf_pacf_values': acf_pacf_values,
              'arima_suggestions': arima_suggestions
          }

      except Exception as e:
          print(f"Error in run_analysis: {e}")
          wandb.finish()
          raise e

In [None]:
analysis = TimeSeriesAnalysis(
            returns_series=df0,
            max_lags=200
        )

        # Run the complete analysis
results = analysis.run_analysis()

0,1
CPU_Usage,▁
Execution_Time,▁
Memory_Usage,▁

0,1
CPU_Usage,25.7
Execution_Time,19.86537
Memory_Usage,15.2
