# 📊 Analysis: Column Mapping Justification

This notebook explores the statistical and structural relationships between the columns to justify the inferred mapping.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the sample dataset (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="sample_data_with_fabricated_columns.csv")
# Bare final expression so the notebook renders the summary-statistics table as the cell output.
df.describe()

## A Self-Defined Custom Sliding Window DP Algorithm

### How the DP Algorithm Works

1. **Find candidates**
   - Determine which columns could be Open or Close:
     `oc_candidates = all columns MINUS the known High, Low, Price and Volume columns`

2. **Define all possible (Open, Close) pairs**

3. **Run a dynamic programming loop**

   - For each time step (row in the data):
       - For each possible (Open, Close) pair:
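           - Compute the cost of transitioning into this pair from every possible previous pair.
           - Cost = accumulated cost of the previous pair + |today’s Open − yesterday’s Close| + an extra penalty if the mapping switches.
   - Store the minimum cost for each pair, together with the previous pair that achieved it (the back-pointer).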

4. **Backtrack**
   - After the forward pass, backtrace to get the optimal sequence of states (mappings).

5. **Find the most frequent mapping**
   - The most common (Open, Close) pair along the optimal path is assumed to be the real mapping.

In [None]:
import os
from itertools import permutations

import numpy as np
import pandas as pd

# Location of the CSV analysed by the DP solver.
# Set the DATA_FILE_PATH environment variable to point at a copy of the data on
# another machine; when it is unset, the original hard-coded location is used.
DATA_FILE_PATH = os.environ.get(
    "DATA_FILE_PATH",
    "/home/anshium/internship/apt-assignment/assessment_01/02_sample_data_with_fabricated_columns.csv",
)

# Columns whose roles are treated as already known; every other column is
# considered a candidate for Open/Close by the solver.
KNOWN_HIGH = 'gamma'
KNOWN_LOW = 'omega'
KNOWN_VOLUME = 'neutronCount'
KNOWN_PRICE = 'pulse'

def _viterbi_state_path(values, open_idx, close_idx, switching_penalty):
    """Return the minimum-cost state index for every row of ``values``.

    Args:
        values: 2-D float array, one row per timestep and one column per
            candidate column.
        open_idx: for each state, the column of ``values`` used as Open.
        close_idx: for each state, the column of ``values`` used as Close.
        switching_penalty: extra cost added whenever the state changes
            between consecutive rows.

    Returns:
        1-D integer array of length ``values.shape[0]`` holding the chosen
        state index per row.
    """
    num_timesteps = values.shape[0]
    num_states = len(open_idx)

    # backpointer_matrix[t, k]: which state at t-1 led to the min cost for state k at t
    backpointer_matrix = np.zeros((num_timesteps, num_states), dtype=int)

    # At t=0, the cost to be in any state is 0 (no previous day to compare to).
    # Only the previous row of costs is ever needed, so keep a running vector.
    prev_cost = np.zeros(num_states)

    # penalty_matrix[i, j]: 0 when staying in the same state, else the penalty.
    penalty_matrix = np.where(np.eye(num_states, dtype=bool), 0.0, switching_penalty)
    state_range = np.arange(num_states)

    for t in range(1, num_timesteps):
        if t % 10000 == 0:
            print(f"Processing timestep {t}/{num_timesteps}")

        open_today = values[t, open_idx]
        close_yesterday = values[t - 1, close_idx]

        # total_cost[i, j] = Previous Path Cost(i) + |Open_j(t) - Close_i(t-1)| + Switching Penalty(i, j)
        transition_cost = np.abs(open_today[np.newaxis, :] - close_yesterday[:, np.newaxis])
        total_cost = prev_cost[:, np.newaxis] + transition_cost + penalty_matrix

        # A NaN cost must never win the minimum (argmin would otherwise pick it),
        # so treat it as an impossible transition.
        total_cost = np.where(np.isnan(total_cost), np.inf, total_cost)

        # argmin returns the first minimum, i.e. the lowest previous-state index on ties.
        best_prev = np.argmin(total_cost, axis=0)
        backpointer_matrix[t] = best_prev
        prev_cost = total_cost[best_prev, state_range]

    # Backtrace from the cheapest final state.
    optimal_path = np.zeros(num_timesteps, dtype=int)
    optimal_path[-1] = np.argmin(prev_cost)
    for t in range(num_timesteps - 2, -1, -1):
        optimal_path[t] = backpointer_matrix[t + 1, optimal_path[t + 1]]

    return optimal_path


def solve_open_close_with_dp(file_path, high_col, low_col, volume_col, price_column, switching_penalty=0.1):
    """Infer which unlabeled columns are Open and Close using dynamic programming.

    Every ordered pair of columns that is not one of the known columns is a
    state ``(Open, Close)``. The cost of moving from one state to the next row's
    state is ``|Open(t) - Close(t-1)|`` plus ``switching_penalty`` when the state
    changes. The minimum-cost state sequence is found with a forward pass and a
    backtrace, and the states along that sequence are counted.

    Args:
        file_path: path of the CSV file to read. An ``'Unnamed: 0'`` column,
            if present, is dropped.
        high_col: name of the known High column (excluded from candidates).
        low_col: name of the known Low column (excluded from candidates).
        volume_col: name of the known Volume column (excluded from candidates).
        price_column: name of the known Price column (excluded from candidates).
        switching_penalty: extra cost charged when the mapping changes between
            consecutive rows.

    Returns:
        pandas Series (from ``value_counts``) indexed by ``"O=<col>, C=<col>"``
        labels, giving how many rows of the optimal path used each mapping,
        most frequent first.

    Raises:
        ValueError: if the file has no data rows, or fewer than two candidate
            columns remain after removing the known ones.
    """
    print("Dynamic Programming Solver for Open/Close")

    df = pd.read_csv(file_path)
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])

    known_cols = {high_col, low_col, volume_col, price_column}
    oc_candidates = [c for c in df.columns if c not in known_cols]

    if len(df) == 0:
        raise ValueError(f"No data rows found in {file_path!r}; cannot infer Open/Close.")
    if len(oc_candidates) < 2:
        raise ValueError(
            "Need at least two candidate columns for Open/Close, "
            f"found {len(oc_candidates)}: {oc_candidates}"
        )

    states = list(permutations(oc_candidates, 2))
    num_states = len(states)

    print(f"Candidates for Open/Close: {oc_candidates}")
    print(f"Number of possible states (mappings): {num_states}")
    for i, state in enumerate(states):
        print(f"  State {i}: Open={state[0]}, Close={state[1]}")

    # Pull the candidate columns into one NumPy array up front so the forward
    # pass does array arithmetic instead of per-cell pandas lookups.
    column_position = {name: k for k, name in enumerate(oc_candidates)}
    open_idx = np.array([column_position[open_name] for open_name, _ in states])
    close_idx = np.array([column_position[close_name] for _, close_name in states])
    values = df[oc_candidates].to_numpy(dtype=float)

    print("\nRunning forward pass of the Viterbi algorithm...")
    optimal_path = _viterbi_state_path(values, open_idx, close_idx, switching_penalty)

    print("\n--- Results ---")
    state_labels = [f"O={open_name}, C={close_name}" for open_name, close_name in states]
    path_in_words = [state_labels[i] for i in optimal_path]
    results_df = pd.DataFrame({'Optimal Mapping': path_in_words})

    most_likely_mapping_counts = results_df['Optimal Mapping'].value_counts()

    print("The most frequent mapping found across the entire dataset was:")
    print(most_likely_mapping_counts.head(1))

    return most_likely_mapping_counts

# Run the solver with the known column roles. The price keyword must be
# `price_column` (the function's parameter name); `price_col` raises TypeError.
if __name__ == "__main__":
    most_likely_mapping = solve_open_close_with_dp(
        DATA_FILE_PATH,
        high_col=KNOWN_HIGH,
        low_col=KNOWN_LOW,
        volume_col=KNOWN_VOLUME,
        price_column=KNOWN_PRICE,
        switching_penalty=0.1,
    )