# In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Analysis settings: INCREMENT is the spacing between successive S&P 500
# return thresholds; MIN_DATA_POINTS is the smallest subset size for which a
# correlation is still recorded.
INCREMENT = 0.01
MIN_DATA_POINTS = 4

# Load the workbook with its first column as the index, then coerce every
# column to numeric (cells that cannot be parsed become NaN).
file_path = '~/diversification-project/project_data.xlsx'
original_df = pd.read_excel(file_path, index_col=0).apply(
	pd.to_numeric, errors='coerce'
)

# Row-over-row percentage changes; every row containing a NaN is discarded.
returns_df = original_df.pct_change().dropna()

# Partition the returns at the COVID cut-off. The cut-off row itself belongs
# to the post-COVID sample. (Assumes the index holds dates -- TODO confirm.)
covid_start_date = '2020-03-06'
is_pre_covid = returns_df.index < covid_start_date
is_post_covid = returns_df.index >= covid_start_date
pre_covid_df = returns_df[is_pre_covid]
post_covid_df = returns_df[is_post_covid]

# Samples to analyse, keyed by the label used for them in later output.
datasets = {}
datasets['Pre-COVID'] = pre_covid_df
datasets['Post-COVID'] = post_covid_df
datasets['Whole Period'] = returns_df

# Filled in later: period label -> {asset -> results DataFrame}.
all_results_dict = {}

# The most extreme S&P 500 returns over the whole sample bound the grids.
sp500_returns = returns_df['S&P 500']
min_val, max_val = sp500_returns.min(), sp500_returns.max()

# Threshold grids that start at 0 and step away from it by INCREMENT;
# np.arange treats the extreme return as an exclusive stop value.
positive_thresholds = list(np.arange(0, max_val, INCREMENT))
negative_thresholds = list(np.arange(0, min_val, -INCREMENT))

# Conditional (tail) correlations of every asset with the S&P 500.
#
# For each period, the rows are filtered cumulatively: all rows whose S&P 500
# return is strictly below a negative threshold, then all rows whose return is
# strictly above a positive threshold. Within each such subset the asset's
# correlation with the S&P 500 is recorded against the threshold.

# Absolute tolerance within which |correlation| counts as a perfect +/-1.
# An exact ``!= 1`` test misses values such as 0.9999999999999998.
UNIT_CORR_TOLERANCE = 1e-9


def _tail_subsets(data):
	"""Return the ``(threshold, subset)`` pairs analysed for one period.

	Negative thresholds come first, then the positive ones (the first entry
	of ``positive_thresholds`` is skipped), so the scan order is stable.
	Subsets with fewer than ``MIN_DATA_POINTS`` rows are left out. The row
	filter depends only on the S&P 500 column, so it is computed once here
	instead of once per asset.
	"""
	sp500 = data['S&P 500']
	masks = [(threshold, sp500 < threshold) for threshold in negative_thresholds]
	masks.extend((threshold, sp500 > threshold) for threshold in positive_thresholds[1:])
	return [
		(threshold, data[mask])
		for threshold, mask in masks
		if mask.sum() >= MIN_DATA_POINTS
	]


def _is_informative(corr):
	"""Return True for a finite correlation that is not numerically +/-1.

	NaN (e.g. one of the series is constant within the subset) and perfect
	correlations are both degenerate and are excluded from the results.
	"""
	if not np.isfinite(corr):
		return False
	return not np.isclose(abs(corr), 1.0, rtol=0.0, atol=UNIT_CORR_TOLERANCE)


for period, data in datasets.items():
	results_dict = {}
	subsets = _tail_subsets(data)

	for asset in data.columns:
		if asset == 'S&P 500':  # Skip S&P 500 since it's the index
			continue

		# Note: the 'Mid-point' column holds the threshold that defined the subset.
		mid_points = []
		correlations = []

		for threshold, subset in subsets:
			corr = subset[asset].corr(subset['S&P 500'])
			if _is_informative(corr):
				mid_points.append(threshold)
				correlations.append(corr)

		# One results table per asset, in threshold scan order.
		results_dict[asset] = pd.DataFrame({
			'Mid-point': mid_points,
			'Correlation with S&P 500': correlations
		})

	all_results_dict[period] = results_dict

# Plotting section: one figure per asset (the S&P 500 itself is skipped),
# with one line per period, saved as "<alphanumeric asset name>_corr.png"
# relative to the current working directory.
PERIOD_COLORS = [('Pre-COVID', 'green'), ('Post-COVID', 'red'), ('Whole Period', 'grey')]
PLOT_COLUMNS = ('Mid-point', 'Correlation with S&P 500')

for asset_name in original_df.columns:
	if asset_name == 'S&P 500':
		continue

	# Explicit figure/axes objects keep each plot independent of pyplot's
	# "current figure" state.
	fig, ax = plt.subplots(figsize=(10, 6))

	try:
		for period, color in PERIOD_COLORS:
			test_asset = all_results_dict[period][asset_name]

			# Check the columns BEFORE sorting: sorting by 'Mid-point' would
			# already raise if the column were missing, which made the
			# original post-sort check ineffective.
			if not all(column in test_asset.columns for column in PLOT_COLUMNS):
				continue

			# Sort by 'Mid-point' so the line is drawn left to right.
			test_asset = test_asset.sort_values(by='Mid-point')
			ax.plot(test_asset['Mid-point'], test_asset['Correlation with S&P 500'], marker='o', linestyle='-', color=color, label=period)

		ax.set_title(f"Conditional Correlation vs Mid-point for {asset_name}")
		ax.set_xlabel("Mid-point")
		ax.set_ylabel("Conditional Correlation with S&P 500")
		ax.legend()
		ax.grid(True)

		# Keep only alphanumeric characters for the file name; str() lets
		# non-string column labels through as well.
		formatted_asset_name = ''.join(e for e in str(asset_name) if e.isalnum())
		filename = f"{formatted_asset_name}_corr.png"
		fig.savefig(filename, dpi=300, bbox_inches='tight')
	finally:
		# Always release the figure, even if plotting or saving failed.
		plt.close(fig)


# Summary table: one row per asset (the S&P 500 itself is skipped) holding the
# mean of its recorded conditional correlations for each period.
summary_periods = ('Pre-COVID', 'Post-COVID', 'Whole Period')
summary_data = []

for asset_name in original_df.columns:
	if asset_name != 'S&P 500':
		summary_row = {'asset': asset_name}
		summary_row.update(
			(label, all_results_dict[label][asset_name]['Correlation with S&P 500'].mean())
			for label in summary_periods
		)
		summary_data.append(summary_row)

summary_df = pd.DataFrame(summary_data)
