In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis


In [4]:
class Explanatory_Analysis:
    def __init__(self, base_folder_path):
        self.base_folder_path = base_folder_path
        
    def load_data(self, file_path):
        data = pd.read_csv(file_path)
        data['Day'] = pd.to_datetime(data['Day'])
        data['ret'] = data['close'].pct_change()
        if 'Date' in data.columns:
            data['Date'] = pd.to_datetime(data['Date'])
            data = data[data['Date'].dt.time !=pd.to_datetime('09:30:00').time()]
        return data
    
    def check_interval(self, min_data, threshold = 66):
        daily_counts = min_data.groupby(min_data['Day']).size()
        valid_days = daily_counts[daily_counts > threshold].index
        
        return valid_days
    
    def filter_daily_data(self, daily_data, valid_days):
        daily_data = daily_data.iloc[1:]
        return daily_data[daily_data['Day'].isin(valid_days)]
        
        
    def compute_statistics(self, data):
        return {
            'Obs' : len(data),
            'Mean' : data['ret'].mean(),
            'Std' : data['ret'].std(),
            'Median' : data['ret'].median(),
            'Skewness' : skew(data['ret']),
            'Kurtosis' : kurtosis(data['ret'], fisher=False)
        }
        
    # def plot_data(self,data,ticker):
    #     plt.figure(figsize=(10,6))
    #     plt.plot(data['log_ret'])
    #     plt.title(f'{ticker} Log Returns')
    #     plt.xlabel('Time')
    #     plt.ylabel('Log Returns')
    #     plt.show()
        
        
    def analyse(self):
        results = []
        for folder in os.listdir(self.base_folder_path):
            folder_path = os.path.join(self.base_folder_path, folder)
            if os.path.isdir(folder_path):
                minute_files = [f for f in os.listdir(folder_path) if '_filtered_data' in f]
                daily_files = [f for f in os.listdir(folder_path) if 'daily' in f]
                for min_file, daily_file in zip(minute_files, daily_files):
                    min_data_path = os.path.join(folder_path, min_file)
                    daily_data_path = os.path.join(folder_path, daily_file)
                    
                    min_data = self.load_data(min_data_path)
                    daily_data = self.load_data(daily_data_path)
                    daily_data = daily_data.iloc[1:] 
                    
                    valid_days = self.check_interval(min_data)
                    
                    filtered_daily_data = self.filter_daily_data(daily_data, valid_days)
                    stats = self.compute_statistics(filtered_daily_data)
                    stats['Ticker'] = daily_file.split('_')[0]
                    results.append(stats)
        return pd.DataFrame(results)
            
    def dataframe_to_latex(self, results_df):
        latex_table = results_df.to_latex(index=False, longtable=True,
                                      caption="Summary statistics for return and log(RK) across different stocks.",
                                      label="tab:stats", 
                                      column_format='lcccccc',
                                      float_format="{:0.2f}".format)
        latex_table = "\\begin{table}[htbp]\n\\centering\n" + \
                    latex_table + \
                    "\\end{table}"
    
        return latex_table


    
        

In [5]:
analyser = Explanatory_Analysis('/Users/raphaelravinet/Code/BSE/Thesis/data')
analysis_results = analyser.analyse()
latex_table = analyser.dataframe_to_latex(analysis_results)

In [13]:
test = analysis_results[analysis_results['Obs'] > 5000]

In [14]:
print(test['Obs'].min())
print(test['Obs'].max())

5126
5159


In [8]:
analysis_results[analysis_results['Obs'] > 5000]

Unnamed: 0,Obs,Mean,Std,Median,Skewness,Kurtosis,Ticker
0,5129,0.001154,0.024312,0.000647,0.779542,16.932021,AMZN
1,5127,0.000679,0.019741,0.000705,-0.052879,8.314125,CAT
2,5159,0.001374,0.020808,0.000998,0.097474,8.260797,AAPL
3,5127,0.000181,0.014612,0.000447,-0.327849,11.041736,MMM
4,5127,0.00029,0.012612,0.00038,0.180266,15.569177,WMT
5,5128,0.00047,0.017771,0.000777,0.118549,24.013455,CVX
6,5130,0.000283,0.01992,0.00044,-0.152252,11.219289,INTC
7,5127,0.000521,0.021716,0.00038,0.793792,21.588702,GS
8,5127,0.000615,0.016337,0.000548,-0.000399,13.522239,HD
9,5127,0.000298,0.015833,0.000321,-0.988397,28.137418,MRK


In [22]:
print(latex_table)

\begin{table}[htbp]
\centering
\begin{longtable}{lcccccc}
\caption{Summary statistics for return and log(RK) across different stocks.} \label{tab:stats} \\
\toprule
Obs & Mean & Std & Median & Skewness & Kurtosis & Ticker \\
\midrule
\endfirsthead
\caption[]{Summary statistics for return and log(RK) across different stocks.} \\
\toprule
Obs & Mean & Std & Median & Skewness & Kurtosis & Ticker \\
\midrule
\endhead
\midrule
\multicolumn{7}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
5129 & 0.00 & 0.02 & 0.00 & 0.78 & 16.93 & AMZN \\
5127 & 0.00 & 0.02 & 0.00 & -0.05 & 8.31 & CAT \\
5159 & 0.00 & 0.02 & 0.00 & 0.10 & 8.26 & AAPL \\
5127 & 0.00 & 0.01 & 0.00 & -0.33 & 11.04 & MMM \\
5127 & 0.00 & 0.01 & 0.00 & 0.18 & 15.57 & WMT \\
5128 & 0.00 & 0.02 & 0.00 & 0.12 & 24.01 & CVX \\
5130 & 0.00 & 0.02 & 0.00 & -0.15 & 11.22 & INTC \\
5127 & 0.00 & 0.02 & 0.00 & 0.79 & 21.59 & GS \\
5127 & 0.00 & 0.02 & 0.00 & -0.00 & 13.52 & HD \\
5127 & 0.00 & 0.02 & 0.00 & -0.9