In [None]:
# Import required libraries
import pandas as pd
import os

In [None]:
# Parameters and Settings
# Column names used throughout the notebook
DATE_COL = 'Date'                # observation date column
ID_COL = 'PERMNO'                # per-stock identifier column
TARGET_COL = 'excess_return'     # column the descriptive statistics are computed on

# All file locations are resolved relative to the directory the notebook is run from
current_directory = os.getcwd()

# Input: cleaned and filtered data
clean_filtered_data_file = os.path.join(
    current_directory, 'Data', 'clean_filtered_data.csv'
)

# Output: descriptive statistics for excess return
overall_stats_path = os.path.join(
    current_directory, 'Results', 'overall_stats.csv'
)

# Estimation (in sample) window, both ends inclusive
in_sample_start_date = pd.Timestamp("2000-01-01")
in_sample_end_date = pd.Timestamp("2015-12-31")

# Out-of-sample window, both ends inclusive
out_sample_start_date = pd.Timestamp("2016-01-01")
out_sample_end_date = pd.Timestamp("2024-12-31")

### Step 1: Load Cleaned and Filtered Data

In [None]:
# Load the cleaned and filtered data file (output from the ETL step) into a single pandas DataFrame.
# The in-sample and out-of-sample periods are separated later by filtering on the date column.
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(clean_filtered_data_file, index_col=0)

# Ensure the date column is in datetime format so it can be compared against the period bounds
df[DATE_COL] = pd.to_datetime(df[DATE_COL])

# DataFrame.info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()

### Step 2: Compute Descriptive Statistics

In [None]:
# Collect the distinct stock identifiers (in order of first appearance) and report how many there are
stocks_permno = pd.unique(df[ID_COL]).tolist()
print(f"Number of unique stocks: {len(stocks_permno)}")

In [None]:
# Create function to compute descriptive statistics for daily excess returns

def compute_descriptive_statistics(df, start_date, end_date, date_col=DATE_COL, column_name=TARGET_COL, stock_identifier=ID_COL, stock_name="SecurityNm", ticker="Ticker"):
    """Compute per-stock descriptive statistics of one column over a date window.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns named by ``date_col``, ``column_name``,
        ``stock_identifier``, ``stock_name`` and ``ticker``.
    start_date, end_date
        Inclusive bounds compared against ``df[date_col]``.
    date_col : str
        Name of the date column used for filtering.
    column_name : str
        Name of the column the statistics are computed on.
    stock_identifier : str
        Name of the column rows are grouped by (one output row per distinct value).
    stock_name : str
        Name of the column holding the security name; only the text before the
        first ``;`` of the group's first row is kept.
    ticker : str
        Name of the column holding the ticker; the group's first value is kept.

    Returns
    -------
    pandas.DataFrame
        One row per stock with the identifier, ticker and name columns followed by
        ``mean``, ``median``, ``std``, ``min``, ``max``, ``skew`` and ``kurtosis``.
        If no rows fall inside the window the frame is empty but still carries
        these columns, so downstream column lookups keep working.

    Raises
    ------
    KeyError
        If any of the named columns is missing from ``df``.
    """
    # Fixed output layout; dict.fromkeys drops repeated labels while keeping order,
    # mirroring how the per-stock dict below collapses duplicate keys.
    stat_columns = ["mean", "median", "std", "min", "max", "skew", "kurtosis"]
    output_columns = list(dict.fromkeys([stock_identifier, ticker, stock_name, *stat_columns]))

    # Filter by period (both bounds inclusive)
    df_period = df[(df[date_col] >= start_date) & (df[date_col] <= end_date)]

    # Group by stock and compute stats:
    stats_list = []
    for permno, grp in df_period.groupby(stock_identifier):
        col = grp[column_name]

        # Keep only the part of the name before the first ';'. A missing name is
        # a float NaN (no .split), so non-string values are passed through as-is.
        raw_name = grp[stock_name].iloc[0]
        short_name = raw_name.split(";")[0] if isinstance(raw_name, str) else raw_name

        stats_list.append({
            stock_identifier: permno,
            ticker: grp[ticker].iloc[0],
            stock_name: short_name,
            "mean": col.mean(),
            "median": col.median(),
            "std": col.std(),
            "min": col.min(),
            "max": col.max(),
            "skew": col.skew(),
            "kurtosis": col.kurtosis()
        })

    # Passing the column list guarantees the schema even when stats_list is empty
    stats_df = pd.DataFrame(stats_list, columns=output_columns)

    return stats_df

In [None]:
# Calculate and display descriptive statistics for each stock during the in-sample period
in_sample_stats = compute_descriptive_statistics(df, start_date=in_sample_start_date, end_date=in_sample_end_date)
print("Descriptive Statistics for Estimation Period (In-Sample):")
in_sample_stats

In [None]:
# Identify stocks with outlier data (i.e., returns >= 100% or returns <= -100%)
# The thresholds 1 / -1 assume returns are stored as decimals (1.0 == 100%).
# The statistics columns are labelled with the strings "max" and "min"; indexing with
# the builtin functions max/min instead would raise a KeyError.
print("Stock with outlier data:")
print(in_sample_stats.loc[(in_sample_stats["max"] >= 1) | (in_sample_stats["min"] <= -1), ID_COL].to_list())

In [None]:
# Calculate and display descriptive statistics for each stock during the out-of-sample period
out_of_sample_stats = compute_descriptive_statistics(df, start_date=out_sample_start_date, end_date=out_sample_end_date)
print("Descriptive Statistics for Out of Sample Period:")
out_of_sample_stats

In [None]:
# Identify stocks with outlier data (i.e., returns >= 100% or returns <= -100%)
# The thresholds 1 / -1 assume returns are stored as decimals (1.0 == 100%).
# The statistics columns are labelled with the strings "max" and "min"; indexing with
# the builtin functions max/min instead would raise a KeyError.
print("Stock with outlier data:")
print(out_of_sample_stats.loc[(out_of_sample_stats["max"] >= 1) | (out_of_sample_stats["min"] <= -1), ID_COL].to_list())

In [None]:
# Calculate and display descriptive statistics for all stocks pooled together,
# side by side for the in-sample and out-of-sample periods

# Split the data into the two windows (both bounds inclusive)
df_in_sample = df.loc[df[DATE_COL].between(in_sample_start_date, in_sample_end_date)]
df_out_of_sample = df.loc[df[DATE_COL].between(out_sample_start_date, out_sample_end_date)]

# One row per statistic: the label shown in the table paired with the Series method that computes it
overall_stats = [
    {
        "": label,
        "In-Sample": getattr(df_in_sample[TARGET_COL], method)(),
        "Out-of-Sample": getattr(df_out_of_sample[TARGET_COL], method)(),
    }
    for label, method in (
        ("Mean", "mean"),
        ("Median", "median"),
        ("Std", "std"),
        ("Min", "min"),
        ("Max", "max"),
        ("Skew", "skew"),
        ("Kurtosis", "kurtosis"),
    )
]

overall_stats_df = pd.DataFrame(overall_stats)
overall_stats_df

### Step 3: Save Results

In [None]:
# Save descriptive statistics for excess return
# Create the output directory first so the write does not fail when it does not exist yet
os.makedirs(os.path.dirname(overall_stats_path), exist_ok=True)
overall_stats_df.to_csv(overall_stats_path, index=False)