# CEO Compensation & OLS Regression
This notebook analyzes CEO salary data and explores relationships with firm characteristics.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.mstats import winsorize
import statsmodels.api as sm
import os

# Ensure the folders that later cells write into are present;
# exist_ok=True makes re-running this cell harmless.
for output_dir in ('figures', 'report'):
    os.makedirs(output_dir, exist_ok=True)


In [None]:
# Read each raw CSV and immediately discard rows that lack any of the
# columns listed in `subset`.
salaries = pd.read_csv('salaries_2025_s1.csv').dropna(
    subset=['CompanyID', 'CEO_Salary']
)
companies = pd.read_csv('companies_2025_s1.csv').dropna(
    subset=['CompanyID', 'Firm_Size', 'Profitability', 'Industry']
)

# Inner join on CompanyID: only companies present in both tables survive.
df = salaries.merge(companies, on='CompanyID', how='inner')


In [None]:
# Descriptive statistics (as produced by DataFrame.describe) for salary,
# firm size and profitability.
stat_columns = ['CEO_Salary', 'Firm_Size', 'Profitability']
summary_stats = df[stat_columns].describe()
summary_stats


In [None]:
# CEO salary histogram: 20 bins with a KDE curve, drawn on an explicit
# 8x6 figure, saved to the figures folder, then closed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['CEO_Salary'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of CEO Salaries')
ax.set_xlabel('CEO Salary')
ax.set_ylabel('Frequency')
fig.tight_layout()
fig.savefig('figures/ceo_salary_hist.png')
plt.close(fig)


In [None]:
# Share of observations per industry as a pie chart on an 8x8 figure.
fig, ax = plt.subplots(figsize=(8, 8))
industry_counts = df['Industry'].value_counts()
industry_counts.plot.pie(autopct='%1.1f%%', startangle=90, ax=ax)
ax.set_ylabel('')  # blank out the y-axis label on the pie
ax.set_title('Industry Distribution')
fig.tight_layout()
fig.savefig('figures/industry_distribution_pie.png')
plt.close(fig)


In [None]:
# Firm size (x) against CEO salary (y), one point per row of df,
# saved to the figures folder and then closed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='Firm_Size', y='CEO_Salary', ax=ax)
ax.set_title('Firm Size vs CEO Salary')
ax.set_xlabel('Firm Size')
ax.set_ylabel('CEO Salary')
fig.tight_layout()
fig.savefig('figures/firm_size_vs_ceo_salary.png')
plt.close(fig)


In [None]:
# Winsorize CEO salary, cutting 1% on each side of the distribution,
# and store the result in a new column next to the raw values.
tail_limits = [0.01, 0.01]
df['CEO_Salary_W'] = winsorize(df['CEO_Salary'], limits=tail_limits)

# Put the raw and winsorized descriptive statistics side by side.
pre_stats = df['CEO_Salary'].describe()
post_stats = df['CEO_Salary_W'].describe()
stats_by_label = {'Before': pre_stats, 'After': post_stats}
comparison = pd.DataFrame(stats_by_label)
comparison


In [None]:
# OLS of winsorized CEO salary on firm size and profitability;
# add_constant supplies the intercept column.
y = df['CEO_Salary_W']
X = sm.add_constant(df[['Firm_Size', 'Profitability']])
model = sm.OLS(y, X).fit()

# Render the results table once, then show it and write the same text
# to the report folder.
summary_text = model.summary().as_text()
print(summary_text)

with open('report/OLS_regression_summary.txt', 'w') as report_file:
    report_file.write(summary_text)
