### Hypothesis Testing

In [None]:
import pandas as pd
import scipy.stats as st
import numpy as np

In [None]:
# Read the four cleaned CSV exports; all of them share the same delimiter and encoding.
csv_options = {"sep": ";", "encoding": "utf-8"}

df_income = pd.read_csv("../data/clean/income_statement_data.csv", **csv_options)
df_ma = pd.read_csv("../data/clean/mergers_acquisitions_data.csv", **csv_options)
df_profile = pd.read_csv("../data/clean/profile_data.csv", **csv_options)
df_stock = pd.read_csv("../data/clean/stock_data.csv", **csv_options)

# 🟢 Hypothesis Testing: ANOVA on Acquisition Prices for Large Acquisitions

* **H₀:** μ_BLK_Large = μ_BRK_Large = μ_GS_Large = μ_JPM_Large = μ_STT_Large  
  *(The mean acquisition price for large acquisitions is the same across all acquirers.)*

* **H₁:** At least one of μ_BLK_Large, μ_BRK_Large, μ_GS_Large, μ_JPM_Large, or μ_STT_Large is different  
  *(At least one acquirer has a significantly different mean acquisition price for large acquisitions.)*


In [None]:
# Significance level; the later test cells in this notebook read this value too.
alpha = 0.05

# Rows without an acquirer cannot be assigned to a group, so NaN is left out.
acquirers = df_ma['acquirer'].dropna().unique()

# Non-null prices of the 'Large' acquisitions, one Series per acquirer.
large_deals = df_ma[df_ma["acquired_company_size"] == 'Large']
prices_dict = {acq: large_deals.loc[large_deals["acquirer"] == acq,
                                    "acquisition_price_usd_billions"].dropna()
               for acq in acquirers}

# An empty group makes f_oneway return NaN, so only acquirers that actually
# have large acquisitions take part in the test.
tested_groups = [prices for prices in prices_dict.values() if len(prices) > 0]

if len(tested_groups) >= 2:
    f_stat, p_value = st.f_oneway(*tested_groups)
else:
    f_stat, p_value = np.nan, np.nan

# NaN compares False against alpha, which would otherwise be reported as
# "Reject H0"; an undefined p-value is labelled explicitly instead.
if np.isnan(p_value):
    conclusion = 'Not enough data'
elif p_value > alpha:
    conclusion = 'Fail to Reject H0 (no significant difference)'
else:
    conclusion = 'Reject H0 (significant difference)'

# One row per acquirer; the p-value and conclusion describe the ANOVA as a
# whole and are therefore repeated on every row.
results_df = pd.DataFrame({
    'Acquirer': acquirers,
    'Count': [len(prices) for prices in prices_dict.values()],
    'Mean in Billions (USD)': [round(prices.mean(), 2) for prices in prices_dict.values()],
    'p_value': p_value,
    'Conclusion': conclusion
})

display(results_df)

# 🟢 Hypothesis Testing: ANOVA on Acquisition Prices Across Acquirers, by Industry

* **H₀:** μ_BLK = μ_BRK = μ_GS = μ_JPM = μ_STT  
  *(The mean acquisition price is the same for all acquirers in a given industry.)*

* **H₁:** At least one of μ_BLK, μ_BRK, μ_GS, μ_JPM, or μ_STT is different  
  *(At least one acquirer has a significantly different mean acquisition price in a given industry.)*


In [None]:
# Industries in which more than one acquirer made acquisitions
industry_counts = df_ma.groupby("industry")["acquirer"].nunique()
shared_industries = industry_counts[industry_counts > 1].index

price_col = "acquisition_price_usd_billions"
anova_results = []

# Run one ANOVA per shared industry, comparing prices between acquirers
for industry in shared_industries:
    # Missing prices are removed first: a single NaN would make f_oneway
    # return NaN and the whole industry would vanish from the results.
    industry_data = df_ma[df_ma["industry"] == industry].dropna(subset=[price_col])

    # One Series of prices per acquirer active in this industry
    acquirer_groups = [group[price_col]
                       for _, group in industry_data.groupby("acquirer")]

    # Count and mean of the prices that are actually available for the industry
    count = len(industry_data)
    mean_price = round(industry_data[price_col].mean(), 2)

    # Only run ANOVA with at least two groups, each holding more than one data point
    if len(acquirer_groups) > 1 and all(len(g) > 1 for g in acquirer_groups):
        _, p_value = st.f_oneway(*acquirer_groups)
        conclusion = "Reject H0 (significant difference)" if p_value < alpha else "Fail to reject H0 (no significant difference)"
    else:
        # NaN (not a string) keeps the p_value column numeric and lets dropna remove the row
        p_value, conclusion = np.nan, "Not enough data"

    # The F-statistic is intentionally not stored
    anova_results.append({
        "Industry": industry,
        "Count": count,
        "Mean in Billions (USD)": mean_price,
        "p_value": p_value,
        "Conclusion": conclusion
    })

# Explicit columns keep this working when no industry produced a row;
# industries without a defined p-value are left out of the table.
df_anova_results = pd.DataFrame(
    anova_results,
    columns=["Industry", "Count", "Mean in Billions (USD)", "p_value", "Conclusion"],
).dropna(subset=["p_value"])

# Display the DataFrame
display(df_anova_results)

# 🟢 Hypothesis Testing: Chi-Square Test on Acquisition of Mature Companies by BlackRock vs. Each Competitor

* **H₀:** p_BLK_matured = p_competitor_matured, for each competitor in turn (BRK, GS, JPM, STT)

    * The proportion of mature companies among BlackRock's acquisitions is the same as the proportion among the competitor's acquisitions.

* **H₁:** p_BLK_matured ≠ p_competitor_matured

    * The proportion of mature companies among BlackRock's acquisitions differs from the competitor's proportion. The chi-square test is two-sided: it detects a difference but does not show whether BlackRock's proportion is higher or lower.


In [None]:
chi2_results = []

# Every acquirer other than BlackRock; a missing acquirer name is not a competitor
competitors = [comp for comp in df_ma["acquirer"].dropna().unique() if comp != "BlackRock"]

# BlackRock's counts per `matured` value are identical for every comparison,
# so they are computed once instead of inside the loop.
blackrock_data = df_ma[df_ma["acquirer"] == "BlackRock"]["matured"].value_counts()

# Compare BlackRock against each competitor with its own chi-square test
for competitor in competitors:
    competitor_data = df_ma[df_ma["acquirer"] == competitor]["matured"].value_counts()

    # Rows: BlackRock / competitor. Columns: the observed `matured` values.
    contingency_table = pd.DataFrame({
        "BlackRock": blackrock_data,
        competitor: competitor_data
    }).fillna(0).astype(int).T.values

    # chi2_contingency raises ValueError when an expected frequency is zero,
    # which happens when a whole row or column of the table is zero.
    testable = (
        contingency_table.shape == (2, 2)
        and (contingency_table.sum(axis=0) > 0).all()
        and (contingency_table.sum(axis=1) > 0).all()
    )

    if testable:
        chi2_stat, p_value, dof, expected = st.chi2_contingency(contingency_table)
        conclusion = "Reject H0" if p_value < alpha else "Fail to reject H0"
    else:
        # NaN keeps the p_value column numeric; the row itself stays visible
        p_value, conclusion = np.nan, "Not enough data"

    chi2_results.append({
        "Competitor": competitor,
        "p_value": p_value,
        "Conclusion": conclusion,
    })

# Explicit columns so the table has a stable layout even without competitors
df_chi2_results = pd.DataFrame(chi2_results, columns=["Competitor", "p_value", "Conclusion"])
display(df_chi2_results)

# 🟢 Hypothesis Testing: Industry-Level T-Test on Acquisition Prices

* **H₀:** μ_BLK_Industry >= μ_BRK_Industry  
  *(The mean acquisition price in a given industry is greater than or equal to Berkshire Hathaway's for BlackRock.)*

* **H₁:** μ_BLK_Industry < μ_BRK_Industry  
  *(BlackRock acquires companies at a lower price than Berkshire Hathaway in a given industry.)*

In [None]:
price_col = "acquisition_price_usd_billions"

# One result row per shared industry
t_test_results = []

# Industries in which both BlackRock and Berkshire Hathaway made acquisitions
blk_industries = set(df_ma[df_ma['acquirer'] == 'BlackRock']['industry'].dropna().unique())
brk_industries = set(df_ma[df_ma['acquirer'] == 'Berkshire Hathaway']['industry'].dropna().unique())

shared_industries = blk_industries.intersection(brk_industries)

# Restrict the dataset to those shared industries
df_shared = df_ma[df_ma['industry'].isin(shared_industries)]

# Sorted iteration: a set has no stable order, so without it the row order of
# the result table could change from one kernel restart to the next.
for industry in sorted(shared_industries):
    in_industry = df_shared[df_shared["industry"] == industry]
    blk_prices = in_industry.loc[in_industry["acquirer"] == "BlackRock", price_col].dropna()
    brk_prices = in_industry.loc[in_industry["acquirer"] == "Berkshire Hathaway", price_col].dropna()

    if len(blk_prices) > 1 and len(brk_prices) > 1:
        # Welch's t-test (unequal variances); one-sided: H1 is mean(BLK) < mean(BRK)
        t_stat, p_value = st.ttest_ind(blk_prices, brk_prices, equal_var=False, alternative="less")
        conclusion = "Reject H0" if p_value < alpha else "Fail to reject H0"
    else:
        # NaN marks "no test possible" and is filtered out explicitly below
        p_value, conclusion = np.nan, "Not enough data"

    # Every row carries the same keys, so filtering never depends on missing columns
    t_test_results.append({
        "Industry": industry,
        'BLK Mean Price': round(blk_prices.mean(), 2),
        'BRK Mean Price': round(brk_prices.mean(), 2),
        "p_value": p_value,
        "Conclusion": conclusion
    })

# Keep only industries for which a p-value could be computed
df_t_test_results = pd.DataFrame(
    t_test_results,
    columns=["Industry", "BLK Mean Price", "BRK Mean Price", "p_value", "Conclusion"],
).dropna(subset=["p_value"])

display(df_t_test_results)