### Hypothesis Testing

In [4]:
import pandas as pd
import scipy.stats as st
import numpy as np

In [5]:
# Read each cleaned dataset into its own DataFrame.
# Every file is parsed with the same delimiter and encoding, so the options are stated once.
clean_csv_options = {"sep": ";", "encoding": "utf-8"}

df_income = pd.read_csv("../data/clean/income_statement_data.csv", **clean_csv_options)
df_ma = pd.read_csv("../data/clean/mergers_acquisitions_data.csv", **clean_csv_options)
df_profile = pd.read_csv("../data/clean/profile_data.csv", **clean_csv_options)
df_stock = pd.read_csv("../data/clean/stock_data.csv", **clean_csv_options)

# 🟢 Hypothesis Testing: ANOVA on Acquisition Prices for Large Acquisitions

* **H₀:** μ_BLK_Large = μ_BRK_Large = μ_GS_Large = μ_JPM_Large = μ_STT_Large  
  *(The mean acquisition price for large acquisitions is the same across all acquirers.)*

* **H₁:** At least one of μ_BLK_Large, μ_BRK_Large, μ_GS_Large, μ_JPM_Large, or μ_STT_Large is different  
  *(At least one acquirer has a significantly different mean acquisition price for large acquisitions.)*


In [8]:
# Significance level; later cells in this notebook also read `alpha`.
alpha = 0.05


def _large_acquisition_prices(acquirer):
    """Return the non-null acquisition prices of `acquirer`'s 'Large' acquisitions in df_ma."""
    is_large_deal = (df_ma["acquirer"] == acquirer) & (df_ma["acquired_company_size"] == "Large")
    return df_ma.loc[is_large_deal, "acquisition_price_usd"].dropna()


blk_prices = _large_acquisition_prices("BlackRock")
brk_prices = _large_acquisition_prices("Berkshire Hathaway")
gs_prices = _large_acquisition_prices("Goldman Sachs")
jpm_prices = _large_acquisition_prices("JPMorgan Chase")
stt_prices = _large_acquisition_prices("State Street")

# One-way ANOVA across the five acquirers' price samples.
f_stat, p_value = st.f_oneway(blk_prices, brk_prices, gs_prices, jpm_prices, stt_prices)

# A NaN p-value compares False against everything, so without this guard the final
# `else` branch would report a rejection of H0 for a test that could not be computed
# (presumably the case when an acquirer has no large acquisitions -- verify on the data).
if np.isnan(p_value):
    print(f"The ANOVA could not be computed (p_value: {p_value}). Check that every acquirer has large acquisitions with a known price.")
elif p_value > alpha:
    print(f"Fail to reject the null hypothesis (H0). There is NO significant difference in acquisition prices for large acquisitions between BlackRock and its competitors (p_value: {p_value}).")
else:
    print(f"Reject null hypothesis (H0). There IS significant difference in acquisition prices for large acquisitions between BlackRock and its competitors (p_value: {p_value}).")

Fail to reject the null hypothesis (H0). There is NO significant difference in acquisition prices for large acquisitions between BlackRock and its competitors (p_value: 0.33646991426786765).


# 🟢 Hypothesis Testing: ANOVA on Acquisition Prices Across Acquirers

* **H₀:** μ_BLK = μ_BRK = μ_GS = μ_JPM = μ_STT  
  *(The mean acquisition price is the same for all acquirers in a given industry.)*

* **H₁:** At least one of μ_BLK, μ_BRK, μ_GS, μ_JPM, or μ_STT is different  
  *(At least one acquirer has a significantly different mean acquisition price in a given industry.)*


In [10]:
# Industries in which more than one distinct acquirer appears.
industry_counts = df_ma.groupby("industry")["acquirer"].nunique()
shared_industries = industry_counts[industry_counts > 1].index

# One record per shared industry; the DataFrame is built once after the loop.
anova_results = []

# NOTE(review): each industry is tested separately at the same alpha, with no
# multiple-comparison correction -- confirm this is intended.
for industry in shared_industries:
    industry_data = df_ma[df_ma["industry"] == industry]
    # One sample of non-null prices per acquirer active in this industry.
    acquirer_groups = [
        group["acquisition_price_usd"].dropna()
        for _, group in industry_data.groupby("acquirer")
    ]

    # Run the ANOVA only when there are at least two acquirers and every one of
    # them contributes more than one price.
    if len(acquirer_groups) > 1 and all(len(g) > 1 for g in acquirer_groups):
        f_stat, p_value = st.f_oneway(*acquirer_groups)
        # Use the notebook-wide significance level rather than a second hard-coded 0.05.
        conclusion = "Reject H0" if p_value < alpha else "Fail to reject H0"
    else:
        # NaN (not a string) keeps the numeric columns numeric; these rows are
        # removed by the dropna() below either way.
        f_stat, p_value, conclusion = np.nan, np.nan, "Not enough data"

    anova_results.append({"Industry": industry, "F-Statistic": f_stat, "P-Value": p_value, "Conclusion": conclusion})

# dropna() discards industries that could not be tested (and any test whose result is NaN),
# which is why the displayed index can have gaps.
df_anova_results = pd.DataFrame(anova_results).dropna()

display(df_anova_results)

Unnamed: 0,Industry,F-Statistic,P-Value,Conclusion
1,Energy,14.678975,0.06187,Fail to reject H0
4,Investment & Financial Services,1.50754,0.307366,Fail to reject H0
5,Manufacturing,0.817671,0.432517,Fail to reject H0


# 🟢 Hypothesis Testing: Chi2 Test on Acquisition of Mature Companies by BlackRock

* **H₀:** p_BLK_matured = p_competitor_matured (tested separately for each competitor: BRK, GS, JPM, STT)

    * The proportion of mature companies among BlackRock's acquisitions is the same as the proportion among the competitor's acquisitions.

* **H₁:** p_BLK_matured ≠ p_competitor_matured

    * The proportion of mature companies among BlackRock's acquisitions differs from the competitor's. (The chi-square test is non-directional: it detects a difference, not whether BlackRock's proportion is higher.)


In [12]:
# One record per competitor; the DataFrame is built once after the loop.
chi2_results = []

# Competitors are all acquirers other than BlackRock. Missing acquirer labels are
# skipped: they match no row and would produce an all-zero contingency row.
competitors = [comp for comp in df_ma["acquirer"].dropna().unique() if comp != "BlackRock"]

# BlackRock's counts per `matured` value do not depend on the competitor, so compute them once.
blackrock_data = df_ma[df_ma["acquirer"] == "BlackRock"]["matured"].value_counts()

# Pairwise chi-square test: BlackRock against each competitor in turn.
for competitor in competitors:
    competitor_data = df_ma[df_ma["acquirer"] == competitor]["matured"].value_counts()

    # Rows: BlackRock, competitor. Columns: the observed `matured` values.
    # A value seen for only one of the two acquirers gets a zero count for the other.
    contingency_table = pd.DataFrame({
        "BlackRock": blackrock_data,
        competitor: competitor_data
    }).fillna(0).astype(int).T.values

    # chi2_contingency raises ValueError when an expected frequency is zero, which
    # happens if either acquirer has no counted rows; report that as "Not enough data"
    # instead of crashing the cell.
    is_testable = contingency_table.shape == (2, 2) and (contingency_table.sum(axis=1) > 0).all()

    if is_testable:
        # NOTE(review): for 2x2 tables chi2_contingency applies Yates' continuity
        # correction by default; with small counts consider Fisher's exact test --
        # confirm against the sample sizes.
        chi2_stat, p_value, dof, expected = st.chi2_contingency(contingency_table)

        conclusion = "Reject H0" if p_value < alpha else "Fail to reject H0"

        chi2_results.append({
            "Competitor": competitor,
            "Chi2 Statistic": chi2_stat,
            "P-Value": p_value,
            "Conclusion": conclusion
        })
    else:
        # NaN (not a string) keeps the numeric columns numeric; these rows are
        # removed by the dropna() below either way.
        chi2_results.append({
            "Competitor": competitor,
            "Chi2 Statistic": np.nan,
            "P-Value": np.nan,
            "Conclusion": "Not enough data"
        })

# dropna() discards the competitors that could not be tested.
df_chi2_results = pd.DataFrame(chi2_results).dropna()
display(df_chi2_results)

Unnamed: 0,Competitor,Chi2 Statistic,P-Value,Conclusion
0,Goldman Sachs,0.0,1.0,Fail to reject H0
1,Berkshire Hathaway,0.605021,0.436669,Fail to reject H0
2,JPMorgan Chase,0.0,1.0,Fail to reject H0
3,State Street,0.0,1.0,Fail to reject H0


# 🟢 Hypothesis Testing: Industry-Level T-Test on Acquisition Prices

* **H₀:** μ_BLK_Industry = μ_BRK_Industry  
  *(The mean acquisition price in a given industry is the same for BlackRock and Berkshire Hathaway.)*

* **H₁:** μ_BLK_Industry < μ_BRK_Industry  
  *(BlackRock acquires companies at a lower price than Berkshire Hathaway in a given industry.)*

In [14]:
# One record per shared industry; the DataFrame is built once after the loop.
t_test_results = []

# Industries in which both BlackRock and Berkshire Hathaway made at least one acquisition.
blk_industries = set(df_ma[df_ma['acquirer'] == 'BlackRock']['industry'].dropna().unique())
brk_industries = set(df_ma[df_ma['acquirer'] == 'Berkshire Hathaway']['industry'].dropna().unique())

shared_industries = blk_industries.intersection(brk_industries)

# Restrict the dataset to the shared industries.
df_shared = df_ma[df_ma['industry'].isin(shared_industries)]

# Iterate in sorted order: a set of strings has no stable iteration order across
# kernel restarts, which would make the row order and index of the result table
# differ from run to run. (Assumes the industry labels are mutually comparable,
# e.g. all strings -- TODO confirm.)
for industry in sorted(shared_industries):
    in_industry = df_shared["industry"] == industry
    blk_prices = df_shared.loc[in_industry & (df_shared["acquirer"] == "BlackRock"), "acquisition_price_usd"].dropna()
    brk_prices = df_shared.loc[in_industry & (df_shared["acquirer"] == "Berkshire Hathaway"), "acquisition_price_usd"].dropna()

    # Each sample needs more than one observation for the test to be run.
    if len(blk_prices) > 1 and len(brk_prices) > 1:
        # Welch's t-test (unequal variances), one-sided: H1 is mean(BlackRock) < mean(Berkshire Hathaway).
        t_stat, p_value = st.ttest_ind(blk_prices, brk_prices, equal_var=False, alternative="less")

        conclusion = "Reject H0" if p_value < alpha else "Fail to reject H0"

        t_test_results.append({
            "Industry": industry,
            "T-Statistic": t_stat,
            "P-Value": p_value,
            "Conclusion": conclusion
        })
    else:
        # NaN (not a string) keeps the numeric columns numeric; these rows are
        # removed by the dropna() below either way.
        t_test_results.append({
            "Industry": industry,
            "T-Statistic": np.nan,
            "P-Value": np.nan,
            "Conclusion": "Not enough data"
        })

# dropna() discards the industries that could not be tested.
df_t_test_results = pd.DataFrame(t_test_results).dropna()

display(df_t_test_results)

Unnamed: 0,Industry,T-Statistic,P-Value,Conclusion
0,Energy,-3.831315,0.081088,Fail to reject H0
3,Technology & Software,-3.408245,0.085486,Fail to reject H0
