# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.weightstats import ztest
from scipy import stats

# 1. Perform a Z-test for comparing a sample mean to a known population mean
def perform_z_test(sample_data, population_mean):
    """Run a one-sample z-test of the sample mean against a reference mean.

    Args:
        sample_data: Observations handed to ``ztest`` as the sample.
        population_mean: Mean under the null hypothesis (``ztest``'s ``value``).

    Returns:
        The ``(z_statistic, p_value)`` pair produced by ``ztest``.
    """
    outcome = ztest(sample_data, value=population_mean)
    return outcome[0], outcome[1]

# Observed sample and the population mean assumed under the null hypothesis
population_mean = 100
sample_data = [
    88, 92, 94, 94, 96, 97, 97, 97, 99, 99,
    105, 109, 109, 109, 110, 112, 112, 113, 114, 115,
]

z_statistic, p_value = perform_z_test(sample_data, population_mean)
print(f"Z-statistic: {z_statistic}, P-value: {p_value}")

# Decision at the 5% significance level
print(
    "Reject the null hypothesis: Significant difference found."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: No significant difference found."
)

# 2. Simulate random data and perform hypothesis testing
# Seed the global generator so the 30 simulated N(100, 15) draws are reproducible
np.random.seed(0)
simulated_data = np.random.normal(100, 15, 30)

# Test the simulated sample against the same reference mean used above
z_stat_simulated, p_value_simulated = perform_z_test(
    sample_data=simulated_data,
    population_mean=population_mean,
)
print(f"Simulated Z-statistic: {z_stat_simulated}, Simulated P-value: {p_value_simulated}")

# 3. One-sample Z-test using Python (already implemented above)

# 4. Two-tailed Z-test and visualize decision region
def visualize_two_tailed_z_test(sample_data, population_mean=0, alpha=0.05):
    """Plot the decision regions of a two-tailed one-sample z-test.

    Draws the standard normal density, shades both rejection regions for the
    chosen significance level and marks the observed z-statistic.

    Args:
        sample_data: Observations to test.
        population_mean: Mean under the null hypothesis. The default of 0
            matches ``ztest``'s own default so existing one-argument calls
            behave as before.
        alpha: Two-sided significance level.
    """
    z_statistic, _ = ztest(sample_data, value=population_mean)
    critical_value = stats.norm.ppf(1 - alpha / 2)

    # Widen the axis when needed so the observed statistic is always visible.
    x_limit = max(4.0, abs(z_statistic) + 1.0)
    x = np.linspace(-x_limit, x_limit, 1000)
    density = stats.norm.pdf(x)

    plt.figure(figsize=(10, 5))
    plt.plot(x, density, label='Standard Normal Distribution')
    plt.fill_between(x, density, where=np.abs(x) >= critical_value,
                     color='red', alpha=0.2, label='Rejection Region')
    plt.axvline(x=-critical_value, color='red', linestyle='--', label='Critical Value')
    plt.axvline(x=critical_value, color='red', linestyle='--')
    plt.axvline(x=z_statistic, color='green', linestyle='--', label='Z-statistic')

    plt.title('Two-tailed Z-test Decision Region')
    plt.xlabel('Z-score')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.show()

# Test against the stated population mean rather than the implicit default of 0.
visualize_two_tailed_z_test(sample_data, population_mean)

# 5. Calculate Type I and Type II errors during hypothesis testing
def visualize_errors(alpha=0.05, effect_size=2.5):
    """Illustrate Type I and Type II error regions of a two-tailed z-test.

    The null distribution is the standard normal. The alternative is a unit
    variance normal centred at ``effect_size``. Type I error is the null
    probability mass beyond the critical values; Type II error is the
    alternative probability mass between them.

    Args:
        alpha: Two-sided significance level (Type I error rate).
        effect_size: Mean of the alternative distribution on the z scale.
    """
    critical_value = stats.norm.ppf(1 - alpha / 2)

    # Cover both distributions regardless of the sign of the shift.
    x = np.linspace(min(-4, effect_size - 4), max(4, effect_size + 4), 1000)
    null_pdf = stats.norm.pdf(x)
    alt_pdf = stats.norm.pdf(x, loc=effect_size)

    # Probability of staying inside the acceptance region when H1 is true.
    beta = (stats.norm.cdf(critical_value - effect_size)
            - stats.norm.cdf(-critical_value - effect_size))

    rejection = np.abs(x) > critical_value

    plt.plot(x, null_pdf, label='Null distribution (H0)')
    plt.plot(x, alt_pdf, linestyle='--', label='Alternative distribution (H1)')
    plt.fill_between(x, null_pdf, where=rejection, color='red', alpha=0.5,
                     label=f'Type I Error Region (α = {alpha:.3g})')
    plt.fill_between(x, alt_pdf, where=~rejection, color='orange', alpha=0.5,
                     label=f'Type II Error Region (β = {beta:.3f})')
    plt.axvline(x=-critical_value, color='grey', linestyle=':')
    plt.axvline(x=critical_value, color='grey', linestyle=':')

    plt.title('Type I and Type II Errors Visualization')
    plt.xlabel('Z-score')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.show()

# Render the error-region illustration with the function's default alpha of 0.05.
visualize_errors()

# 6. Independent T-test and interpret results
def independent_t_test(data1, data2):
    """Compare the means of two independent samples with ``stats.ttest_ind``.

    Args:
        data1: Observations of the first group.
        data2: Observations of the second group.

    Returns:
        Tuple ``(t_statistic, p_value)``.
    """
    outcome = stats.ttest_ind(data1, data2)
    return outcome.statistic, outcome.pvalue

# Two independent groups of 30 draws whose generating means differ by 10
dataA = np.random.normal(loc=100, scale=10, size=30)
dataB = np.random.normal(loc=110, scale=10, size=30)

t_statistic_indep, p_value_indep = independent_t_test(data1=dataA, data2=dataB)
print(f"Independent T-test: T-statistic: {t_statistic_indep}, P-value: {p_value_indep}")

# Decision at the 5% significance level
print(
    "Reject the null hypothesis: Significant difference between groups."
    if p_value_indep < 0.05
    else "Fail to reject the null hypothesis: No significant difference between groups."
)

# 7. Paired sample T-test and visualize comparison results
def paired_t_test(data_before, data_after):
    """Run a paired-sample t-test with ``stats.ttest_rel``.

    Args:
        data_before: Measurements taken before the treatment.
        data_after: Measurements of the same subjects after the treatment,
            in the same order.

    Returns:
        Tuple ``(t_statistic, p_value)``.
    """
    outcome = stats.ttest_rel(data_before, data_after)
    return outcome.statistic, outcome.pvalue

# Thirty simulated plant heights before treatment. The array-valued mean is
# what makes this call return 30 values.
before_fertilizer = np.random.normal(75 + np.random.rand(30), 5)
# Draw one growth effect PER PLANT. A single scalar effect would make every
# paired difference identical, leaving the paired t-test with zero variance.
after_fertilizer = before_fertilizer + np.random.normal(8, 2, size=before_fertilizer.shape)

t_statistic_paird_test_result = paired_t_test(before_fertilizer, after_fertilizer)
print(f"Paired T-test: T-statistic: {t_statistic_paird_test_result[0]}, P-value: {t_statistic_paird_test_result[1]}")

plt.figure(figsize=(10,5))
sns.boxplot(data=[before_fertilizer, after_fertilizer])
plt.xticks([0, 1], ['Before Fertilizer', 'After Fertilizer'])
plt.title('Comparison of Plant Heights Before and After Fertilizer')
plt.ylabel('Height')
plt.show()

# 8. Simulate data and perform both Z-test and T-test then compare results
sample_size = 30
data_a = np.random.normal(loc=100, scale=15, size=sample_size)
data_b = np.random.normal(loc=110, scale=15, size=sample_size)

# Both statistics must answer the same question (A vs B) to be comparable,
# so use the two-sample form of ztest rather than testing A against a constant.
z_stat_a_b = ztest(data_a, data_b)[0]
t_stat_a_b = independent_t_test(data_a, data_b)[0]

print(f"Z-statistic for A vs B: {z_stat_a_b}, T-statistic for A vs B: {t_stat_a_b}")

# 9. Calculate confidence interval for a sample mean
def confidence_interval(data, confidence=0.95):
    """Return a two-sided t-based confidence interval for the sample mean.

    Args:
        data: One-dimensional sequence of at least two observations.
        confidence: Confidence level strictly between 0 and 1. The default
            of 0.95 reproduces the previously hard-coded behaviour.

    Returns:
        Tuple ``(lower, upper)`` bounding the interval.

    Raises:
        ValueError: If ``confidence`` is outside (0, 1) or fewer than two
            observations are supplied.
    """
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")
    n = len(data)
    if n < 2:
        raise ValueError("at least two observations are required")

    mean = np.mean(data)
    sem = stats.sem(data)  # Standard error of the mean (ddof=1)
    # Two-sided interval: put (1 - confidence) / 2 in each tail.
    ci_range = stats.t.ppf((1 + confidence) / 2, n - 1) * sem
    return mean - ci_range, mean + ci_range

# Compute and report the 95% confidence interval for the example sample.
ci_lower , ci_upper = confidence_interval(sample_data)
print(f"Confidence interval for sample mean: ({ci_lower}, {ci_upper})")

# Significance of Confidence Interval
print("The confidence interval provides a range of values that likely contain the true population mean.")

# 10. Calculate the margin of error for a given confidence level using sample data
def margin_of_error(data, confidence=0.95):
    """Return the t-based margin of error of the sample mean.

    This is the half-width of the two-sided confidence interval for the mean.

    Args:
        data: One-dimensional sequence of at least two observations.
        confidence: Confidence level strictly between 0 and 1. The default
            of 0.95 reproduces the previously hard-coded behaviour.

    Returns:
        The margin of error as a float.

    Raises:
        ValueError: If ``confidence`` is outside (0, 1) or fewer than two
            observations are supplied.
    """
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")
    n = len(data)
    if n < 2:
        raise ValueError("at least two observations are required")

    sem = stats.sem(data)  # Standard error of the mean (ddof=1)
    # Two-sided critical value: (1 - confidence) / 2 in each tail.
    return stats.t.ppf((1 + confidence) / 2, n - 1) * sem

# Report the margin of error of the example sample at the 95% level.
margin_error_sample_data = margin_of_error(sample_data)
print(f"Margin of Error for sample data: {margin_error_sample_data}")

# 11. Implement Bayes' Theorem in Python and explain the process
def bayes_theorem(prior_A_given_B: float, prior_B: float, likelihood_A: float) -> float:
    """Apply Bayes' theorem: P(A|B) = P(B|A) * P(A) / P(B).

    The parameter names are kept for backward compatibility. The two
    numerator factors are multiplied together, so together they must supply
    P(B|A) and P(A).

    Args:
        prior_A_given_B: First numerator factor, e.g. the likelihood P(B|A).
        prior_B: The evidence P(B); must be greater than 0.
        likelihood_A: Second numerator factor, e.g. the prior P(A).

    Returns:
        The posterior probability P(A|B), a value in [0, 1].

    Raises:
        ValueError: If an input is not a probability, if ``prior_B`` is 0,
            or if the inputs are mutually inconsistent (P(A and B) > P(B)).
    """
    for name, value in (("prior_A_given_B", prior_A_given_B),
                        ("prior_B", prior_B),
                        ("likelihood_A", likelihood_A)):
        if not 0.0 <= value <= 1.0:
            raise ValueError(f"{name} must be a probability in [0, 1], got {value}")
    if prior_B == 0:
        raise ValueError("prior_B must be greater than 0")

    posterior_A_given_B = (likelihood_A * prior_A_given_B) / prior_B
    # The joint probability can never exceed P(B); allow only rounding noise.
    if posterior_A_given_B > 1.0 + 1e-12:
        raise ValueError(
            "inconsistent probabilities: the numerator exceeds prior_B, "
            "which would give a posterior above 1"
        )
    return min(posterior_A_given_B, 1.0)

prior_A_given_B = .8 # P(B|A): probability of observing B when A holds
prior_B = .6 # P(B): overall probability of observing B
likelihood_A = .3 # P(A): prior probability of A (chosen so that P(A and B) <= P(B))

posterior_probability_A_given_B = bayes_theorem(prior_A_given_B , prior_B , likelihood_A )
print(f"Posterior Probability of A given B: {posterior_probability_A_given_B}")

# Explanation of Bayes' Theorem Process:
# Bayes' theorem allows us to update our beliefs based on new evidence.
# It combines our prior belief (prior probability) with new evidence (likelihood) to form a new belief (posterior probability).

# 12. Perform a Chi-square test for independence between two categorical variables in Python
def chi_square_test(observed):
    """Run a chi-square test of independence on a contingency table.

    Args:
        observed: Table of observed counts.

    Returns:
        Tuple ``(chi2_stat, p_val, dof, expected_freqs)`` from
        ``stats.chi2_contingency``.
    """
    return tuple(stats.chi2_contingency(observed))

# 2x2 contingency table of observed counts
observed_data = np.array([
    [10, 20],
    [20, 25],
])
chi2_stat, p_val, dof, expected_freqs = chi_square_test(observed_data)

print(f"Chi-square Statistic: {chi2_stat}, P-value: {p_val}, Degrees of Freedom: {dof}")
print(f"Expected Frequencies:\n{expected_freqs}")

# Decision at the 5% significance level
print(
    "Reject the null hypothesis: There is a significant association between the variables."
    if p_val < .05
    else "Fail to reject the null hypothesis: No significant association between the variables."
)

#13. Calculate expected frequencies for Chi-square test based on observed data.
def calculate_expected_frequencies(observed):
    """Return the expected counts of a contingency table under independence.

    Args:
        observed: Table of observed counts.

    Returns:
        The expected-frequency array, i.e. the fourth element of the
        ``stats.chi2_contingency`` result.
    """
    return stats.chi2_contingency(observed)[3]

# Report the expected counts for the contingency table defined above.
expected_freqs_calculated = calculate_expected_frequencies(observed_data)
print(f"Calculated Expected Frequencies:\n{expected_freqs_calculated}")

#14. Perform goodness-of-fit test using Python to compare observed data to an expected distribution.
def goodness_of_fit(observed_values, expected_values):
    """Run a chi-square goodness-of-fit test with ``stats.chisquare``.

    Args:
        observed_values: Observed counts per category.
        expected_values: Expected counts per category.

    Returns:
        Tuple ``(chi2_stat, p_val)``.
    """
    outcome = stats.chisquare(observed_values, expected_values)
    return outcome.statistic, outcome.pvalue

# Observed vs expected counts for two categories (both sum to 80)
observed_values_gof = [50, 30]
expected_values_gof = [40, 40]

chi2_gof, p_val_gof = goodness_of_fit(observed_values_gof, expected_values_gof)
print(f"Goodness-of-Fit Chi-square Statistic: {chi2_gof}, P-value: {p_val_gof}")

# Decision at the 5% significance level
print(
    "Reject the null hypothesis: Observed distribution differs from expected."
    if p_val_gof < .05
    else "Fail to reject the null hypothesis: No significant difference."
)

#15. Simulate and visualize Chi-square distribution.
def plot_chi_square_distribution(df):
    """Plot the chi-square probability density on the interval [0, 50].

    Args:
        df: Degrees of freedom of the chi-square distribution.
    """
    grid = np.linspace(0, 50, 1000)
    density = stats.chi2.pdf(grid, df)

    plt.plot(grid, density, label=f'Chi-square Distribution df={df}')
    plt.title('Chi-square Distribution')
    plt.xlabel('Value')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.show()

# Plot the chi-square density for 3 degrees of freedom.
plot_chi_square_distribution(df=3)

#16. Implement F-test using Python to compare variances of two random samples.
def f_test(data1, data2):
    """Two-sided F-test for equality of the variances of two samples.

    The statistic is the ratio of the unbiased sample variances,
    ``var(data1) / var(data2)``. Under the null hypothesis, and assuming
    normal data, it follows an F distribution with ``(n1 - 1, n2 - 1)``
    degrees of freedom.

    Args:
        data1: Observations of the first sample (at least two).
        data2: Observations of the second sample (at least two).

    Returns:
        Tuple ``(f_statistic, p_val_f)`` with the two-sided p-value.

    Raises:
        ValueError: If a sample has fewer than two observations or the
            second sample has zero variance.
    """
    sample1 = np.asarray(data1, dtype=float)
    sample2 = np.asarray(data2, dtype=float)
    if sample1.size < 2 or sample2.size < 2:
        raise ValueError("each sample needs at least two observations")

    var1 = np.var(sample1, ddof=1)
    var2 = np.var(sample2, ddof=1)
    if var2 == 0:
        raise ValueError("the second sample has zero variance; F is undefined")

    f_statistic = var1 / var2
    dfn, dfd = sample1.size - 1, sample2.size - 1
    # Two-sided p-value: double the smaller tail, capped at 1.
    tail = min(stats.f.cdf(f_statistic, dfn, dfd), stats.f.sf(f_statistic, dfn, dfd))
    p_val_f = min(1.0, 2 * tail)
    return f_statistic, p_val_f

# np.random.normal takes the standard deviation as `scale`; `std` is not a
# valid keyword and raises TypeError.
data_var_1 = np.random.normal(loc=100, scale=10, size=30)
data_var_2 = np.random.normal(loc=110, scale=15, size=30)

f_stat, p_val_f = f_test(data_var_1, data_var_2)
print(f"F-statistic for variance comparison: {f_stat}, P-value: {p_val_f}")

if p_val_f < .05:
    print("Reject the null hypothesis: Significant difference in variances.")
else:
    print("Fail to reject the null hypothesis: No significant difference in variances.")

#17. Perform ANOVA test to compare means between multiple groups.
def anova_test(groups):
    """Run a one-way ANOVA across the given groups with ``stats.f_oneway``.

    Args:
        groups: Iterable of samples, one per group.

    Returns:
        Tuple ``(f_stat, p_val_anova)``.
    """
    outcome = stats.f_oneway(*groups)
    return outcome.statistic, outcome.pvalue

# np.random.normal takes the standard deviation as `scale`; `std` is not a
# valid keyword and raises TypeError.
group_1 = np.random.normal(loc=100, scale=10, size=30)
group_2 = np.random.normal(loc=110, scale=10, size=30)
group_3 = np.random.normal(loc=120, scale=10, size=30)

# Keep the (F-statistic, p-value) tuple itself. Formatting it into a string
# first made the later `anova_result[1] < .05` compare a character to a float.
anova_result = anova_test([group_1, group_2, group_3])
print(f"ANOVA result (F-statistic and P-value): {anova_result}")

if anova_result[1] < .05:
    print("Reject the null hypothesis: At least one group mean is significantly different.")
else:
    print("Fail to reject the null hypothesis: No significant difference in group means.")

#18. Perform one-way ANOVA test using Python and plot results.
def plot_anova_results(groups):
    """Box-plot the groups and annotate the plot when ANOVA is significant.

    Runs ``anova_test`` on ``groups`` and draws one box per group. When the
    p-value is below 0.05 it prints "Significant Difference" and draws a red
    bar with that caption above the boxes. The figure is shown in both cases.

    Args:
        groups: Sequence of samples, one per group (any number of groups).
    """
    f_stat, p_val_anova = anova_test(groups)

    n_groups = len(groups)
    # 'Group A', 'Group B', ... while letters last, then fall back to numbers.
    labels = [
        f'Group {chr(ord("A") + i)}' if i < 26 else f'Group {i + 1}'
        for i in range(n_groups)
    ]

    sns.boxplot(data=list(groups), palette="Set3")
    plt.title('One-way ANOVA Results')
    plt.ylabel('Values')
    plt.xticks(range(n_groups), labels)

    if p_val_anova < .05:
        significance_text = "Significant Difference"
        color = "red"

        print(significance_text)

        # Place the annotation bar slightly above the tallest observation.
        y_max = max(max(g) for g in groups)
        y_min = min(min(g) for g in groups)
        y_bar = y_max + (y_max - y_min) * 0.05

        # Span the bar from the first to the last box and caption it once.
        plt.plot([0, n_groups - 1], [y_bar, y_bar], '-', color=color, lw=.75)
        plt.text((n_groups - 1) / 2, y_bar, significance_text,
                 color=color, ha='center', va='bottom')

    # Show the figure whether or not the result is significant.
    plt.show()

# Plot the three simulated groups (fixes the misspelled `grouop_2`/`grouop_3` names).
plot_anova_results([group_1, group_2, group_3])


# 19. Check assumptions (normality, independence, and equal variance) for ANOVA
def check_assumptions(groups):
    """Check the normality and equal-variance assumptions of ANOVA.

    Args:
        groups: Sequence of samples, one per group.

    Returns:
        Tuple ``(normality_results, equal_variance)`` where
        ``normality_results`` maps ``'Group <i>'`` (1-based) to whether the
        Shapiro-Wilk p-value exceeds 0.05, and ``equal_variance`` tells
        whether the Levene p-value exceeds 0.05.
    """
    # Shapiro-Wilk per group: a p-value above 0.05 means normality is not rejected
    normality_results = {
        f'Group {position}': stats.shapiro(sample)[1] > 0.05
        for position, sample in enumerate(groups, start=1)
    }

    # Levene's test across all groups for homogeneity of variances
    levene_pvalue = stats.levene(*groups)[1]

    return normality_results, levene_pvalue > 0.05

# Check the ANOVA assumptions for the three simulated groups; element 0 holds
# the per-group normality flags, element 1 the equal-variance flag.
assumptions_results = check_assumptions([group_1, group_2, group_3])
print("Normality Results:", assumptions_results[0])
print("Equal Variance Test (Levene's test):", assumptions_results[1])

# 20. Perform a two-way ANOVA test using Python to study the interaction between two factors
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Simulate 60 observations with two binary factors. The draw order
# (factor A, factor B, noise) is fixed by the seed.
np.random.seed(0)
factor_A = np.random.choice(['A1', 'A2'], size=60)
factor_B = np.random.choice(['B1', 'B2'], size=60)
# Baseline N(20, 5) response, raised by 5 for level A2 and by 3 for level B2
response = (
    np.random.normal(loc=20, scale=5, size=60)
    + (factor_A == 'A2') * 5
    + (factor_B == 'B2') * 3
)

df_two_way = pd.DataFrame({
    'Factor_A': factor_A,
    'Factor_B': factor_B,
    'Response': response,
})

# Fit main effects plus interaction, then print the type-II ANOVA table
two_way_formula = 'Response ~ C(Factor_A) * C(Factor_B)'
model_two_way = ols(two_way_formula, data=df_two_way).fit()
anova_table_two_way = sm.stats.anova_lm(model_two_way, typ=2)
print(anova_table_two_way)

# 21. Visualize the results of the two-way ANOVA
# seaborn has no `interactionplot`. A point plot of the cell means with one
# line per Factor_B level is the usual interaction plot.
sns.pointplot(x='Factor_A', y='Response', hue='Factor_B', data=df_two_way,
              markers=['D', 'o'], linestyles=['-', '--'])
plt.title('Interaction Plot for Two-Way ANOVA')
plt.ylabel('Response')
plt.show()

# 22. Visualize the F-distribution
def plot_f_distribution(df1, df2, alpha=0.05):
    """Plot the F density and mark the upper-tail critical value.

    Args:
        df1: Numerator degrees of freedom.
        df2: Denominator degrees of freedom.
        alpha: Upper-tail significance level for the critical value. The
            default of 0.05 reproduces the previously hard-coded 0.95 quantile.
    """
    critical_value = stats.f.ppf(1 - alpha, df1, df2)

    # Extend the axis beyond the default [0, 5] when the critical value needs it.
    x_f = np.linspace(0, max(5, critical_value * 1.25), 1000)
    y_f = stats.f.pdf(x_f, df1, df2)

    plt.plot(x_f, y_f, label=f'F-distribution (df1={df1}, df2={df2})')
    plt.title('F-Distribution')
    plt.xlabel('F-value')
    plt.ylabel('Probability Density')
    # This is a significance threshold, not a confidence interval.
    plt.axvline(x=critical_value, color='red', linestyle='--',
                label=f'Critical Value (α = {alpha:.3g})')
    plt.legend()
    plt.show()

# Plot the F density for df1=3 and df2=30.
plot_f_distribution(df1=3, df2=30)

# 23. Perform a one-way ANOVA test in Python and visualize results with boxplots to compare group means
def one_way_anova_and_plot(groups):
    """Run a one-way ANOVA, print the result and box-plot the groups.

    Args:
        groups: Sequence of samples, one per group (any number of groups).
    """
    f_statistic, p_value = stats.f_oneway(*groups)

    print(f"One-Way ANOVA: F-statistic: {f_statistic}, P-value: {p_value}")

    n_groups = len(groups)
    # 'Group A', 'Group B', ... while letters last, then fall back to numbers.
    labels = [
        f'Group {chr(ord("A") + i)}' if i < 26 else f'Group {i + 1}'
        for i in range(n_groups)
    ]

    # Plot every supplied group rather than assuming exactly three.
    sns.boxplot(data=list(groups), palette="Set3")
    plt.title('One-Way ANOVA Results')
    plt.ylabel('Values')
    plt.xticks(range(n_groups), labels)
    plt.show()

# Run and plot the one-way ANOVA for the three simulated groups.
one_way_anova_and_plot([group_1, group_2, group_3])

# 24. Simulate random data from a normal distribution and perform hypothesis testing to evaluate the means
# Two simulated samples of 50 draws: N(100, 10) and N(105, 10)
simulated_data_a = np.random.normal(100, 10, 50)
simulated_data_b = np.random.normal(105, 10, 50)

# One-sample z-test of A against population_mean, and two-sample t-test of A vs B
z_stat_simulated_test = perform_z_test(
    simulated_data_a.tolist(),
    population_mean,
)[0]
t_stat_simulated_test = independent_t_test(
    simulated_data_a.tolist(),
    simulated_data_b.tolist(),
)[0]

print(f"Simulated Z-statistic: {z_stat_simulated_test}, Simulated T-statistic: {t_stat_simulated_test}")

# 25. Perform a hypothesis test for population variance using a Chi-square distribution and interpret results
def chi_square_variance_test(sample_data, hypothesized_variance=None):
    """Right-tailed chi-square test of a population variance.

    Tests H0: sigma^2 = hypothesized variance against H1: sigma^2 is larger,
    using the statistic ``(n - 1) * s^2 / sigma0^2`` with ``n - 1`` degrees
    of freedom.

    Args:
        sample_data: Sequence of at least two observations.
        hypothesized_variance: Variance under the null hypothesis. When
            omitted, the module-level ``population_variance`` is used so
            existing one-argument calls keep working.

    Returns:
        Tuple ``(chi_square_statistic, p_value_chi_square)``.

    Raises:
        ValueError: If fewer than two observations are given or the
            hypothesized variance is not positive.
    """
    if hypothesized_variance is None:
        # Backward-compatible fallback to the global used by earlier callers.
        hypothesized_variance = population_variance
    if hypothesized_variance <= 0:
        raise ValueError("the hypothesized variance must be positive")

    n = len(sample_data)
    if n < 2:
        raise ValueError("at least two observations are required")

    sample_variance = np.var(sample_data, ddof=1)  # Unbiased sample variance
    chi_square_statistic = (n - 1) * sample_variance / hypothesized_variance
    p_value_chi_square = stats.chi2.sf(chi_square_statistic, n - 1)  # Right tail

    return chi_square_statistic, p_value_chi_square

# Known population variance for testing; chi_square_variance_test reads this global
population_variance = 100
chi_square_statistic_result, p_value_chi_square_result = chi_square_variance_test(
    sample_data
)
print(f"Chi-square Statistic: {chi_square_statistic_result}, P-value: {p_value_chi_square_result}")

# Decision at the 5% significance level
print(
    "Reject the null hypothesis: Significant difference in population variance."
    if p_value_chi_square_result < .05
    else "Fail to reject the null hypothesis: No significant difference in population variance."
)

# 26. Perform a Z-test for comparing proportions between two datasets or groups
def z_test_proportions(success_a, total_a, success_b, total_b):
    """Return the pooled two-proportion z-statistic for group A versus group B.

    Args:
        success_a: Number of successes in group A.
        total_a: Number of trials in group A (must be positive).
        success_b: Number of successes in group B.
        total_b: Number of trials in group B (must be positive).

    Returns:
        The z-statistic ``(p_a - p_b) / SE`` where ``SE`` uses the pooled
        proportion.

    Raises:
        ValueError: If a total is not positive, a success count lies outside
            ``[0, total]``, or the pooled proportion is 0 or 1 (the standard
            error is then zero and the statistic is undefined).
    """
    if total_a <= 0 or total_b <= 0:
        raise ValueError("total_a and total_b must be positive")
    if not 0 <= success_a <= total_a or not 0 <= success_b <= total_b:
        raise ValueError("success counts must lie between 0 and their totals")

    proportion_a = success_a / total_a
    proportion_b = success_b / total_b

    pooled_proportion = (success_a + success_b) / (total_a + total_b)
    if pooled_proportion in (0, 1):
        raise ValueError("pooled proportion is 0 or 1; the z-statistic is undefined")

    standard_error = np.sqrt(
        pooled_proportion * (1 - pooled_proportion) * (1 / total_a + 1 / total_b)
    )
    z_stat_proportion = (proportion_a - proportion_b) / standard_error

    return z_stat_proportion

# Successes and trials for the two groups being compared
success_a, total_a = 40, 100
success_b, total_b = 30, 100

z_stat_proportions_result = z_test_proportions(
    success_a=success_a,
    total_a=total_a,
    success_b=success_b,
    total_b=total_b,
)
print(f"Z-statistic for Proportions: {z_stat_proportions_result}")

# 27. Implement an F-test for comparing the variances of two datasets and interpret results
def f_test_variances(data1, data2):
    """Two-sided F-test for equality of the variances of two datasets.

    The statistic is the ratio of the unbiased sample variances,
    ``var(data1) / var(data2)``. Under the null hypothesis, and assuming
    normal data, it follows an F distribution with ``(n1 - 1, n2 - 1)``
    degrees of freedom.

    Args:
        data1: Observations of the first dataset (at least two).
        data2: Observations of the second dataset (at least two).

    Returns:
        Tuple ``(f_statistic, p_val_f)`` with the two-sided p-value.

    Raises:
        ValueError: If a dataset has fewer than two observations or the
            second dataset has zero variance.
    """
    sample1 = np.asarray(data1, dtype=float)
    sample2 = np.asarray(data2, dtype=float)
    if sample1.size < 2 or sample2.size < 2:
        raise ValueError("each dataset needs at least two observations")

    var1 = np.var(sample1, ddof=1)
    var2 = np.var(sample2, ddof=1)
    if var2 == 0:
        raise ValueError("the second dataset has zero variance; F is undefined")

    f_statistic = var1 / var2
    dfn, dfd = sample1.size - 1, sample2.size - 1
    # Two-sided p-value: double the smaller tail, capped at 1.
    tail = min(stats.f.cdf(f_statistic, dfn, dfd), stats.f.sf(f_statistic, dfn, dfd))
    p_val_f = min(1.0, 2 * tail)
    return f_statistic, p_val_f

# np.random.normal takes the standard deviation as `scale`; `std` is not a
# valid keyword and raises TypeError.
data_var_1 = np.random.normal(loc=100, scale=10, size=30)
data_var_2 = np.random.normal(loc=110, scale=15, size=30)

f_stat, p_val_f = f_test_variances(data_var_1, data_var_2)
print(f"F-statistic for variance comparison: {f_stat}, P-value: {p_val_f}")

if p_val_f < .05:
    print("Reject the null hypothesis: Significant difference in variances.")
else:
    print("Fail to reject the null hypothesis: No significant difference in variances.")

# 28. Perform a Chi-square test for goodness of fit with simulated data and analyze results
def chi_square_goodness_of_fit(observed_values, expected_values):
    """Compare observed counts with expected counts via ``stats.chisquare``.

    Args:
        observed_values: Observed counts per category.
        expected_values: Expected counts per category.

    Returns:
        Tuple ``(chi2_stat, p_val_goodness_of_fit)``.
    """
    outcome = stats.chisquare(observed_values, expected_values)
    return outcome.statistic, outcome.pvalue

# Observed vs expected counts for two categories (both sum to 80)
observed_values_gof = [50, 30]
expected_values_gof = [40, 40]

chi2_gof, p_val_gof = chi_square_goodness_of_fit(
    observed_values_gof,
    expected_values_gof,
)
print(f"Goodness-of-Fit Chi-square Statistic: {chi2_gof}, P-value: {p_val_gof}")

# Decision at the 5% significance level
print(
    "Reject the null hypothesis: Observed distribution differs from expected."
    if p_val_gof < .05
    else "Fail to reject the null hypothesis: No significant difference."
)

