Significance level (α) = 0.05; factor = city; dependent variable = Solar Zenith Angle (sza)

In [None]:
import pandas as pd
# Read each city's merged climate CSV into its own DataFrame.
# The paths are listed in the same order as the names they are unpacked into.
houston_df, la_df, ny_df, miami_df, seattle_df, tampa_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/merged_climate_data_houston.csv",
        "/content/merged_climate_data_la.csv",
        "/content/merged_climate_data_ny.csv",
        "/content/merged_climate_data_miami.csv",
        "/content/merged_climate_data_seattle.csv",
        "/content/merged_climate_data_tampa.csv",
    )
)


In [None]:
# Merge the "Solar Zenith Angle" column of every city into a single DataFrame
# with one column per city.
# Series.rename() returns a new, relabelled Series, so the column is labelled
# without assigning .name on the Series object obtained from the source frame.
houston_sza = houston_df["Solar Zenith Angle"].rename("Houston")
la_sza = la_df["Solar Zenith Angle"].rename("LA")
ny_sza = ny_df["Solar Zenith Angle"].rename("NY")
miami_sza = miami_df["Solar Zenith Angle"].rename("Miami")
seattle_sza = seattle_df["Solar Zenith Angle"].rename("Seattle")
tampa_sza = tampa_df["Solar Zenith Angle"].rename("Tampa")

# axis=1 places the Series side by side; each Series name becomes a column label.
sza_df = pd.concat(
    [houston_sza, la_sza, ny_sza, miami_sza, seattle_sza, tampa_sza],
    axis=1,
)

# Sanity check: number of city columns and non-null observations per city.
print("Number of columns:", sza_df.shape[1])
print("\nCount per column:")
print(sza_df.count())

Number of columns: 6

Count per column:
Houston    27
LA         27
NY         27
Miami      27
Seattle    27
Tampa      27
dtype: int64


**Normality Test: Shapiro Wilk Test**

**Check if the distribution of the Solar Zenith Angle for each city is normal**

In [None]:
from scipy.stats import shapiro
# Columns of sza_df that hold numeric data (one per city).
numeric_cols = sza_df.select_dtypes(include='number').columns

# Shapiro-Wilk test per column: map each column name to its W statistic and
# p-value. shapiro() returns a (statistic, p-value) pair, zipped with the keys.
results = {
    city: dict(zip(("W_statistic", "p_value"), shapiro(sza_df[city])))
    for city in numeric_cols
}

# Report one line per city.
for col, res in results.items():
    print(f"{col}: W={res['W_statistic']:.4f}, p={res['p_value']:.4g}")

Houston: W=0.5730, p=1.014e-07
LA: W=0.5700, p=9.364e-08
NY: W=0.5726, p=1.003e-07
Miami: W=0.5718, p=9.818e-08
Seattle: W=0.5729, p=1.011e-07
Tampa: W=0.5761, p=1.097e-07


**Test for Homogeneity of Variance: Levene's Test (robust to non-normality)**

In [None]:
import pandas as pd
from scipy.stats import levene

# Levene's test for equality of variances of Solar Zenith Angle across cities.
# One sample (column of sza_df) per city, passed as separate positional groups.
levene_groups = [
    sza_df[city]
    for city in ("Houston", "LA", "Miami", "NY", "Seattle", "Tampa")
]
stat, p = levene(*levene_groups)

print("Levene statistic:", stat)
print("p value:", p)

Levene statistic: 0.19212305762882317
p value: 0.9651958243736715


**Test for Homogeneity: Bartlett's Test (Parametric)** (Not used)

In [None]:
# from scipy.stats import bartlett

# stat, p = bartlett(sza_df['Houston'],
#                     sza_df['LA'],
#                     sza_df['Miami'],
#                     sza_df['NY'],
#                     sza_df['Seattle'],
#                     sza_df['Tampa'])
# print("Bartlett statistic:", stat)
# print("p value:", p)

**T-test** (Not used)

In [None]:

# import pandas as pd
# from scipy.stats import ttest_ind

# # Load your dataset
# df = pd.read_csv("/content/qbmult_dataset.csv")

# # Drop missing values if any
# df = df.dropna(subset=["ARMS", "SEX"])

# # Split data by gender
# arms_male = df[df["SEX"] == 1]["ARMS"]
# arms_female = df[df["SEX"] == 2]["ARMS"]

# # Run independent samples t-test
# t_stat, p_value = ttest_ind(arms_male, arms_female, equal_var=False)   # Welch version

# print("t statistic:", t_stat)
# print("p value:", p_value)


t statistic: 15.585022528601815
p value: 2.621630951194982e-40


**Mann-Whitney U test (Non-parametric)**

Null hypothesis (H₀): The distributions of both populations are identical.

Alternative hypothesis (H₁): The distributions of the two populations are not identical.

(Not used)

In [None]:

# from scipy.stats import mannwhitneyu

# # Split the data
# group1 = df[df["SEX"] == 1]["ARMS"].dropna()
# group2 = df[df["SEX"] == 2]["ARMS"].dropna()

# # Mann Whitney U test
# u_stat, p_value = mannwhitneyu(group1, group2, alternative="two-sided")

# print("U statistic:", u_stat)
# print("p value:", p_value)


In [None]:

import pandas as pd
from scipy.stats import f_oneway


In [None]:
# Not used
# df = pd.read_csv("/content/qbmult_dataset.csv")

# # Extract ARMS variable by IPAQ groups
# arms_0 = df[df["IPAQ"] == 0]["ARMS"]
# arms_1 = df[df["IPAQ"] == 1]["ARMS"]
# arms_2 = df[df["IPAQ"] == 2]["ARMS"]

# groups = df.groupby("IPAQ")

# print("\nNormality test by IPAQ:")
# for name, group in groups:
#     data = group["ARMS"].dropna()
#     n=len(data)
#     stat, p = shapiro(group["ARMS"])
#     print(f"IPAQ = {name}: n = {n}, W = {stat:.4f}, p = {p:.4g}")


**Kruskal-Wallis test**
Non-parametric equivalent of one-way ANOVA

In [None]:
import pandas as pd
from scipy.stats import kruskal

# Kruskal-Wallis H test comparing Solar Zenith Angle across the six cities.
# One sample (column of sza_df) per city, passed as separate positional groups.
kruskal_groups = [
    sza_df[city]
    for city in ("Houston", "LA", "Miami", "NY", "Seattle", "Tampa")
]
stat, p = kruskal(*kruskal_groups)

print("Kruskal Wallis statistic:", stat)
print("p value:", p)

Kruskal Wallis statistic: 138.55957178670147
p value: 3.6208750862318413e-28


In [None]:

!pip install scikit-posthocs

Collecting scikit-posthocs
  Downloading scikit_posthocs-0.11.4-py3-none-any.whl.metadata (5.8 kB)
Downloading scikit_posthocs-0.11.4-py3-none-any.whl (33 kB)
Installing collected packages: scikit-posthocs
Successfully installed scikit-posthocs-0.11.4


In [None]:


import scikit_posthocs as sp

# Reshape to long format (one row per observation) so every value carries its
# city label. Passing a bare list of Series makes posthoc_dunn label the result
# matrix 1..6, which hides which pair of cities each p-value belongs to.
sza_long = sza_df.melt(var_name="city", value_name="solar_zenith_angle")

# Pairwise Dunn post-hoc comparisons with Bonferroni adjustment; the rows and
# columns of the returned p-value matrix are labelled with the city names.
sp.posthoc_dunn(
    sza_long,
    val_col="solar_zenith_angle",
    group_col="city",
    p_adjust="bonferroni",
)

Unnamed: 0,1,2,3,4,5,6
1,1.0,0.516699,6.914702e-06,0.03903217,3.349102e-09,1.0
2,0.5166989,1.0,0.05132489,4.439248e-06,0.0003511461,0.001973766
3,6.914702e-06,0.051325,1.0,1.211007e-14,1.0,2.209295e-10
4,0.03903217,4e-06,1.211007e-14,1.0,1.246692e-19,1.0
5,3.349102e-09,0.000351,1.0,1.246692e-19,1.0,1.211007e-14
6,1.0,0.001974,2.209295e-10,1.0,1.211007e-14,1.0


In [None]:
# Not used
# # -*- coding: utf-8 -*-
# """ANOVA

# Automatically generated by Colab.

# Original file is located at
#     https://colab.research.google.com/drive/1Tx1mQqVHxj0_Y63uhd1lbhGZKA3_3PMX
# """

# import pandas as pd
# import statsmodels.api as sm
# from statsmodels.formula.api import ols

# # Dataset
# IRON_FORM = pd.DataFrame({
#     "score": [20.5, 28.1, 27.8, 27.0, 28.0,
# 25.2, 25.3, 27.1, 20.5, 31.3,26.3, 24.0, 26.2, 20.2, 23.7,
# 34.0, 17.1, 26.8, 23.7, 24.9,29.5, 34.0, 27.5, 29.4, 27.9,
# 26.2, 29.9, 29.5, 30.0, 35.6,36.5, 44.2, 34.1, 30.3, 31.4,
# 33.1, 34.1, 32.9, 36.3, 25.5],
#     "group": ["Type 1"]*10 + ["Type 2"]*10 + ["Type 3"]*10+["Type 4"]*10
# })

# # Fit model
# model = ols('score ~ C(group)', data=IRON_FORM).fit()

# # ANOVA table
# anova_table = sm.stats.anova_lm(model, typ=2)
# print(anova_table)

# # Compute group means
# means = IRON_FORM.groupby('group')['score'].mean().reset_index()
# means = means.rename(columns={'score': 'mean'})

# print("\nGroup Means:")
# print(means)


# from statsmodels.stats.multicomp import pairwise_tukeyhsd
# tukey = pairwise_tukeyhsd(IRON_FORM['score'], IRON_FORM['group'])
# print(tukey)

# import pandas as pd
# from scipy.stats import shapiro


In [None]:
# Already done
# from scipy.stats import bartlett

# # # Bartlett test for equal variances across IPAQ groups
# # stat, p = bartlett(arms_0, arms_1, arms_2)

# # print("Bartlett statistic:", stat)
# # print("p value:", p)

# # # Levene test for equal variances
# # stat, p = levene(arms_0, arms_1, arms_2)

# # print("Levene statistic:", stat)
# # print("p value:", p)

# f_stat, p_val = f_oneway(sza_df['Houston'],
#                     sza_df['LA'],
#                     sza_df['Miami'],
#                     sza_df['NY'],
#                     sza_df['Seattle'],
#                     sza_df['Tampa'])

# print("F statistic:", f_stat)
# print("p value:", p_val)

In [None]:
# from statsmodels.stats.multicomp import pairwise_tukeyhsd

# # Reshape the DataFrame to a long format suitable for Tukey HSD
# df_melted = sza_df.melt(var_name='city', value_name='solar_zenith_angle')

# # Perform Tukey's test
# tukey = pairwise_tukeyhsd(endog=df_melted['solar_zenith_angle'],
#                           groups=df_melted['city'],
#                           alpha=0.05)

# print(tukey)

In [None]:

# Not used
# !pip install pingouin


**Welch ANOVA**
Not used

In [None]:

# import pingouin as pg

# # Welch ANOVA: ARMS by IPAQ
# welch = pg.welch_anova(dv="ARMS", between="IPAQ", data=df)

# print(welch)

In [None]:


# import pingouin as pg

# gh = pg.pairwise_gameshowell(dv="ARMS", between="IPAQ", data=df)
# print(gh)

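Since the Kruskal-Wallis test showed a statistically significant difference in Solar Zenith Angle between the cities (p < 0.05), the values should not be averaged across cities.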