Significance level (α) = 0.05; factor = city; dependent variable = Surface Albedo


In [2]:
from pathlib import Path

import pandas as pd

# Single place to change where the per-city CSVs live; every file below is
# read from this directory.
DATA_DIR = Path("/content")


def load_city_data(city: str) -> pd.DataFrame:
    """Read the merged climate CSV for one city.

    Parameters
    ----------
    city : str
        Lower-case city tag used in the file name, e.g. "houston" for
        ``merged_climate_data_houston.csv``.

    Returns
    -------
    pd.DataFrame
        The CSV contents exactly as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(DATA_DIR / f"merged_climate_data_{city}.csv")


# Load your data
houston_df = load_city_data("houston")
la_df = load_city_data("la")
ny_df = load_city_data("ny")
miami_df = load_city_data("miami")
seattle_df = load_city_data("seattle")
tampa_df = load_city_data("tampa")


In [4]:
# Merge the "Surface Albedo" column of every city into a single wide frame,
# one column per city.
# Series.rename() returns a new, renamed Series. Assigning to `.name` on the
# object returned by df[...] instead relabels that object in place, which may
# be shared with the source DataFrame depending on the pandas version.
houston_surf_albedo = houston_df["Surface Albedo"].rename("Surface_Albedo_Houston")
la_surf_albedo = la_df["Surface Albedo"].rename("Surface_Albedo_LA")
ny_surf_albedo = ny_df["Surface Albedo"].rename("Surface_Albedo_NY")
miami_surf_albedo = miami_df["Surface Albedo"].rename("Surface_Albedo_Miami")
seattle_surf_albedo = seattle_df["Surface Albedo"].rename("Surface_Albedo_Seattle")
tampa_surf_albedo = tampa_df["Surface Albedo"].rename("Surface_Albedo_Tampa")

# axis=1 places the series side by side, aligned on their row index.
surf_albedo_df = pd.concat(
    [
        houston_surf_albedo,
        la_surf_albedo,
        ny_surf_albedo,
        miami_surf_albedo,
        seattle_surf_albedo,
        tampa_surf_albedo,
    ],
    axis=1,
)

# Sanity check: number of city columns and non-null observations per city.
# The tests further down receive these columns as-is, without dropping NaNs.
print("Number of columns:", surf_albedo_df.shape[1])
print("\nCount per column:")
print(surf_albedo_df.count())

Number of columns: 6

Count per column:
Surface_Albedo_Houston    27
Surface_Albedo_LA         27
Surface_Albedo_NY         27
Surface_Albedo_Miami      27
Surface_Albedo_Seattle    27
Surface_Albedo_Tampa      27
dtype: int64


**Normality Test: Shapiro Wilk Test**

**Check if the distribution of the Surface Albedo for each city is normal**

In [5]:
from scipy.stats import shapiro
# Drop the first column
# df2 = df.drop(df.columns[0], axis=1)

# Restrict the test to the numeric columns of the merged frame.
numeric_cols = surf_albedo_df.select_dtypes(include='number').columns

# Shapiro-Wilk per city column; each entry keeps the W statistic and p-value
# in the order shapiro() returns them.
results = {
    city_col: dict(zip(("W_statistic", "p_value"), shapiro(surf_albedo_df[city_col])))
    for city_col in numeric_cols
}

# One summary line per column.
for city_col, outcome in results.items():
    print(f"{city_col}: W={outcome['W_statistic']:.4f}, p={outcome['p_value']:.4g}")

Surface_Albedo_Houston: W=0.9540, p=0.2682
Surface_Albedo_LA: W=0.9476, p=0.1875
Surface_Albedo_NY: W=0.9625, p=0.4199
Surface_Albedo_Miami: W=0.8722, p=0.003274
Surface_Albedo_Seattle: W=0.9286, p=0.06375
Surface_Albedo_Tampa: W=0.7708, p=4.449e-05


**Test for Homogeneity of Variances: Levene's Test (robust to non-normality)**

In [6]:
import pandas as pd
from scipy.stats import levene

# df = pd.read_csv("/content/qbmult_dataset.csv")

# Split groups
# arms_male = df[df["SEX"] == 1]["ARMS"]
# arms_female = df[df["SEX"] == 2]["ARMS"]

# Levene's test for equal variances across the six city samples.
# The samples are passed in the same order as before (Miami ahead of NY).
levene_samples = [
    surf_albedo_df[column]
    for column in (
        'Surface_Albedo_Houston',
        'Surface_Albedo_LA',
        'Surface_Albedo_Miami',
        'Surface_Albedo_NY',
        'Surface_Albedo_Seattle',
        'Surface_Albedo_Tampa',
    )
]
stat, p = levene(*levene_samples)

print("Levene statistic:", stat)
print("p value:", p)

Levene statistic: 28.78635426468311
p value: 1.2924987840810369e-20


**Test for Homogeneity: Bartlett's Test (Parametric)**

In [7]:

from scipy.stats import bartlett

# Bartlett's test for equal variances, fed the six city columns in the same
# order used for the other variance test in this notebook.
bartlett_columns = (
    'Surface_Albedo_Houston',
    'Surface_Albedo_LA',
    'Surface_Albedo_Miami',
    'Surface_Albedo_NY',
    'Surface_Albedo_Seattle',
    'Surface_Albedo_Tampa',
)
bartlett_outcome = bartlett(*(surf_albedo_df[column] for column in bartlett_columns))
stat, p = bartlett_outcome

print("Bartlett statistic:", stat)
print("p value:", p)



Bartlett statistic: 240.49558328980942
p value: 6.010813203961508e-50


**T-test** (Not used)

In [None]:

# import pandas as pd
# from scipy.stats import ttest_ind

# # Load your dataset
# df = pd.read_csv("/content/qbmult_dataset.csv")

# # Drop missing values if any
# df = df.dropna(subset=["ARMS", "SEX"])

# # Split data by gender
# arms_male = df[df["SEX"] == 1]["ARMS"]
# arms_female = df[df["SEX"] == 2]["ARMS"]

# # Run independent samples t-test
# t_stat, p_value = ttest_ind(arms_male, arms_female, equal_var=False)   # Welch version

# print("t statistic:", t_stat)
# print("p value:", p_value)


t statistic: 15.585022528601815
p value: 2.621630951194982e-40


**Mann-Whitney U test (Non-parametric)**

Null hypothesis (H₀): The distributions of both populations are identical.

Alternative hypothesis (H₁): The distributions of the two populations are not identical.

(Not used)

In [None]:

# from scipy.stats import mannwhitneyu

# # Split the data
# group1 = df[df["SEX"] == 1]["ARMS"].dropna()
# group2 = df[df["SEX"] == 2]["ARMS"].dropna()

# # Mann Whitney U test
# u_stat, p_value = mannwhitneyu(group1, group2, alternative="two-sided")

# print("U statistic:", u_stat)
# print("p value:", p_value)


In [10]:

import pandas as pd
from scipy.stats import f_oneway


In [None]:
# Not used
# df = pd.read_csv("/content/qbmult_dataset.csv")

# # Extract ARMS variable by IPAQ groups
# arms_0 = df[df["IPAQ"] == 0]["ARMS"]
# arms_1 = df[df["IPAQ"] == 1]["ARMS"]
# arms_2 = df[df["IPAQ"] == 2]["ARMS"]

# groups = df.groupby("IPAQ")

# print("\nNormality test by IPAQ:")
# for name, group in groups:
#     data = group["ARMS"].dropna()
#     n=len(data)
#     stat, p = shapiro(group["ARMS"])
#     print(f"IPAQ = {name}: n = {n}, W = {stat:.4f}, p = {p:.4g}")


**Kruskal-Wallis test**
The non-parametric equivalent of one-way ANOVA.

In [8]:

import pandas as pd
from scipy.stats import kruskal

# Load your data
# df = pd.read_csv("/content/qbmult_dataset.csv")

# Drop missing values
# df = df.dropna(subset=["ARMS", "IPAQ"])

# Split into groups
# group0 = df[df["IPAQ"] == 0]["ARMS"]
# group1 = df[df["IPAQ"] == 1]["ARMS"]
# group2 = df[df["IPAQ"] == 2]["ARMS"]

# Kruskal-Wallis H test across the six city samples (one sample per column).
kruskal_samples = []
for column in ('Surface_Albedo_Houston',
               'Surface_Albedo_LA',
               'Surface_Albedo_Miami',
               'Surface_Albedo_NY',
               'Surface_Albedo_Seattle',
               'Surface_Albedo_Tampa'):
    kruskal_samples.append(surf_albedo_df[column])

stat, p = kruskal(*kruskal_samples)

print("Kruskal Wallis statistic:", stat)
print("p value:", p)

Kruskal Wallis statistic: 118.30385940523739
p value: 7.176724067002293e-24


In [11]:

# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was last run with so results stay reproducible.
%pip install -q scikit-posthocs==0.11.4

Collecting scikit-posthocs
  Downloading scikit_posthocs-0.11.4-py3-none-any.whl.metadata (5.8 kB)
Downloading scikit_posthocs-0.11.4-py3-none-any.whl (33 kB)
Installing collected packages: scikit-posthocs
Successfully installed scikit-posthocs-0.11.4


In [13]:


import scikit_posthocs as sp

# Dunn's pairwise post-hoc test with Bonferroni correction, one sample per city.
cols = [surf_albedo_df[col] for col in surf_albedo_df.columns]
dunn_pvalues = sp.posthoc_dunn(cols, p_adjust="bonferroni")

# With a list input the result is labelled 1..k in the order the samples were
# passed (the recorded output shows 1-6), which hides which city is which.
# Relabel rows and columns with the city column names, in that same order.
dunn_pvalues.index = dunn_pvalues.columns = surf_albedo_df.columns
dunn_pvalues

Unnamed: 0,1,2,3,4,5,6
1,1.0,0.1984213,1.777851e-08,3.336216e-20,1.668637e-08,2.580978e-12
2,0.1984213,1.0,0.004688433,3.415014e-11,0.004508456,1.502484e-05
3,1.777851e-08,0.004688433,1.0,0.009682652,1.0,1.0
4,3.336216e-20,3.415014e-11,0.009682652,1.0,0.01004962,0.5036977
5,1.668637e-08,0.004508456,1.0,0.01004962,1.0,1.0
6,2.580978e-12,1.502484e-05,1.0,0.5036977,1.0,1.0


In [None]:
# Not used
# # -*- coding: utf-8 -*-
# """ANOVA

# Automatically generated by Colab.

# Original file is located at
#     https://colab.research.google.com/drive/1Tx1mQqVHxj0_Y63uhd1lbhGZKA3_3PMX
# """

# import pandas as pd
# import statsmodels.api as sm
# from statsmodels.formula.api import ols

# # Dataset
# IRON_FORM = pd.DataFrame({
#     "score": [20.5, 28.1, 27.8, 27.0, 28.0,
# 25.2, 25.3, 27.1, 20.5, 31.3,26.3, 24.0, 26.2, 20.2, 23.7,
# 34.0, 17.1, 26.8, 23.7, 24.9,29.5, 34.0, 27.5, 29.4, 27.9,
# 26.2, 29.9, 29.5, 30.0, 35.6,36.5, 44.2, 34.1, 30.3, 31.4,
# 33.1, 34.1, 32.9, 36.3, 25.5],
#     "group": ["Type 1"]*10 + ["Type 2"]*10 + ["Type 3"]*10+["Type 4"]*10
# })

# # Fit model
# model = ols('score ~ C(group)', data=IRON_FORM).fit()

# # ANOVA table
# anova_table = sm.stats.anova_lm(model, typ=2)
# print(anova_table)

# # Compute group means
# means = IRON_FORM.groupby('group')['score'].mean().reset_index()
# means = means.rename(columns={'score': 'mean'})

# print("\nGroup Means:")
# print(means)


# from statsmodels.stats.multicomp import pairwise_tukeyhsd
# tukey = pairwise_tukeyhsd(IRON_FORM['score'], IRON_FORM['group'])
# print(tukey)

# import pandas as pd
# from scipy.stats import shapiro


In [None]:
# Already done
# from scipy.stats import bartlett

# # # Bartlett test for equal variances across IPAQ groups
# # stat, p = bartlett(arms_0, arms_1, arms_2)

# # print("Bartlett statistic:", stat)
# # print("p value:", p)

# # # Levene test for equal variances
# # stat, p = levene(arms_0, arms_1, arms_2)

# # print("Levene statistic:", stat)
# # print("p value:", p)

# One-way ANOVA across the six city samples.
# NOTE(review): the recorded Shapiro-Wilk output rejects normality for Miami
# and Tampa, and the recorded Levene/Bartlett outputs reject equal variances,
# so treat this F test as a cross-check of Kruskal-Wallis rather than the
# primary result - confirm with the analysis owner.
anova_columns = [
    'Surface_Albedo_Houston',
    'Surface_Albedo_LA',
    'Surface_Albedo_Miami',
    'Surface_Albedo_NY',
    'Surface_Albedo_Seattle',
    'Surface_Albedo_Tampa',
]
f_stat, p_val = f_oneway(*[surf_albedo_df[column] for column in anova_columns])

print("F statistic:", f_stat)
print("p value:", p_val)

F statistic: 2601.0598038723783
p value: 2.9895393735450893e-148


In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey HSD needs long format: one row per observation, with the city label
# in one column and the measured value in another.
df_melted = pd.melt(surf_albedo_df, var_name='city', value_name='surface_albedo')

# Pairwise comparison of city means at a family-wise error rate of 0.05.
# NOTE(review): the output saved under this cell lists Temperature_* groups,
# so it was not produced from this Surface Albedo frame - re-run before
# quoting any of those numbers.
albedo_values = df_melted['surface_albedo']
city_labels = df_melted['city']
tukey = pairwise_tukeyhsd(albedo_values, city_labels, alpha=0.05)

print(tukey)

              Multiple Comparison of Means - Tukey HSD, FWER=0.05              
       group1              group2       meandiff p-adj  lower    upper   reject
-------------------------------------------------------------------------------
Temperature_Houston      Temperature_LA   -2.921   0.0  -3.3948  -2.4473   True
Temperature_Houston   Temperature_Miami    4.485   0.0   4.0112   4.9587   True
Temperature_Houston      Temperature_NY  -9.1175   0.0  -9.5913  -8.6437   True
Temperature_Houston Temperature_Seattle -10.2009   0.0 -10.6746  -9.7271   True
Temperature_Houston   Temperature_Tampa   1.6488   0.0    1.175   2.1226   True
     Temperature_LA   Temperature_Miami    7.406   0.0   6.9322   7.8798   True
     Temperature_LA      Temperature_NY  -6.1964   0.0  -6.6702  -5.7227   True
     Temperature_LA Temperature_Seattle  -7.2798   0.0  -7.7536   -6.806   True
     Temperature_LA   Temperature_Tampa   4.5698   0.0    4.096   5.0436   True
  Temperature_Miami      Temperature_NY 

In [None]:

# Not used
# !pip install pingouin


**Welch ANOVA**
Not used

In [None]:

# import pingouin as pg

# # Welch ANOVA: ARMS by IPAQ
# welch = pg.welch_anova(dv="ARMS", between="IPAQ", data=df)

# print(welch)

In [None]:


# import pingouin as pg

# gh = pg.pairwise_gameshowell(dv="ARMS", between="IPAQ", data=df)
# print(gh)

Since the tests show a statistically significant difference in Surface Albedo between cities, the values cannot be averaged (pooled) across cities here.