# <font face = 'Impact' color = '#FFAEBC' > Exploring Associations between Variables </font>
#### <font face = 'Times New Roman' color = '#B5E5CF'> License: GPL v3.0</font>
#### <font face = 'Times New Roman' color = '#B5E5CF'> Author and Trainer: Paolo Hilado MSc. (Data Science)</font>
This notebook provides a comprehensive analysis of variable associations while ensuring key statistical assumptions are met. It begins with an exploratory data analysis, checking for normality, linearity, presence of outliers, and homoscedasticity using visualizations and statistical tests. After verifying assumptions, the notebook applies appropriate correlation techniques such as Pearson, Spearman, and Kendall's Tau-b, ensuring robust insights into the relationships between variables.

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import qdesc as qd

In [None]:
# Load the SyntheticRetail.xlsx workbook into the DataFrame `df`.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_excel("SyntheticRetail.xlsx")
# Preview the first rows as the cell output (head() returns 5 rows by default).
df.head()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes) so the
# variables can be reviewed before any analysis is run.
df.info()

In [None]:
# Generate quick descriptive statistics together with the AD statistic
# (presumably Anderson-Darling -- confirm against the qdesc documentation).
qd.desc(df)

In [None]:
# Run qdesc's normality-check dashboard on the DataFrame; per the original
# note it combines the AD test with data visualizations.
qd.normcheck_dashboard(df)

In [None]:
# Correlating Customer_Count and Daily_Sales
# Pearson's r is used on the premise that both variables were found to be
# normally distributed in the normality check run earlier in this notebook.
pearson_corr, pearson_p = stats.pearsonr(df['Customer_Count'], df['Daily_Sales'])
# The p-value is printed with 4 significant digits rather than 4 fixed
# decimals, so a very small p-value appears in scientific notation instead of
# a misleading "0.0000".
print(f"Pearson correlation coefficient: {pearson_corr:.4f}, p-value: {pearson_p:.4g}")

In [None]:
# Scatterplot of Customer_Count (x) against Daily_Sales (y) to inspect the
# shape of their relationship visually.
ax = sns.scatterplot(data=df, x='Customer_Count', y='Daily_Sales')
ax.set_title('Scatter Plot for Linearity Check')
plt.show()

In [None]:
# Correlating Ad_Spend and Customer_Count
# Kendall's tau-b is chosen because one of the variables is not normally
# distributed. SciPy's kendalltau computes the tau-b variant by default.
kendall_corr, kendall_p = stats.kendalltau(df['Ad_Spend'], df['Customer_Count'])
# The p-value is printed with 4 significant digits rather than 4 fixed
# decimals, so a very small p-value appears in scientific notation instead of
# a misleading "0.0000".
print(f"Kendall's Tau-b correlation coefficient: {kendall_corr:.4f}, p-value: {kendall_p:.4g}")

In [None]:
# Scatterplot of Customer_Count (x) against Ad_Spend (y) to inspect the
# shape of their relationship visually.
ax = sns.scatterplot(data=df, x='Customer_Count', y='Ad_Spend')
ax.set_title('Scatter Plot for Linearity Check')
plt.show()

In [None]:
# Correlating Discount_Rate and Customer_Count
# Spearman's rho is chosen because one of the variables is not normally
# distributed.
spearman_corr, spearman_p = stats.spearmanr(df['Discount_Rate'], df['Customer_Count'])
# The p-value is printed with 4 significant digits rather than 4 fixed
# decimals, so a very small p-value appears in scientific notation instead of
# a misleading "0.0000".
print(f"Spearman correlation coefficient: {spearman_corr:.4f}, p-value: {spearman_p:.4g}")

In [None]:
# Scatterplot of Customer_Count (x) against Discount_Rate (y) to inspect the
# shape of their relationship visually.
ax = sns.scatterplot(data=df, x='Customer_Count', y='Discount_Rate')
ax.set_title('Scatter Plot for Linearity Check')
plt.show()