In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import chi2_contingency, pearsonr
from scipy.stats.contingency import association

In [None]:
# Location of the dataset CSV. The default is the original local path; set the
# AMZ_UK_DATASET_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    'AMZ_UK_DATASET_PATH',
    '/Users/f/Documents/Ironhack/lab-eda-univariate/amz_uk_price_prediction_dataset.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the loaded DataFrame (a cell's last expression is rendered as its output).
df

In [None]:
# Contingency table: one row per product `category`, one column per `isBestSeller` value,
# each cell holding the number of products in that combination.
crosstab_result = pd.crosstab(index=df['category'], columns=df['isBestSeller'])
crosstab_result

In [None]:
# Calculate the proportion (in %) of best-sellers for each category.
# The share is computed from `crosstab_result` and attached with `.assign`, which returns a new
# DataFrame. Wrapping the table in `pd.DataFrame(...)` is not a guaranteed copy: depending on the
# pandas version it can share its data with `crosstab_result`, so adding a column there may also
# add it to the contingency table that later cells (Chi-square, Cramér's V, stacked bar) reuse.
best_seller_share = crosstab_result[True] / crosstab_result.sum(axis=1) * 100

# Sort the categories based on the proportion of best-sellers in descending order
crosstab_result_sorted = (
    crosstab_result
    .assign(proportion_best_seller=best_seller_share)
    .sort_values(by='proportion_best_seller', ascending=False)
)

# Display the sorted crosstab table
crosstab_result_sorted


In [None]:
# Conduct a Chi-square test to determine if the best-seller distribution is independent of the product category.
# `chi2_contingency` returns (statistic, p-value, degrees of freedom, expected frequencies);
# only the first two are kept, which also avoids assigning to `_` (IPython's last-output variable).
chi2_statistic, chi2_p_value = chi2_contingency(crosstab_result)[:2]

chi2_statistic, chi2_p_value

# A p-value of 0.0 indicates that there is an association between the two variables.
# Given the extremely low p-value, we reject the null hypothesis of independence.
# This implies that there's a significant association between `isBestSeller` and `category` in the dataset.


In [None]:
# Cramér's V: strength of the association in the contingency table (0 = none, 1 = perfect).
association(crosstab_result, method="cramer")

# The Cramér's V value of 0.12 suggests a weak association between category and best-seller status.
# Though statistically significant, the strength of this relationship is not very strong in practical terms.

In [None]:
# Stacked bar chart of the contingency table: one bar per product category,
# split into segments by best-seller status.
crosstab_result.plot.bar(stacked=True)

In [None]:
# Quartiles and interquartile range of the price column
prices = df['price']
Q1, Q3 = prices.quantile(0.25), prices.quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only the rows whose price lies within the fences (both ends inclusive)
df_no_outliers = df[prices.between(lower_bound, upper_bound)]
df_no_outliers

In [None]:
# Use a violin plot to visualize the distribution of price across different product categories. Filter out the top 20 categories based on count for better visualization.
# Plots `price` per `category` on the outlier-free data, restricted to the 20 most frequent
# categories (value_counts sorts by count, most frequent first) so the violins stay readable.
top_20_mask = df_no_outliers['category'].isin(
    df_no_outliers['category'].value_counts().head(20).index
)
sns.violinplot(data=df_no_outliers[top_20_mask], x='category', y='price', palette="coolwarm")
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Number of products per category (value_counts sorts by count, most frequent first)
category_counts = df_no_outliers['category'].value_counts()

# The 20 most frequent categories are therefore the first 20 index labels
top_20_categories = category_counts.index[:20]

# Restrict the outlier-free data to products in those categories
in_top_20 = df_no_outliers['category'].isin(top_20_categories)
df_no_outliers_top_20_categories = df_no_outliers[in_top_20]

# Violin plot of the price distribution, one violin per category
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=df_no_outliers_top_20_categories, x='category', y='price', ax=ax)
ax.set_title('Distribution of Price Across Top 20 Product Categories')
ax.set_xlabel('Product Category')
ax.set_ylabel('Price')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Which product category tends to have the highest median price? (All categories, no top-N filter.)
# Median price per category on the outlier-free data
median_price_by_category = df_no_outliers.groupby('category')['price'].agg('median')

# Label of the category whose median is the largest, and that median itself
category_with_highest_median_price = median_price_by_category.idxmax()
highest_median_price = median_price_by_category.loc[category_with_highest_median_price]

print("Category with the highest median price:", category_with_highest_median_price)
print("Highest median price:", highest_median_price)


In [None]:
# The 10 categories with the most products in the outlier-free data
category_sizes = df_no_outliers['category'].value_counts()
top10_categories = category_sizes.nlargest(10).index

# Rows belonging to one of those 10 categories
in_top10 = df_no_outliers['category'].isin(top10_categories)
df_top10_categories = df_no_outliers[in_top10]
top10_categories

In [None]:
# Bar chart of the average price for the top 10 product categories (based on count);
# seaborn's barplot aggregates `price` per category with its default estimator, the mean.
fig, ax = plt.subplots(figsize=(10, 6))
barplot = sns.barplot(x='category', y='price', data=df_top10_categories, palette='coolwarm', ax=ax)
plt.xticks(rotation=45, ha='right')  # slanted, right-aligned labels so category names stay legible
plt.tight_layout()
plt.show()

In [None]:
# Which product category commands the highest average price? (All categories, no top-N filter.)
# Mean price per category on the outlier-free data
average_price_by_category = df_no_outliers.groupby('category')['price'].agg('mean')

# Category with the largest mean, and the mean itself
category_highest_avg_price = average_price_by_category.idxmax()
highest_avg_price = average_price_by_category.loc[category_highest_avg_price]

# NOTE(review): the message prints a `$` sign — confirm this matches the dataset's currency.
print(f"The product category '{category_highest_avg_price}' commands the highest average price of ${highest_avg_price:.2f}.")


In [None]:
# Restrict the outlier-free data to the 10 most frequent categories
top10_categories = df_no_outliers['category'].value_counts().nlargest(10).index
is_top10 = df_no_outliers['category'].isin(top10_categories)
df_top10 = df_no_outliers[is_top10]

# Side-by-side box plots of the `stars` rating, one box per category
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='category', y='stars', data=df_top10, ax=ax)
ax.set_title('Distribution of Product Ratings by Category (Top 10)')
ax.set_xlabel('Category')
ax.set_ylabel('Rating')
plt.xticks(rotation=45, ha='right')  # slanted labels for readability
plt.tight_layout()
plt.show()

In [None]:
# Which category tends to receive the highest median rating from customers? (All categories, no top-N filter.)
# Median `stars` rating per category on the outlier-free data
median_ratings = df_no_outliers.groupby('category')['stars'].agg('median')

# idxmax returns the first category when several share the highest median
highest_median_category = median_ratings.idxmax()

print("Category with the highest median rating:", highest_median_category)


In [None]:
# Correlation coefficient between price and rating (pandas' default method, Pearson)
price_column = df_no_outliers['price']
stars_column = df_no_outliers['stars']
correlation_coefficient = price_column.corr(stars_column, method='pearson')

print("Correlation coefficient between price and stars:", correlation_coefficient)

In [None]:
# Pearson correlation coefficient and its p-value for price vs. rating.
# Rows with a missing price or rating are dropped first so the result is computed on the same
# pairs as `Series.corr` in the previous cell, which excludes missing values on its own.
price_stars = df_no_outliers[['price', 'stars']].dropna()
correlation_coefficient, p_value = pearsonr(price_stars['price'], price_stars['stars'])
correlation_coefficient, p_value
# There is a significant correlation between product price and its rating.
# (The p-value shows statistical significance; the coefficient's magnitude shows how strong the relationship is.)

In [None]:
# Scatter plot of product rating (x) against price (y) to look for patterns between the two.
# Semi-transparent markers (alpha=0.5) make overlapping points show up as darker areas.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_no_outliers['stars'], df_no_outliers['price'], alpha=0.5)
ax.set_title('Product Rating vs. Price')
ax.set_xlabel('Product Rating (Stars)')
ax.set_ylabel('Price')
ax.grid(True)
plt.show()

In [None]:
# Preparation for the correlation heatmap of all numerical variables:
# list the numeric columns with their number of distinct values, largest first.
numeric_columns = df_no_outliers.select_dtypes(include="number")
numeric_columns.nunique().sort_values(ascending=False)

In [None]:
# Distinct rating values present in the data.
# The result indicates that rating is not a continuous variable.
rating_column = df_no_outliers['stars']
rating_column.unique()

In [None]:
# Numeric columns only, without the `uid` column, as input for the correlation analysis.
numerical = df_no_outliers.select_dtypes(include="number").drop(columns=['uid'])
numerical

In [None]:
# Pairwise correlations between the selected numerical columns
correlation_matrix = numerical.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap for Selected Numerical Variables")
plt.show()

# The heatmap indicates that there is no strong correlation between the numerical variables.

In [None]:
# Examine if product prices typically follow a normal distribution using a QQ plot.
# `line='s'` adds a reference line scaled by the sample's standard deviation and shifted by its mean.
# `sm.qqplot` returns the Figure; left as the cell's last expression it can be rendered a second
# time next to the inline plot, so the cell ends with plt.show() instead.
sm.qqplot(numerical['price'], line='s')
plt.show()

# The price distribution does not follow a normal distribution.