In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Path of the CSV file holding the dataset (relative to the working directory).
url = 'amz_uk_price_prediction_dataset.csv'
# Read the whole file into a DataFrame; the following cells reference it as `df`.
df = pd.read_csv(url)
# Bare last expression: the notebook renders the frame as this cell's output.
df

## Part 1: Analyzing Best-Seller Trends Across Product Categories

### Objective: Understand the relationship between product categories and their best-seller status.

### 1. Crosstab Analysis

In [None]:
# Contingency table of raw counts: one row per product category, one column
# per isBestSeller value.
crosstab_result = pd.crosstab(index=df['category'], columns=df['isBestSeller']).round(2)

# Same table with every row divided by its own total (normalize='index'), so
# each cell is the share of that category's products with the given status.
crosstab_result_proportional = pd.crosstab(
    index=df['category'],
    columns=df['isBestSeller'],
    normalize='index',
).round(2)

# Are there categories where being a best-seller is more prevalent?
# Rank the categories by their best-seller share (the `True` column), highest first.
crosstab_result_proportional.sort_values(by=True, ascending=False)

# Most items are not best-sellers. For example, only 6% of the products in
# Smart Home Security & Lighting are best-sellers.

### 2. Statistical Tests:

In [None]:
# Conduct a Chi-square test to determine if the best-seller distribution is independent of the product category.

from scipy.stats import chi2_contingency
# Only the test statistic and the p-value are kept; the two remaining return
# values (degrees of freedom and expected frequencies) are discarded.
chi2_statistic, chi2_p_value, _, _ = chi2_contingency(crosstab_result)

display(chi2_statistic.round(2), chi2_p_value)
# Original note: there is not sufficient evidence to conclude that the variables are related.
# NOTE(review): that reading is only valid when the displayed p-value is above the
# chosen significance level (e.g. 0.05); a p-value below it means independence is
# rejected. Confirm against the cell output.

# Compute Cramér's V to understand the strength of association between best-seller status and category.

from scipy.stats.contingency import association

association(crosstab_result.astype(int), method="cramer")

# There is a small-to-medium association between the variables.

### 3. Visualizations:

In [None]:
# Visualize the relationship between product categories and the best-seller status using a stacked bar chart.

# NOTE(review): crosstab_result_top5 is not referenced by any other visible cell;
# it is kept only so that anything relying on the name keeps working.
crosstab_result_top5 = crosstab_result.head(5)

# Limit the chart to the 10 categories with the most best-sellers
# (ties are broken by the non-best-seller count).
crosstab_sorted = crosstab_result.sort_values(by=[True, False], axis=0, ascending=False).head(10)
crosstab_sorted.plot(kind="bar", stacked=True)
# Title and axis labels so the figure can be read on its own.
plt.title('Best-Seller Status in the 10 Categories with the Most Best-Sellers')
plt.xlabel('Product category')
plt.ylabel('Number of products')
plt.show()

## Part 2: Exploring Product Prices and Ratings Across Categories and Brands

### Objective: Investigate how different product categories influence product prices.



### 0. Preliminary Step: Remove outliers in product prices.

In [None]:


#For this purpose, we can use the IQR (Interquartile Range) method. 
#Products priced below the first quartile minus 1.5 times the IQR or above the third quartile plus 1.5 times the IQR will be considered outliers and 
#removed from the dataset. The next steps will be done with the dataframe without outliers.

def tukeys_test_outliers(data, factor=1.5):
    """Return the entries of ``data`` that lie outside Tukey's fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to screen.
    factor : float, default 1.5
        Multiple of the IQR used to place the fences. The default reproduces
        the classic Tukey rule; larger values flag fewer points.

    Returns
    -------
    pandas.Series
        The entries of ``data`` strictly below the lower fence or strictly
        above the upper fence, keeping their original index labels. Missing
        values are never returned, because comparisons with NaN are false.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1

    # Fences beyond which a value counts as an outlier.
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    is_outlier = (data < lower_bound) | (data > upper_bound)
    return data[is_outlier]

# Keep only the rows whose price is not one of the Tukey outlier values.
# `df` is overwritten here, so re-running this cell recomputes the fences on the
# already filtered prices and may remove further rows.
price_outliers = tukeys_test_outliers(df['price'])
df = df.loc[~df['price'].isin(price_outliers)]

### 1. Violin Plots:

In [None]:
# Use a violin plot to visualize the distribution of price across different product categories.
# Only the 20 categories with the most products are drawn, to keep the plot readable.

top_categories = df['category'].value_counts().head(20).index

# Filter the DataFrame for these categories
category_filtered_df = df[df['category'].isin(top_categories)]

# Create the violin plot. hue repeats the x variable so the palette colours each
# category; legend=False and the title match the box plot cell further down.
sns.violinplot(data=category_filtered_df, x='category', y='price', palette="coolwarm", hue='category', legend=False)
plt.xticks(rotation=45)
plt.title('Product Price Distributions')
plt.show()

# Which product category tends to have the highest median price? Don't filter here by top categories.

df.groupby('category')['price'].median().sort_values(ascending=False)

# Desktops have the highest median price.

### 3. Box Plots:

In [None]:
# Side-by-side box plots of the product ratings (stars) per category.
# Only the 10 categories with the most products are shown, for readability.
category_counts = df['category'].value_counts()
top_categories = category_counts.head(10).index
category_filtered_df = df.loc[df['category'].isin(top_categories)]

sns.boxplot(
    x='category',
    y='stars',
    hue='category',
    data=category_filtered_df,
    palette="coolwarm",
    legend=False,
)
plt.xticks(rotation=45)
plt.title('Product Rating Distributions')
plt.show()

# Which category tends to receive the highest median rating from customers?
# Every category is considered here, not only the top 10.
df.groupby('category')['stars'].median().sort_values(ascending=False)

# The category that receives the highest median rating from customers is 'Computer Memory'

## Part 3: Investigating the Interplay Between Product Prices and Ratings

### Objective: Analyze how product ratings (stars) correlate with product prices.

### 1. Correlation Coefficients:

In [None]:
# Pearson correlation coefficient between price and stars.
correlation = df['price'].corr(other=df['stars'], method='pearson')
correlation

# Is there a significant correlation between product price and its rating?
#
# There seems to be a negative correlation between price and rating, which
# indicates that the rating could decrease as the price increases.

### 2. Visualizations

In [None]:
# Scatter plot of price (y axis) against product rating in stars (x axis).
# What patterns can you observe?
sns.scatterplot(x='stars', y='price', data=df)
plt.show()

In [None]:
# Use a correlation heatmap to visualize correlations between all numerical variables.

# `potential_categorical_from_numerical` is not defined in any visible cell, so this
# cell raised a NameError on a fresh kernel. It is defined here as the numeric columns
# with few distinct values, which behave like categories and are left out of the heatmap.
# NOTE(review): the cut-off of 20 distinct values is an assumption; confirm it.
numeric_columns = df.select_dtypes("number")
potential_categorical_from_numerical = numeric_columns.loc[:, numeric_columns.nunique() < 20]

df_numerical = numeric_columns.drop(columns=potential_categorical_from_numerical.columns)

correlation_matrix = df_numerical.corr()

# Setting up the matplotlib figure with an appropriate size
plt.figure(figsize=(6, 6))

# Drawing the heatmap for the numerical columns
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap for Selected Numerical Variables")
plt.show()

In [None]:
# Examine if product prices typically follow a normal distribution using a QQ plot.
import statsmodels.api as sm

# Generating a Q-Q plot for 'price' to check if its distribution follows a normal distribution.
# The trailing semicolon suppresses the textual repr of the returned figure.
sm.qqplot(df['price'], line='s');
plt.show()

# The Q-Q plot indicates that the product price does not follow a normal distribution.