In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [2]:
# Load the dataset. The path is relative, so it is resolved against the
# notebook's working directory; edit DATA_PATH if the CSV lives elsewhere.
DATA_PATH = 'amz_uk_price_prediction_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Preview the size and the first rows instead of rendering the whole frame.
print(df.shape)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'amz_uk_price_prediction_dataset.csv'

In [None]:
#Business Question: What are the most popular product categories on Amazon UK, and how do they compare in terms of listing frequency?

### Part 1: Understanding Product Categories

In [None]:
# A notebook cell only renders its final expression, so the intermediate
# results are passed to display() explicitly to keep them visible.

# Names of the columns that hold numeric data.
display(df.select_dtypes("number").columns)

# Number of distinct values per numeric column, highest first.
display(df.select_dtypes("number").nunique().sort_values(ascending=False))

# Number of distinct values per object-dtype column, highest first.
df.select_dtypes("object").nunique().sort_values(ascending=False)

In [None]:
# Absolute and relative listing frequency for every category.
freq_table = df['category'].value_counts()
proportion_table = df['category'].value_counts(normalize=True)

# The five categories with the most listings.
most_listed_product = freq_table.head(5)

# Show the count next to the share of all listings so the two can be compared.
top_categories_summary = pd.DataFrame({
    "count": most_listed_product,
    "share": proportion_table.loc[most_listed_product.index],
})
display(top_categories_summary)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    y=most_listed_product.index,
    x=most_listed_product.values,
    # Newer seaborn releases deprecate `palette` without `hue`; mapping hue to
    # the category itself keeps one colour per bar.
    hue=most_listed_product.index,
    palette="Set2",
    dodge=False,  # keep full-width bars even though hue is set
    ax=ax,
)
# The hue legend would only repeat the y-axis labels, so drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

# Add labels and title
ax.set_title("Top 5 Most Listed Categories")
ax.set_xlabel("Count")
ax.set_ylabel("Category")
plt.show()

In [None]:
# Pie chart of the five most-listed categories. The wedge percentages are
# computed over the plotted values only, so each figure is a category's share
# of the top-5 total, not of every listing in the dataset; the title says so.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    most_listed_product.values,
    labels=most_listed_product.index,
    autopct='%1.1f%%',
    startangle=150,
    colors=plt.cm.Set2.colors,
)
ax.set_title("Share of Listings Among the Top 5 Categories")
plt.show()

### Part 2: Delving into Product Pricing

In [None]:
# Descriptive statistics of the frame, kept in a variable and shown as the cell output.
numeric_summary = df.describe()
numeric_summary

In [None]:
# Restrict the histogram to the five most-listed categories.
top_categories = df['category'].value_counts().head(5).index
top_categories_df = df[df['category'].isin(top_categories)]

# Centrality, dispersion and spread of price across all listings.
prices = df['price']
centrality_df = prices.mean(), prices.median(), prices.mode()[0]  # (mean, median, mode)
dispersion_df = prices.var(), prices.std()                        # (variance, std)
iqr_price = prices.quantile(0.75) - prices.quantile(0.25)
range_price = prices.max() - prices.min()

# Present the statistics as one labelled table so they are actually shown.
price_summary = pd.Series(
    {
        "Mean": centrality_df[0],
        "Median": centrality_df[1],
        "Mode": centrality_df[2],
        "Variance": dispersion_df[0],
        "Standard Deviation": dispersion_df[1],
        "IQR": iqr_price,
        "Range": range_price,
    },
    name="price",
)
display(price_summary)

# Price histogram, split by category, for the top-5 categories only.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=top_categories_df, x='price', hue='category', bins=30, kde=False, ax=ax)
ax.set_title("Distribution of Product Prices in Top 5 Categories")
ax.set_xlabel('price', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.grid(axis='y', alpha=0.75)
plt.show()

# Box plot of every listing's price. The box is drawn vertically, so price is
# on the y-axis; the single x tick carries no information and is hidden.
# Missing prices are dropped because NaNs would leave the box statistics undefined.
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(prices.dropna())
ax.set_title('Box Plot of Product Prices')
ax.set_ylabel('Price', fontsize=12)
ax.set_xticks([])
plt.show()



### Part 3: Unpacking Product Ratings

In [None]:
# Descriptive statistics for the star ratings. Everything here uses
# rating-specific names so that variables holding price statistics
# (e.g. iqr_price, range_price) are not overwritten with rating values.
ratings = df['stars']

# Centrality measures
stars_centrality = {
    "Mean": ratings.mean(),
    "Median": ratings.median(),
    "Mode": ratings.mode()[0],
}
print("Centrality Measures:")
print(stars_centrality)

# Dispersion measures
stars_dispersion = {
    "Variance": ratings.var(),
    "Standard Deviation": ratings.std(),
}
print("\nDispersion Measures:")
print(stars_dispersion)

# Interquartile range and full range of the ratings
iqr_stars = ratings.quantile(0.75) - ratings.quantile(0.25)
range_stars = ratings.max() - ratings.min()
print("\nIQR of Stars:", iqr_stars)
print("Range of Stars:", range_stars)

# Shape of the rating distribution
skewness_stars = ratings.skew()
kurtosis_stars = ratings.kurtosis()
print("\nSkewness:", skewness_stars)
print("Kurtosis:", kurtosis_stars)

In [None]:
# Histogram of the star ratings with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df['stars'], bins=10, kde=True, color='blue', ax=ax)
ax.set_title("Distribution of Product Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.show()