In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import skew, kurtosis 


## Amazon UK Product Insights: Understanding Product Categories 
## Business Question 1
What are the most popular product categories on Amazon UK, and how do they compare in terms of listing frequency?

In [None]:
# Load the Amazon UK listings CSV into a DataFrame and preview the first rows
dataset_path = 'amz_uk_price_prediction_dataset.csv'
df = pd.read_csv(dataset_path)
df.head(5)

In [None]:
# Confirm the 'category' column is present and report how many of its values are missing

has_category_column = "category" in df.columns
if not has_category_column:
    print("Column 'category' not found in dataset.")
else:
    missing_category_count = df["category"].isna().sum()
    print(missing_category_count)


In [None]:
# Count listings per product category (value_counts orders them most frequent first)
category_column = df['category']
category_counts = category_column.value_counts()

In [None]:
# Keep the first five rows of the frequency table and print them
top_5_categories = category_counts.iloc[:5]
print("Top 5 Most Listed Product Categories:\n", top_5_categories)

In [None]:
# Bar chart comparing listing counts across the top 5 categories
fig, ax = plt.subplots(figsize=(12,6))
category_names = top_5_categories.index
listing_counts = top_5_categories.values
sns.barplot(
    x=category_names,
    y=listing_counts,
    hue=category_names,  # hue mirrors x so each bar takes its own palette colour
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Product Category')
ax.set_ylabel('Number of Listings')
ax.set_title('Top 5 Most Listed Product Categories on Amazon UK')
plt.xticks(rotation=50)  # tilt the category names on the x-axis
plt.show()


In [None]:
# Pie chart of how listings split among the top 5 categories
plt.figure(figsize=(8,8))
pie_ax = top_5_categories.plot.pie(autopct='%1.1f%%', cmap='viridis', startangle=140)
pie_ax.set_title('Proportion of Top 5 Listed Product Categories')
pie_ax.set_ylabel('')  # blank out the y-axis label
plt.show()

# Key Findings
1. Frequency Table Analysis
A frequency table was generated to count product listings by category.
The top 5 most listed product categories were identified.
These top categories make up an important portion of total listings, indicating strong market demand in these areas.

2. Data Visualization
Bar Chart: Product Category Distribution
A bar chart was plotted to visualize the number of listings per category.
Some categories (for example, Sports and Outdoors) had noticeably more listings than others.
For better clarity, an additional bar chart was created for only the top 5 categories, making comparisons easier.
Pie Chart: Proportion of Top Categories
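A pie chart was used to illustrate the share of each top category relative to the other top 5 categories (the chart is built from the top 5 counts only, not the whole dataset).
It was observed that the Sports and Outdoors category dominates the listings, while the other top categories have a smaller presence.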


Business Implications:
Market Trends: High-frequency categories indicate strong consumer demand. Retailers should focus on these for inventory optimization.
Growth Opportunities: Less-listed categories could present untapped market potential for new product introductions.

## Part 2: Delving into Product Pricing
Business Question: How are products priced on Amazon UK, and are there specific price points or ranges that are more common?



In [None]:
# Coerce 'price' to a numeric dtype (unparseable entries become NaN),
# then keep only the rows that still have a price
numeric_price = pd.to_numeric(df['price'], errors='coerce')
df['price'] = numeric_price
df = df[df['price'].notna()]


In [None]:
# Central tendency of product prices: mean, median and mode
price_series = df['price']
mean_price = price_series.mean()
median_price = price_series.median()
mode_price = price_series.mode().iloc[0]  # several modes are possible; use the first one

for label, value in (("Mean", mean_price), ("Median", median_price), ("Mode", mode_price)):
    print(f"{label} Price: {value:.2f}")


**What's the average price point of products listed? How does this compare with the most common price point (mode)?**

The average price point of the products listed is £89.24, which is much higher than the most common price point (mode) of £9.99. This suggests that while the single most frequent price is £9.99, some higher-priced items are driving up the mean. The large gap between the mean and the mode indicates a right-skewed distribution, likely with a relatively small number of expensive products pulling the average upwards.

In [None]:
# Spread of product prices: variance, standard deviation, range and IQR
price_series = df['price']
variance = price_series.var()
std_dev = price_series.std()
lowest_price, highest_price = price_series.min(), price_series.max()
price_range = highest_price - lowest_price
q1, q3 = np.percentile(price_series, [25, 75])  # 25th and 75th percentiles
iqr = q3 - q1

dispersion_report = {
    "Variance": variance,
    "Standard Deviation": std_dev,
    "Price Range": price_range,
    "Interquartile Range (IQR)": iqr,
}
for label, value in dispersion_report.items():
    print(f"{label}: {value:.2f}")

## How varied are the product prices? Are there any indicators of a significant spread in prices?
The high variance, high standard deviation, and extreme price range suggest a significant spread in product prices, though the low IQR hints that most prices are concentrated in a narrower band, with outliers driving the wide range.

In [None]:
# Histogram - Distribution of Product Prices
# Cap the plotted prices at the 99th percentile so extreme outliers are excluded.
# The filter is applied BEFORE plotting: the 50 bins (and the KDE) are computed from
# the data handed to histplot, so merely zooming with xlim afterwards would leave the
# bins spread across the full outlier range and only a few wide bars in view.
price_cap = np.percentile(df['price'], 99)
prices_below_cap = df.loc[df['price'] <= price_cap, 'price']

plt.figure(figsize=(10,5))
sns.histplot(prices_below_cap, bins=50, kde=True, color='blue')
plt.xlabel('Price (£)')
plt.ylabel('Number of Products')
plt.title('Distribution of Product Prices')
plt.xlim(0, price_cap)  # same visible window as before: 0 up to the 99th percentile
plt.show()



# Issues with Readability:
Data Distribution Skew: There are outliers (very high or very low prices) and they distort the visualization.

Possible Solutions:
Use Logarithmic Scaling: Applying a logarithmic scale to the x-axis or y-axis can help reveal more detail.
Increase Bin Count: A higher number of bins in the histogram can provide a more granular view of the distribution.
Remove Outliers or Use Boxplots: If extreme values are overshadowing the main distribution, a boxplot or truncated histogram can help.


In [None]:
# Box plot of product prices, with the view limited to the 99th percentile
price_series = df['price']
upper_limit = np.percentile(price_series, 99)

plt.figure(figsize=(8,5))
box_ax = sns.boxplot(x=price_series, color='orange')
box_ax.set_xlabel('Price (£)')
box_ax.set_title('Box Plot of Product Prices')
box_ax.set_xlim(0, upper_limit)  # Exclude extreme outliers
plt.show()

In [None]:
# Sample dataset based on image
data = {
    "stars": [4.7, 4.7, 4.7, 4.7, 4.6]  # Extracted from image
}

df = pd.DataFrame(data)

# Calculate Mean, Median, and Mode
mean_stars = df["stars"].mean()
median_stars = df["stars"].median()
mode_stars = df["stars"].mode()[0]  # Mode can have multiple values, take the first

print(f"Mean Stars: {mean_stars:.2f}")
print(f"Median Stars: {median_stars:.2f}")
print(f"Mode Stars: {mode_stars:.2f}")


In [None]:
variance_stars = df["stars"].var()
std_dev_stars = df["stars"].std()
q1, q3 = np.percentile(df["stars"], [25, 75])
iqr_stars = q3 - q1

print(f"Variance: {variance_stars:.4f}")
print(f"Standard Deviation: {std_dev_stars:.4f}")
print(f"Interquartile Range (IQR): {iqr_stars:.4f}")


In [None]:
skewness = skew(df["stars"])
kurt = kurtosis(df["stars"])

print(f"Skewness: {skewness:.4f} (Skew > 0: Right Skewed, Skew < 0: Left Skewed)")
print(f"Kurtosis: {kurt:.4f} (Higher Kurtosis: More Peaked, Lower: Flatter)")


In [None]:
plt.figure(figsize=(8,5))
sns.histplot(df["stars"], bins=5, kde=True, color='gold')
plt.xlabel("Stars")
plt.ylabel("Frequency")
plt.title("Distribution of Product Ratings")
plt.xticks([4.6, 4.7])
plt.show()

#Done_1