# Amazon Best Selling Products 💸💰

**Sometimes we think of starting a side hustle. Selling on Amazon could be one option, but "What should I sell?" is one of the most important and time-consuming questions. This analysis of more than 1.4 million products will help ease that process. We'll study the data to find trends, such as what's selling well or what's becoming more popular. With this info, we can make smart choices about what to sell on Amazon.**

![](https://hips.hearstapps.com/hmg-prod/images/best-amazon-products-1674055005.jpg)

# Introduction

## Importing Libraries and Dataset

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
from bokeh.plotting import figure, show
import altair as alt
from ggplot import *
import holoviews as hv
import folium

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in file_names)
    for path in full_paths:
        print(path)

/kaggle/input/amazon-products-dataset/amazon_products.csv
/kaggle/input/amazon-products-dataset/amazon_categories.csv


In [None]:
# Locations of the two CSV files shipped with the Kaggle dataset.
categories_file_path = "/kaggle/input/amazon-products-dataset/amazon_categories.csv"
products_file_path = "/kaggle/input/amazon-products-dataset/amazon_products.csv"

# Load each file into its own DataFrame.
categories_df = pd.read_csv(categories_file_path)
products_df = pd.read_csv(products_file_path)

# Attach the category name to every product row.
# A column-wise concat (axis=1) only lines rows up by index position, so
# product row i would be paired with category row i regardless of which
# category the product actually belongs to. Joining on the category key keeps
# each product with its own category, which the per-category analysis needs.
# NOTE(review): assumes products carry `category_id` and categories carry
# `id` -- confirm against the CSV headers. If either key is missing we fall
# back to the original positional concat so the notebook still runs.
if "category_id" in products_df.columns and "id" in categories_df.columns:
    data = products_df.merge(
        categories_df,
        how="left",  # keep every product, even if its category is unknown
        left_on="category_id",
        right_on="id",
    )
else:
    data = pd.concat([products_df, categories_df], axis=1)

# Exploratory Data Analysis

In [1]:
# Preview the first ten rows of the combined table.
print("First 10 rows")
data.iloc[:10]

First 10 rows


NameError: name 'data' is not defined

In [None]:
# Report how many rows and columns the DataFrame has.
shape = data.shape
row_count, column_count = shape
print("Shape of the DataFrame: Rows={}, Columns={}".format(row_count, column_count))

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with its defaults.
summary_statistics = data.describe()
summary_statistics

In [None]:
# Print a heading, then the DataFrame summary produced by DataFrame.info().
print("Info of the data")
data.info()

In [None]:
# Number of distinct values in every column (missing values are not counted).
unique_values = data.nunique(axis=0, dropna=True)
unique_values

## Percentage of non-unique (duplicate) rows

In [None]:
# Share of rows that are exact duplicates of an earlier row.
total_rows = len(data)

# duplicated() flags every repeat of a row after its first occurrence.
non_unique_rows = int(data.duplicated().sum())
unique_rows = total_rows - non_unique_rows

percentage_non_unique = (non_unique_rows / total_rows) * 100
print("Percentage of Non-Unique Values: {:.2f}%".format(percentage_non_unique))

In [None]:
# Preview the last ten rows of the table.
data.iloc[-10:]

## Total number of products and column data types

In [None]:
# Row count of the table, reported as the number of products.
total_products = data.shape[0]
total_products

In [None]:
# Data type of every column in the DataFrame.
data_types = data.dtypes
data_types

## Category Analysis

In [None]:
# Mean price of the products in each category.
avg_price_by_category = (
    data
    .groupby('category_name')['price']
    .agg('mean')
)
avg_price_by_category

In [None]:
# Mean star rating of the products in each category.
avg_rating_by_category = (
    data
    .groupby('category_name')['stars']
    .agg('mean')
)
avg_rating_by_category

In [None]:
# The ten categories that contain the most product rows.
category_counts = data['category_name'].value_counts()  # sorted, largest first
top_categories = category_counts.head(10)
top_categories

# Best Selling Products

In [None]:
# Keep only the rows whose `isBestSeller` flag is set.
best_seller_mask = data['isBestSeller']
best_selling_products = data.loc[best_seller_mask]
best_selling_products

In [None]:
# Mean star rating across the best-selling products.
best_seller_ratings = best_selling_products['stars']
avg_rating_best_sellers = best_seller_ratings.mean()
avg_rating_best_sellers

In [None]:
# Mean price across the best-selling products.
best_seller_prices = best_selling_products['price']
avg_price_best_sellers = best_seller_prices.mean()
avg_price_best_sellers

In [None]:
# Descriptive statistics of the `price` column over all products.
all_prices = data['price']
price_distribution = all_prices.describe()
price_distribution

In [None]:
# Overall mean star rating and mean review count.
# The values are only stored in variables; this cell displays nothing.
avg_rating, avg_review_count = (
    data[column].mean() for column in ('stars', 'reviews')
)

In [None]:
# Correlation between star rating and number of reviews
# (Series.corr with its default method).
star_ratings = data['stars']
review_counts = data['reviews']
correlation = star_ratings.corr(review_counts)
correlation

# Data Visualization

In [None]:
def _mean_by_category(value_column, value_label):
    """Return a two-column frame: 'Category' and the mean of ``value_column``."""
    means = data.groupby('category_name')[value_column].mean().reset_index()
    means.columns = ['Category', value_label]
    return means


def _horizontal_bar(frame, value_label, chart_title):
    """Build a horizontal bar chart from the first ten rows of ``frame``.

    Note: ``head(10)`` takes the first ten rows in group order; the frame is
    not sorted by value, so these are not the ten highest averages.
    """
    return px.bar(frame.head(10), x=value_label, y='Category', orientation='h',
                  title=chart_title,
                  labels={value_label: value_label})


# Average price and average rating per category.
avg_price_by_category = _mean_by_category('price', 'Average Price')
avg_rating_by_category = _mean_by_category('stars', 'Average Rating')

# One interactive chart per metric.
fig2 = _horizontal_bar(avg_price_by_category, 'Average Price', 'Average Price by Category')
fig3 = _horizontal_bar(avg_rating_by_category, 'Average Rating', 'Average Rating by Category')

# Render both charts.
fig2.show()
fig3.show()

In [None]:
# Rows flagged as best sellers.
best_selling_products = data[data['isBestSeller']]

# Price vs. rating, one point per best-selling product; hovering shows the title.
scatter_options = dict(
    x='price',
    y='stars',
    color='isBestSeller',
    title='Comparison of Prices and Ratings for Best-Selling Products',
    labels={'price': 'Price', 'stars': 'Rating'},
    hover_data=['title'],
)
fig = px.scatter(best_selling_products, **scatter_options)

# White text on a black background.
dark_layout = dict(
    xaxis=dict(title_font=dict(size=14, family='Arial', color='white')),
    yaxis=dict(title_font=dict(size=14, family='Arial', color='white')),
    title_font=dict(size=20, family='Arial', color='white'),
    legend_title_font=dict(size=14, family='Arial', color='white'),
    paper_bgcolor='black',
    font=dict(color='white'),
)
fig.update_layout(**dark_layout)

# Render the chart.
fig.show()

## Most Expensive Products

In [None]:
# Ten highest-priced rows: sort by price (highest first) and keep the top ten.
price_ranked = data.sort_values(by='price', ascending=False)
top_10_expensive = price_ranked.head(10)

# Horizontal bar chart: one bar per product title, bar length = price.
bar_options = dict(
    x='price',
    y='title',
    orientation='h',
    title='Top 10 Most Expensive Products',
    labels={'price': 'Price', 'title': 'Product'},
    color_discrete_sequence=['#1f77b4'],
)
fig = px.bar(top_10_expensive, **bar_options)

# White text on a black background.
dark_layout = dict(
    xaxis=dict(title_font=dict(size=14, family='Arial', color='white')),
    yaxis=dict(title_font=dict(size=14, family='Arial', color='white')),
    title_font=dict(size=20, family='Arial', color='white'),
    legend_title_font=dict(size=14, family='Arial', color='white'),
    paper_bgcolor='black',
    font=dict(color='white'),
)
fig.update_layout(**dark_layout)

# Render the chart.
fig.show()

In [None]:
# Select every row whose rating equals the maximum rating in the data
# (more than one row is returned when several products share that rating).
highest_rating = data['stars'].max()
has_highest_rating = data['stars'] == highest_rating
best_product = data[has_highest_rating]

# Show title and rating of the selected rows.
print("Best Product with Highest Rating:")
print(best_product[['title', 'stars']])

In [None]:
# Rows whose rating equals the maximum rating in the data.
highest_rating = data['stars'].max()
best_product = data[data['stars'] == highest_rating]

# Only the first matching row is shown in the figure.
best_title = best_product['title'].values[0]
best_stars = best_product['stars'].values[0]
best_caption = (
    "Best Product with Highest Rating:\n\n"
    f"Title: {best_title}\n"
    f"Rating: {best_stars}"
)

# Draw the caption centred on an otherwise empty figure.
plt.figure(figsize=(8, 4))
plt.text(0.5, 0.5, best_caption,
         fontsize=14, ha='center', va='center', color='black')
plt.axis('off')
plt.show()

In [None]:
# Rows whose rating equals the minimum rating in the data.
lowest_rating = data['stars'].min()
worst_product = data[data['stars'] == lowest_rating]

# Only the first matching row is shown in the figure.
worst_title = worst_product['title'].values[0]
worst_stars = worst_product['stars'].values[0]
worst_caption = (
    "Worst Product with Lowest Rating:\n\n"
    f"Title: {worst_title}\n"
    f"Rating: {worst_stars}"
)

# Draw the caption centred on an otherwise empty figure.
plt.figure(figsize=(8, 4))
plt.text(0.5, 0.5, worst_caption,
         fontsize=14, ha='center', va='center', color='black')
plt.axis('off')
plt.show()

# Model for predicting the best-selling products for the next year!

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Features: review count and price. Target: the best-seller flag.
feature_columns = ['reviews', 'price']
X = data[feature_columns]
y = data['isBestSeller']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified and the features are not scaled.
# If best sellers are rare, the report may be dominated by the majority
# class -- worth confirming before trusting the scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression with default settings (fit() returns the estimator).
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and print per-class precision / recall / F1.
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Features: review count and price. Target: the best-seller flag.
X = data[['reviews', 'price']]
y = data['isBestSeller']

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a default logistic regression and predict the held-out rows.
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Look up each held-out row's title through its index label, then line the
# titles up with the true flag and the predicted flag.
test_titles = X_test.index.map(data['title'])
prediction_columns = {
    'Title': test_titles,
    'IsBestSeller': y_test,
    'PredictedIsBestSeller': predictions,
}
product_predictions = pd.DataFrame(prediction_columns)

# Keep only the rows the model predicted as best sellers and show them.
predicted_mask = product_predictions['PredictedIsBestSeller']
predicted_best_sellers = product_predictions[predicted_mask]
print(predicted_best_sellers)