In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [None]:
# Load the product sample; the literal "No rating available" is parsed as NaN.
df = pd.read_csv(
    "../input/flipkart_com-ecommerce_sample.csv",
    na_values=["No rating available"],
)

In [None]:
# Print the column dtypes and non-null counts, then show the first rows
# of the frame as the cell output.
df.info()
df.head()

### Create new variable: discount_percent

In [None]:
# Discount expressed as a percentage of the retail price.
df["discount_percent"] = (
    (df["retail_price"] - df["discounted_price"]) * 100 / df["retail_price"]
)
df["discount_percent"].head()

### Create a function to extract nth level of product category from the product category tree
The product categories are stored in the column "product_category_tree" as a single path whose levels are separated by " >> ". The following function extracts the product category at a specified level of that tree.

In [None]:
def get_nth_category(dataframe, level=1):
    """Extract the level-n product category from the product category tree.

    Each ``product_category_tree`` value is expected to be a string such as
    ``'["Clothing >> Shorts >> Cycling Shorts"]'``: a single path wrapped in
    ``["`` and ``"]`` whose levels are separated by ``>>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``product_category_tree`` column.
    level : int, default 1
        1-based depth of the category to extract (1 = top-level category,
        2 = second level, and so on).

    Returns
    -------
    pandas.Series
        One string per row, aligned with ``dataframe``'s index. Rows whose
        tree is missing or has fewer than ``level`` levels get the
        placeholder ``"EMPTY_LEVEL"``.

    Raises
    ------
    ValueError
        If ``level`` is smaller than 1.
    """
    if level < 1:
        raise ValueError("level must be a positive integer (1 = top-level category)")

    empty_marker = "EMPTY_LEVEL"

    def _category_at_level(tree):
        # Missing trees (NaN) have no category at any level.
        if not isinstance(tree, str):
            return empty_marker

        # Drop the '["' ... '"]' wrapper so that neither the first nor the
        # last category carries stray bracket/quote characters.
        text = tree.strip()
        if text.startswith('["'):
            text = text[2:]
        if text.endswith('"]'):
            text = text[:-2]

        parts = [part.strip() for part in text.split(">>")]

        # `level` is 1-based, list indices are 0-based.
        if level <= len(parts) and parts[level - 1]:
            return parts[level - 1]
        return empty_marker

    return dataframe.product_category_tree.apply(_category_at_level)

In [None]:
# Spot-check the function on level 4. Only the first rows are shown, as the
# cell output, instead of printing the whole Series.
get_nth_category(df, level=4).head()

In [None]:
# Derive the level-1 and level-2 category columns from the category tree.

df["primary_category"], df["secondary_category"] = (
    get_nth_category(df, level=1),
    get_nth_category(df, level=2),
)

In [None]:
# Preview both derived columns side by side as a single rich table
# rather than two separate print() dumps.
df[["primary_category", "secondary_category"]].head(5)

In [None]:
# Percentage of rows where each rating column is missing.

print(
    "Missing value percentage",
    "\n\nProduct rating: ",
    round(df["product_rating"].isna().sum() * 100 / len(df), 2),
    "%",
    "\nOverall rating: ",
    round(df["overall_rating"].isna().sum() * 100 / len(df), 2),
    "%",
)

In [None]:
# Per-category summary: mean discount and number of products, keeping only
# the categories that have more than MIN_PRODUCTS_PER_CATEGORY products.
MIN_PRODUCTS_PER_CATEGORY = 80

groupby_df = df.groupby("primary_category").agg({
    # String alias instead of np.mean: passing NumPy callables to groupby
    # aggregation is deprecated in recent pandas; the column name is unchanged.
    "discount_percent": ["mean"],
    "primary_category": ["count"],
})

# Flatten the two-level column index into "<column>_<aggregation>" names.
groupby_df.columns = ["_".join(col) for col in groupby_df.columns]
groupby_df = groupby_df.sort_values(by="primary_category_count", ascending=False)
groupby_df = groupby_df[groupby_df["primary_category_count"] > MIN_PRODUCTS_PER_CATEGORY]

In [None]:
# Show the aggregated frame as the cell output.
groupby_df

In [None]:
# Move "primary_category" out of the index into a regular column so later
# cells can refer to it by name. The guard keeps the cell idempotent:
# re-running it does not add a spurious "index" column.

if groupby_df.index.name == "primary_category":
    groupby_df = groupby_df.reset_index()

In [None]:
# Quick look at the aggregated frame: first rows, schema, summary statistics.
print(groupby_df.head())
# info() writes its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
groupby_df.info()
print(groupby_df.describe())

In [None]:
# Horizontal bars: number of products per primary category, largest first.

sns.barplot(
    x="primary_category_count",
    y="primary_category",
    data=groupby_df.sort_values(by="primary_category_count", ascending=False),
)
plt.gca().set_xlabel("Number of products")
plt.gca().set_ylabel("Product Category")


In [None]:
# Horizontal bars: mean discount percentage per primary category, highest first.

sns.barplot(
    x="discount_percent_mean",
    y="primary_category",
    data=groupby_df.sort_values(by="discount_percent_mean", ascending=False),
)
plt.gca().set_xlabel("Mean Discount Percentage")
plt.gca().set_ylabel("Product Category")


The above plot does not tell us about the variance of the discount_percent of different products within a product category. Let's draw a violin plot to see the distribution of discount_percent in a more comprehensive way. We will subset the original dataframe such that it only contains the top 20 occurring categories (categories from the groupby_df).

In [None]:
def is_top_category(x):
    """Flag whether x is one of the categories listed in groupby_df.

    Returns 1 when x equals a value in the module-level
    ``groupby_df.primary_category`` column, and 0 otherwise.
    """
    top_names = list(groupby_df.primary_category)
    return 1 if x in top_names else 0
    
# Vectorized membership test: flag (1/0) the rows whose primary category is
# listed in groupby_df, without a per-row Python call that rebuilds the
# category list for every product.
df["is_top_category"] = df["primary_category"].isin(groupby_df["primary_category"]).astype(int)

In [None]:
# Keep only the rows whose is_top_category flag is 1.
top_categories = df.loc[df["is_top_category"] == 1]

# Distribution of discount_percent within each of the retained categories.
plt.figure(figsize=(15, 7))
sns.violinplot(x="primary_category", y="discount_percent", data=top_categories)
plt.gca().set_ylabel("Discount Percentage")
plt.gca().set_xlabel("Primary Product Categories")
plt.xticks(rotation=45)

The above plot gives a good idea of the discounts available across different products within each category. You can play around and analyse the secondary and tertiary categories as well!

**HELP**! If anyone has any idea how to replace the "[]" in the secondary category with another value such as "EMPTY_LEVEL", please share it. I tried doing it inside the *get_nth_category* function, but it did not work.

Share and **upvote** if you liked my work! Nothing is more valuable than your upvote and suggestions.