In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# The version string is exposed by the top-level matplotlib package;
# the pyplot submodule has no __version__ attribute (it raised AttributeError).
import matplotlib
matplotlib.__version__

In [20]:
 !pip show matplotlib.pyplot



In [None]:
# Read the product listing CSV from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="myntra_data.csv")
df.head(5)

# Data Cleaning
1. Check duplicate rows and remove them and reset index after removing them.
2. Check null values and remove them.
3. Drop extra columns which are not useful for analysis.
4. Add new column product id using information from product link.
5. Remove rows that share the same product id (such rows barely differ; one of them only has one extra rating count).

In [None]:
# (rows, columns) of the loaded frame, before duplicate rows are dropped.
df.shape

In [None]:
# Remove fully identical rows (first occurrence kept), then renumber the rows.
# drop=False keeps the previous labels in a new "index" column, which a later cell drops.
df = df.drop_duplicates(keep="first").reset_index(drop=False)

In [None]:
# Shape after dropping duplicate rows (note: reset_index added one "index" column).
df.shape

In [None]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

There are no null values in the given data.

In [None]:
# Summary statistics (count, mean, spread, quartiles) as reported by DataFrame.describe().
df.describe()

In [None]:
# Discard columns not needed for the analysis, including the "index" helper
# column created by the earlier reset_index().
df = df.drop(columns=["img_link", "brand_tag", "index"])

In [None]:
# Shape after dropping img_link, brand_tag and the helper index column.
df.shape

In [None]:
# Break every product link into its "/"-separated parts (the column now holds lists).
# NOTE: run once per fresh load; the column is overwritten in place.
df["product_link"] = df["product_link"].str.split(pat="/")

In [None]:
# Inspect the split parts of the row labelled 0 to see which position holds the id.
df.at[0, "product_link"]

In [None]:
# Element 3 of each split link is used as the product id.
# The helper is named product_ids so the built-in `list` is no longer rebound, and the
# links are iterated directly instead of being looked up by positional label.
product_ids = [link_parts[3] for link_parts in df["product_link"]]
df["Product_id"] = product_ids

In [None]:
# Put Product_id first and fix the column order used by the rest of the notebook.
df = df[
    [
        "Product_id",
        "product_name",
        "brand_name",
        "rating",
        "rating_count",
        "marked_price",
        "discounted_price",
        "discount_amount",
        "discount_percent",
        "sizes",
        "product_link",
        "product_tag",
    ]
]

In [None]:
# The link is no longer needed once Product_id has been extracted from it.
df = df.drop(columns="product_link")

In [None]:
# Number of rows per Product_id; a count above 1 means the same product appears more than once.
df.value_counts("Product_id")

In [None]:
# Show every row of one repeated product id to compare the repeated listings.
df.loc[df["Product_id"].eq("14984314")]

In [None]:
# Shape before de-duplicating on Product_id.
df.shape

In [None]:
# Keep only the first row seen for each Product_id.
df = df.drop_duplicates(subset="Product_id", keep="first")

In [None]:
# Shape after keeping one row per Product_id.
df.shape

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

# Univariate Analysis

In [None]:
# Preview two rows of the cleaned frame.
df.head(2)

In [None]:
# Number of distinct values in each column.
df.nunique()

product_name, brand_name, sizes and product_tag are categorical columns; the remaining ones are numerical.

In [None]:
# How often each product name occurs.
df["product_name"].value_counts()

In [None]:
# Count products per tag and expose the result as a two-column frame.
# rename_axis + reset_index(name=...) always yields ["product_tag", "count"], however the
# installed pandas names the value_counts() result. The previous rename mapping produced
# two columns called "count" on pandas >= 2.0, which broke the plot that follows.
df_tag = (
    df["product_tag"]
    .value_counts()
    .rename_axis("product_tag")
    .reset_index(name="count")
)

In [None]:
# Bar chart of the tags that have more than 1000 products.
df_tag.loc[df_tag["count"] > 1000].plot.bar(x="product_tag")

In [None]:
# How often each value of the sizes column occurs.
df["sizes"].value_counts()

In [None]:
# Quick reminder of the available columns before plotting.
df.head(2)

In [None]:
# Rating distribution: density histogram with a KDE curve (left) and raw counts (right).
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat="density") is the
# replacement suggested by seaborn for the same kind of figure.
fig, (density_ax, count_ax) = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(df["rating"], kde=True, stat="density", ax=density_ax)
sns.histplot(df["rating"], ax=count_ax)
plt.show()

In [None]:
# Same distribution with the zero ratings left out.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(df.loc[df["rating"] != 0, "rating"], kde=True, stat="density")

In [None]:
# Distribution of the marked price (left) and of the discount amount (right).
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
fig, (price_ax, discount_ax) = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(df["marked_price"], kde=True, stat="density", ax=price_ax)
sns.histplot(df["discount_amount"], kde=True, stat="density", ax=discount_ax)
plt.show()

In [None]:
# How often each discount_percent value occurs.
df["discount_percent"].value_counts()

# Analysis for brand

### Top 10 brands with highest number of products in the dataset

In [None]:
# value_counts() is sorted by frequency, so the first ten entries are the ten largest brands.
df["brand_name"].value_counts().iloc[:10].plot.bar()

### Find each brand's average rating

In [None]:
# Weight each product's rating by its number of ratings so that a brand average can be
# computed as sum(rating * rating_count) / sum(rating_count).
# NOTE: df_new is another name for df (no copy), so "rating_prod" is added to df itself.
df_new = df
df_new["rating_prod"] = df_new["rating"] * df_new["rating_count"]
# Select the two columns with a list: tuple-style selection on a groupby
# (["a", "b"] without the inner brackets) is rejected by pandas >= 2.0.
brand_rating = (
    df_new.groupby("brand_name")[["rating_prod", "rating_count"]]
    .sum()
    .reset_index()
)

In [None]:
# Weighted mean rating per brand; brands with zero ratings divide 0 by 0 and become NaN.
brand_rating["average_rating"] = brand_rating["rating_prod"].div(brand_rating["rating_count"])

In [None]:
# The intermediate weighted sum is no longer needed once the average exists.
brand_rating = brand_rating.drop(columns="rating_prod")

In [None]:
# Display the per-brand rating_count and average_rating table.
brand_rating

In [None]:
# Number of brands without a computable average rating.
brand_rating["average_rating"].isna().sum()

In [None]:
# Keep only brands that have an average rating.
brand_rating = brand_rating.dropna(subset=["average_rating"])

### Popular brand

In [None]:
# Rank brands by number of ratings, breaking ties by average rating (both descending).
brand_rating.sort_values(by=["rating_count", "average_rating"], ascending=[False, False])

From this we can conclude that Roadster is the best-selling brand (using rating_count as a proxy for the number of sales).

In [None]:
# Rating counts of the 15 brands with the most ratings.
top15_brands = brand_rating.sort_values(
    by=["rating_count", "average_rating"], ascending=False
).iloc[:15]
sns.barplot(x="brand_name", y="rating_count", data=top15_brands)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Average rating of the same 15 most-rated brands.
top15_brands = brand_rating.sort_values(
    by=["rating_count", "average_rating"], ascending=False
).iloc[:15]
sns.barplot(x="brand_name", y="average_rating", data=top15_brands)
plt.xticks(rotation=90)
plt.show()

### How many brands have average rating greater than 4 or 4.5?

In [None]:
# Share of brands in each half-point average-rating bucket, shown as a pie chart.
bins = (0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)
rating_bucket_counts = brand_rating["average_rating"].value_counts(bins=bins)
rating_bucket_counts.plot.pie(autopct="%.2f")

76% of brands have average rating greater than 4.

In [None]:
# Brands whose average rating is below 2.
brand_rating.loc[brand_rating["average_rating"].lt(2)]

These are the brands with an average rating below 2. They also have a very low rating_count, so not many people bought products from these brands.

In [None]:
# Number of products per brand.
df["brand_name"].value_counts()

In [None]:
# Per brand, the number of products that have no ratings at all.
df.loc[df["rating_count"].eq(0.0), "brand_name"].value_counts()

In [None]:
# Total number of products listed under the brand "max".
int(df["brand_name"].eq("max").sum())

We can conclude that Max is the brand with the largest number of products without any rating. That means Max has the most non-selling products.

### Find the brand market size

In [None]:
# Approximate each product's sales value as discounted_price * rating_count
# and total it per brand.
# NOTE: df_new is another name for df (no copy), so "sales_value" is added to df itself.
df_new = df
df_new["sales_value"] = df_new["discounted_price"] * df_new["rating_count"]
# Select the two columns with a list: tuple-style selection on a groupby
# (["a", "b"] without the inner brackets) is rejected by pandas >= 2.0.
brand_marketsize = (
    df_new.groupby("brand_name")[["sales_value", "rating_count"]]
    .sum()
    .reset_index()
)

In [None]:
# Brands ordered from the largest to the smallest total sales value.
brand_marketsize.sort_values(by="sales_value", ascending=False)

In [None]:
# Sales value of the ten brands with the largest totals.
top10_by_sales = brand_marketsize.sort_values(by="sales_value", ascending=False).iloc[:10]
sns.barplot(x="brand_name", y="sales_value", data=top10_by_sales)
plt.xticks(rotation=90)
plt.show()

Further questions which we can work on

(i) Which brands give maximum discount?


### Relationship between sales value and discount_percent for brands

In [None]:
# Per brand: mean discount percent and total approximate sales value.
# df_new is the same object as df, so the sales_value column is written to df as well.
df_new = df
df_new["sales_value"] = df_new["discounted_price"].mul(df_new["rating_count"])
brand_discounts_sales = (
    df_new.groupby("brand_name")
    .agg(
        discount_percent=("discount_percent", "mean"),
        sales_value=("sales_value", "sum"),
    )
    .reset_index()
)

In [None]:
# Display the per-brand mean discount percent and total sales value.
brand_discounts_sales

In [None]:
# One point per brand: mean discount percent against total sales value.
sns.scatterplot(
    x="discount_percent",
    y="sales_value",
    data=brand_discounts_sales,
)

Increase in discount percent in brand does not imply increase in sales value.

In [None]:
# The 20 brands with the highest mean discount percent.
top20_by_discount = brand_discounts_sales.sort_values(
    by="discount_percent", ascending=False
).iloc[:20]
sns.barplot(x="brand_name", y="discount_percent", data=top20_by_discount)
plt.xticks(rotation=90)
plt.show()

# Analysis for product

In [None]:
# Look up products whose name contains the entered text (case-insensitive) and show
# the most expensive, the cheapest and the best-rated matches.
prod_name = input()
# regex=False: treat the input as literal text, so characters such as "(" or "+" cannot
# raise a regex error or match unintentionally. na=False: missing names never match.
name_matches = df["product_name"].str.contains(prod_name, case=False, regex=False, na=False)
df_new2 = df[name_matches]
if df_new2.empty:
    # Nothing to compare; say so instead of printing three empty tables.
    print("No product found matching " + prod_name)
else:
    # Ties are kept: every row that reaches the extreme value is shown.
    max_price = df_new2[df_new2["discounted_price"] == df_new2["discounted_price"].max()]
    min_price = df_new2[df_new2["discounted_price"] == df_new2["discounted_price"].min()]
    max_rating = df_new2[df_new2["rating"] == df_new2["rating"].max()]
    print("Details of product " + prod_name + " having maximum price")
    display(max_price)
    print("Details of product " + prod_name + " having minimum price")
    display(min_price)
    print(" Maximum rating product " + prod_name + " details")
    display(max_rating)

### Product market size, average rating and discount percent

In [None]:
# Per product tag: mean discount percent, total approximate sales value, and the two
# sums needed for a rating_count-weighted average rating.
# df_new is the same object as df, so both helper columns are written to df as well.
df_new = df
df_new["sales_value"] = df_new["discounted_price"].mul(df_new["rating_count"])
df_new["rating_prod"] = df_new["rating"].mul(df_new["rating_count"])
product_discounts_sales = (
    df_new.groupby("product_tag")
    .agg(
        discount_percent=("discount_percent", "mean"),
        sales_value=("sales_value", "sum"),
        rating_prod=("rating_prod", "sum"),
        rating_count=("rating_count", "sum"),
    )
    .reset_index()
)

In [None]:
# Weighted average rating per tag, after which the intermediate sum is dropped.
product_discounts_sales["average_rating"] = product_discounts_sales["rating_prod"].div(
    product_discounts_sales["rating_count"]
)
product_discounts_sales = product_discounts_sales.drop(columns="rating_prod")

In [None]:
# Display the per-tag discount, sales value, rating count and average rating table.
product_discounts_sales 

In [None]:
# The ten product tags with the largest total sales value.
top10_tags_by_sales = product_discounts_sales.sort_values(
    by="sales_value", ascending=False
).iloc[:10]
sns.barplot(x="product_tag", y="sales_value", data=top10_tags_by_sales)
plt.xticks(rotation=90)
plt.show()

In [None]:
# The ten product tags with the highest average rating.
top10_tags_by_rating = product_discounts_sales.sort_values(
    by="average_rating", ascending=False
).iloc[:10]
sns.barplot(x="product_tag", y="average_rating", data=top10_tags_by_rating)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Per product tag: sales value against mean discount percent (left)
# and against average rating (right).
fig, (discount_ax, rating_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.scatterplot(x="discount_percent", y="sales_value", data=product_discounts_sales, ax=discount_ax)
sns.scatterplot(x="average_rating", y="sales_value", data=product_discounts_sales, ax=rating_ax)

### How does the average discount percent vary across different product tags?

In [None]:
# Distribution of the mean discount percent across product tags.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(product_discounts_sales["discount_percent"], kde=True, stat="density")

### Are there any correlations between the rating, discount percent, and sales value of the products?

In [None]:
# Pairwise correlation of the numeric per-tag metrics.
# product_tag holds strings, and DataFrame.corr() on pandas >= 2.0 raises when it meets a
# non-numeric column, so only the numeric columns are passed on.
product_discounts_sales.select_dtypes(include="number").corr()

In [None]:
# Heatmap of the correlation matrix of the numeric per-tag metrics.
# The string column product_tag is excluded first; DataFrame.corr() on pandas >= 2.0
# raises when it meets a non-numeric column.
sns.heatmap(
    product_discounts_sales.select_dtypes(include="number").corr(),
    cmap="coolwarm",
)

### Which brand has the highest average rating among different products?

In [None]:
# Preview df, which by now also carries the rating_prod and sales_value helper columns.
df.head(2)

In [None]:
# Total the numeric columns for every (product_tag, brand_name) pair.
# numeric_only=True keeps the string columns (product name, sizes, ...) out of the sum;
# without it pandas >= 2.0 concatenates those strings for every group.
# Relies on df already holding "rating_prod", which earlier cells add through df_new.
df_prod_brand = df.groupby(["product_tag", "brand_name"]).sum(numeric_only=True)

In [None]:
# rating_count-weighted average rating of each (product_tag, brand_name) pair.
df_prod_brand["avg_rating"] = df_prod_brand["rating_prod"].div(df_prod_brand["rating_count"])

In [None]:
# Turn the (product_tag, brand_name) group keys back into ordinary columns.
df_prod_brand = df_prod_brand.reset_index(drop=False)

In [None]:
# Display the per (product_tag, brand_name) totals together with avg_rating.
df_prod_brand

In [None]:
# All distinct product tags, handy for choosing a tag to inspect.
df_prod_brand["product_tag"].unique()

In [None]:
# The 20 best-rated brands within the "shirts" tag.
is_shirts = df_prod_brand["product_tag"].eq("shirts")
df_prod_brand.loc[is_shirts, ["brand_name", "avg_rating"]].sort_values(
    by="avg_rating", ascending=False
).head(20)

In [None]:
# For every product tag, pick the row of the brand with the highest avg_rating.
# The previous groupby(...)[...].max() took the maximum of each column separately, pairing
# the alphabetically last brand name with the best rating even when they came from
# different rows (and its tuple-style column selection is rejected by pandas >= 2.0).
# Pairs without a rating (NaN avg_rating) cannot win and are left out first.
rated_pairs = df_prod_brand.dropna(subset=["avg_rating"])
best_row_labels = rated_pairs.groupby("product_tag")["avg_rating"].idxmax()
df_prod_brand_highest = rated_pairs.loc[
    best_row_labels, ["product_tag", "brand_name", "avg_rating"]
].reset_index(drop=True)

In [None]:
# Display the per-tag summary built in the previous cell.
df_prod_brand_highest

In [None]:
# Rows whose product tag contains "shirt" (case-insensitive).
tag_has_shirt = df_prod_brand_highest["product_tag"].str.contains("shirt", case=False)
df_prod_brand_highest.loc[tag_has_shirt]

In [None]:
# Rows of the per-tag summary whose brand is Puma.
df_prod_brand_highest.loc[df_prod_brand_highest["brand_name"].eq("Puma")]