In [None]:
#Importing required libraries
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
#Filtering warnings out
import warnings
# "ignore" with no category silences every warning for the rest of the session,
# including deprecation notices raised by pandas and seaborn.
warnings.filterwarnings("ignore")

In [None]:
#Setting values for plots
#plt.rcParams['figure.figsize'] = (20,10)
# Apply matplotlib's 'ggplot' style sheet to every figure drawn after this cell.
plt.style.use('ggplot')

In [None]:
#import datasets
# Single place to edit when the dataset is mounted somewhere else.
DATA_DIR = '/kaggle/input/summer-products-and-sales-in-ecommerce-wish'

# Main product listing file; every analysis below reads from `data`.
data = pd.read_csv(f'{DATA_DIR}/summer-products-with-rating-and-performance_2020-08.csv')
# The two category files from the same dataset directory.
cat = pd.read_csv(f'{DATA_DIR}/unique-categories.csv')
cat_count = pd.read_csv(f'{DATA_DIR}/unique-categories.sorted-by-count.csv')

# Data Exploration

In [None]:
# Print column names, non-null counts and dtypes to see which features have gaps.
data.info()

We have 42 features linked to a product. Our goal is to identify the features which can significantly affect the sales success of a product.

### Pricing

In [None]:
# Overlay the density of the listing price and of the retail price.
# `fill=True` is the current spelling of the deprecated `shade=True`; explicit
# labels plus a legend make it possible to tell the two curves apart.
sns.kdeplot(data['price'], fill=True, label='price')
sns.kdeplot(data['retail_price'], fill=True, label='retail_price')
plt.legend()
plt.title('Price vs Retail Price')

This is interesting! Retail prices range up to about 300, while prices on the website top out a little above 50.
It should also be noted that the majority of products, both in stores/other places and on the website, are priced between 0 and 30 (note the peak in both plots).

This may also imply that there is not much difference between buying from a store and buying from the website when the product price is below 50.

### Currency

In [None]:
# List the distinct buyer currencies present in the dataset.
data['currency_buyer'].unique()

Only EUR is used as the buyer currency on the site. Hence, the currency type has no effect on product sales.

### Ad-Boosts

In [None]:
# Number of listings with and without ad boosts.
# The column is passed as `x=`: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='uses_ad_boosts', data=data)
plt.title('Ad-boosts')

In [None]:
# Correlation between the ad-boost flag and the number of units sold.
ad_boost_flag = data['uses_ad_boosts']
units_sold = data['units_sold']
corr = ad_boost_flag.corr(units_sold)
print(f'The Correlation coeff between Ad-Boost and Units-Sold is {np.round(corr,4)}')

In [None]:
# One units-sold density curve per value of the ad-boost flag.
fig = sns.FacetGrid(data, hue='uses_ad_boosts', aspect=4)

# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
fig.map(sns.kdeplot, 'units_sold', fill=True)
max_units = data['units_sold'].max()
# Limit the x-axis to the observed range of units sold.
fig.set(xlim=(0, max_units))
fig.add_legend()
plt.title('Ad-boosts vs Sale')

The graph shows the density of units sold, split by whether an ad-boost is used or not. The two curves are nearly identical, so the anticipated effect on product sales does not show up.

So, Ad-Boost has very little effect on product sales, which is quite unexpected!

### Rating

In [None]:
# Correlation matrix of the rating columns against units sold.
rating_columns = [
    'rating', 'rating_count',
    'rating_five_count', 'rating_four_count', 'rating_three_count',
    'rating_two_count', 'rating_one_count',
    'units_sold',
]
rating_df = data[rating_columns]
rating_corr = rating_df.corr()
sns.heatmap(rating_corr, cmap='CMRmap_r', annot=True)
plt.title('Rating vs Sale')

Two deductions can be made with this heatmap.
 1. Lower the rating, lower the number of units sold.
 2. Whatever the rating is [5,4,3,2,1] higher the number of times a product is rated, more the number of product sale.

**Importantly, the number of times a product has been rated has WAY MORE IMPACT on product sales than the rating itself.**

### Badges

In [None]:
# Correlation matrix of the badge columns against units sold.
badge_columns = [
    'badges_count',
    'badge_local_product', 'badge_product_quality', 'badge_fast_shipping',
    'units_sold',
]
badges_df = data[badge_columns]
sns.heatmap(badges_df.corr(), cmap='CMRmap_r', annot=True)
# Title typo fixed ('BAdges' -> 'Badges').
plt.title('Badges vs Sale')

The total badge count has a positive correlation with product sales.
Looking at the individual badge types, 'Local_product_badge' and 'Fast_shipping_badge' have a negative correlation [interesting!!!!].
The product quality badge has a positive relation and the highest impact [expected!!].

### Product Characteristics

Let's check if the color and size have any impact on units sold.

#### Color

In [None]:
"""Creating a DF of Color vs Total-Units Sold"""
# Total units sold per product colour, best-selling colour first.
color = data.loc[:, ['product_color', 'units_sold']]
units_by_color = color.groupby('product_color')['units_sold'].sum()
color_df = units_by_color.sort_values(ascending=False).to_frame()
color_df.plot(kind='bar', figsize=(20, 10))
plt.title('Color vs Sale')

The graph also implies that about 80% of the sales come from colors that make up only 20% of the available shades.

#### Size

In [None]:
"""Creating a DF of Size vs Total-Units Sold"""
# Total units sold per size variation, best-selling size first.
size = data.loc[:, ['product_variation_size_id', 'units_sold']]
units_by_size = size.groupby('product_variation_size_id')['units_sold'].sum()
size_df = units_by_size.sort_values(ascending=False).to_frame()
size_df.plot(kind='bar', cmap='plasma', figsize=(20, 10))
plt.title('Size vs Sale')

As with color, fewer than 10% of the size variations account for most of the units sold.

In [None]:
"""Label-Encoding the color and size"""
from sklearn.preprocessing import LabelEncoder

In [None]:
def encoder(value):
    """Return integer codes for the labels in `value`.

    A fresh LabelEncoder is fitted on `value` and immediately applied to the
    same values, so every call builds its own label-to-code mapping.
    """
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(value)

# Move the group labels out of the index into ordinary columns so they can be
# encoded. The guards keep the cell re-runnable: an unconditional second
# reset_index() would add a stray `index` column.
if 'product_color' not in color_df.columns:
    color_df = color_df.reset_index()
if 'product_variation_size_id' not in size_df.columns:
    size_df = size_df.reset_index()

# Replace each label with the integer code produced by encoder().
# NOTE(review): these codes are identifiers, not an ordered scale, so any
# correlation computed on them is only a rough indication.
color_df['product_color'] = encoder(color_df['product_color'])
size_df['product_variation_size_id'] = encoder(size_df['product_variation_size_id'])

In [None]:
# Correlation matrices of the encoded colour / size codes against units sold.
color_corr = color_df.corr()
size_corr = size_df.corr()

print('The relation btw color and units sold:')
print(color_corr, end='\n\n\n')
print('The relation btw size and units sold:')
print(size_corr)

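As seen above, both color and size show a positive correlation with units sold, and size seems to have more impact than color. Note, however, that the label-encoded values are arbitrary integer codes, so these coefficients should be read as a rough indication only.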

### Product inventory available

In [None]:
# Scatter plus linear fit of variation inventory against units sold.
# x / y are passed by keyword: seaborn >= 0.12 takes `data` as the only
# positional argument, so the positional form raises a TypeError there.
sns.lmplot(x='units_sold', y='product_variation_inventory', data=data)
plt.title('Variation Inventory vs Sale')

There is a positive relation between units_sold and product-variation inventory.
The lmplot suggests that above an inventory of 30 units, sales show a positive trend.

In [None]:
# Scatter plus linear fit of total inventory against units sold.
# x / y are passed by keyword: seaborn >= 0.12 takes `data` as the only
# positional argument, so the positional form raises a TypeError there.
sns.lmplot(x='units_sold', y='inventory_total', data=data)
plt.title('Total inventory vs Sale')

Interestingly, substantial product sales take place only when the total inventory is at 50, and not below that.

### Shipping

In [None]:
# Share of listings per shipping option name.
shipping_groups = data.groupby('shipping_option_name')
ship = shipping_groups['shipping_option_name'].count()
plt.pie(ship, radius=2)
# The legend is placed to the left of the enlarged pie.
plt.legend(ship.index, loc=(-0.9, 0.3))
plt.title('Shipping Name???')

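The bulk of the shipping uses a single option, 'Livraison standard' (French for "standard delivery"; it is a shipping option name rather than a company). Since it hardly varies across products, it can be concluded that it has no effect on sales.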

In [None]:
# Share of listings with / without express shipping.
express_groups = data.groupby('shipping_is_express')
ship = express_groups['shipping_is_express'].count()
express_labels = ship.index
plt.pie(ship, labels=express_labels, colors=['brown', 'pink'])
plt.legend(['0:No','1:Yes'])
plt.title('Express Shipping???')

In [None]:
# Distribution of the shipping option price.
# The column is passed as `x=`: on seaborn >= 0.12 the first positional
# argument is `data`, which would clash with the explicit `data=` keyword.
sns.violinplot(x='shipping_option_price', data=data)
plt.title('Shipping Price')

We can see a range of price, where it is most preferred.
Let's look if any relation exists btw Shipping price and Units sold.

In [None]:
# Correlation between shipping price and units sold, drawn as a 2x2 heatmap.
ship_price = data.loc[:, ['shipping_option_price', 'units_sold']]
price_sale_corr = ship_price.corr()
sns.heatmap(price_sale_corr, annot=True)
plt.title('Shipping price vs Sale')

We can conclude that shipping price and units_sold are only weakly correlated, with a negative tendency.

In [None]:
# Enlarge the default figure size. This is a global rcParams change, so it also
# applies to every figure created after this cell.
plt.rcParams.update({'figure.figsize': (20, 10)})
# Histogram of how many countries each product ships to.
data['countries_shipped_to'].plot.hist(color='purple')
plt.title(' # of Destination countries')

Most products are shipped to about 41 to 42 countries.

In [None]:
# Units sold against the number of destination countries.
# x / y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.scatterplot(x='countries_shipped_to', y='units_sold', data=data)
plt.title('Destination Countries vs Sale')

The scatter plot shows that more units are sold by products shipped to roughly 20-40 countries, while sales decrease as the number of destination countries increases further.

In [None]:
# Correlation between the number of destination countries and units sold.
countries_vs_units = data['countries_shipped_to'].corr(data['units_sold'])
print('A negative correlation exists between units sold and countries shipped with a value')
print(countries_vs_units)

### Urgency

In [None]:
# Urgency-related columns plus units sold, with every NaN replaced by 0
# (so a missing banner flag / missing text becomes 0).
urgency_columns = ['has_urgency_banner', 'urgency_text', 'units_sold']
urgency = data.loc[:, urgency_columns].replace(np.nan, 0)

Let's check if an urgency banner always has a text.

In [None]:
# Check on the raw `data` frame, row by row, that a banner flag is present
# exactly when an urgency text is present. The `urgency` frame cannot be used
# here: its NaNs were already replaced by 0, so count() on either column just
# returns the number of rows and the comparison would always be True.
bool((data['has_urgency_banner'].notna() == data['urgency_text'].notna()).all())

So, every urgency banner is accompanied by an urgency text. It seems the text is mandatory whenever a banner is shown.

How much correlated is the urgency banner to units sold?????

In [None]:
# Upper x-limit: the largest observed number of units sold.
x_max = urgency['units_sold'].max()

# One units-sold density curve per value of the urgency-banner flag.
fig = sns.FacetGrid(urgency, hue='has_urgency_banner', aspect=4)
fig.map(sns.kdeplot, 'units_sold')
fig.set(xlim=(0, x_max))
fig.add_legend()
plt.title('Urgency banner vs Sale')

Whether or Not a Urgency banner is present doesn't have much impact on Units sold. See, how the graph moves similar to both the options.

In [None]:
# Correlation between the (NaN-filled) banner flag and units sold.
banner_flag = urgency['has_urgency_banner']
units_with_banner_info = urgency['units_sold']
c = banner_flag.corr(units_with_banner_info)
print(f'The correlation between the two is {c}')

The negative value shows that products with an urgency banner sold slightly fewer units than products without one, although the effect is very small.

### Merchant Details

In [None]:
# Merchant-related columns plus units sold.
# .copy() makes `merchant` an independent frame: later cells assign new values
# into its columns, which on a plain slice of `data` triggers
# SettingWithCopyWarning.
merchant = data[['origin_country', 'merchant_title', 'merchant_name',
       'merchant_info_subtitle', 'merchant_rating_count', 'merchant_rating',
       'merchant_id', 'merchant_has_profile_picture',
       'merchant_profile_picture','units_sold']].copy()

For ease of analysis, [except for 'origin_country','merchant_rating_count','merchant_rating'], I am focussing on whether the presence of value has an impact on Sale.

In [None]:
# Show dtypes and non-null counts for the merchant columns.
merchant.info()

'Merchant_name', 'merchant_info_subtitle' and 'merchant_profile_picture' have NaN values. Merchant_id seems to be mandatory.

In [None]:
# Number of listings per origin country.
# The column is passed as `x=`: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='origin_country', data=merchant)
plt.title('Origin Country')

Bulk of the sale originates from CN. CN seems to be the majority supplier of the products in website.

In [None]:
# Turn the two text columns into presence flags: 1 if a value exists, 0 if NaN.
# The flags are derived from the raw columns in `data` (same row index as
# `merchant`), so re-running the cell yields the same result instead of
# flagging the already-converted 0/1 values. assign() builds a new frame rather
# than writing into a slice of `data`.
merchant = merchant.assign(**{
    column: data[column].notna().astype(int)
    for column in ['merchant_name', 'merchant_info_subtitle']
})

In [None]:
# Correlation of each presence flag with units sold.
units = merchant['units_sold']
name_corr = merchant['merchant_name'].corr(units)
subtitle_corr = merchant['merchant_info_subtitle'].corr(units)

print('Correlation btw "Merchant name" and "Units Sold:"')
print(name_corr, end='\n\n')
print('Correlation btw "merchant_info_subtitle" and "Units Sold":')
print(subtitle_corr)

The presence of 'Merchant name' and 'merchant_info_subtitle' has a very small but positive impact on sales.

In [None]:
# Box plot of the merchant rating counts; the marker shows the mean.
# The series is passed as `x=` explicitly: newer seaborn reads the first
# positional argument as `data`, which changes how the plot is laid out.
sns.boxplot(x=merchant['merchant_rating_count'], color='yellow', showmeans=True)
plt.title('# of Merchant ratings')

There are many outliers in the data. Let's do some cleaning.
I am imputing the mean value to the outliers.

In [None]:
# Summary statistics of the merchant rating counts, cast to int for readability
# (the cast drops the fractional part).
merchant['merchant_rating_count'].describe().astype(int)

I am replacing all values above 24564 with mean value 26495.

In [None]:
# Rating counts above this value are treated as outliers.
# NOTE(review): the cut-off is hard-coded — confirm where 24564 comes from.
RATING_COUNT_CAP = 24564

# Work from the untouched column in `data` (same row index as `merchant`) so
# the mean and the cut-off are always applied to the raw counts; re-running the
# cell then gives the same result instead of re-imputing imputed values.
raw_rating_counts = data['merchant_rating_count']
mean = raw_rating_counts.mean()
# assign() builds a new frame rather than writing into a slice of `data`.
merchant = merchant.assign(
    merchant_rating_count=raw_rating_counts.mask(raw_rating_counts > RATING_COUNT_CAP, mean)
)
# `x=` keeps the series from being read as `data` on newer seaborn.
sns.boxplot(x=merchant['merchant_rating_count'], color='yellow', showmeans=True)
fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.title('Rating Count')

In [None]:
# Box plot of the merchant ratings; the marker shows the mean.
# The series is passed as `x=` explicitly: newer seaborn reads the first
# positional argument as `data`, which changes how the plot is laid out.
sns.boxplot(x=merchant['merchant_rating'], color='green', showmeans=True)
fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.title('Rating')

Mean rating lies almost at 4.5 for the products.

Let's see the correlation between rating_count,rating and sale.

In [None]:
# Correlation of merchant rating count and merchant rating with units sold.
merchant_rating_columns = ['merchant_rating_count', 'merchant_rating', 'units_sold']
df1 = merchant.loc[:, merchant_rating_columns]
merchant_rating_corr = df1.corr()
sns.heatmap(merchant_rating_corr, annot=True)
fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.title('Rating vs Sale')

Figure implies that both rating count and rating are positively and heavily correlated to sale of a product.

In [None]:
# Relabel the 0/1 flag as 'no'/'yes' for a readable legend. Values that are
# already 'yes' are kept, so re-running this cell does not turn every row into
# 'no'. assign() builds a new frame rather than writing into a slice of `data`.
merchant = merchant.assign(
    merchant_has_profile_picture=merchant['merchant_has_profile_picture'].map(
        lambda flag: 'yes' if flag in (1, 'yes') else 'no'
    )
)
# One units-sold density curve per profile-picture label.
fig = sns.FacetGrid(merchant, hue='merchant_has_profile_picture', aspect=4)

fig.map(sns.kdeplot, 'units_sold')
# Zoom the x-axis to [0, mean units sold]. The mean is read from `merchant`,
# the frame being plotted, instead of the unrelated `urgency` frame.
x_max = merchant['units_sold'].mean() # Considering the Mean sales level
fig.set(xlim=(0, x_max))
fig.add_legend()
plt.title('Merchant Profile pic vs Sale')

Note that, for sales below 2000 units, the merchant profile picture has no impact on sales. Above that threshold, however, the profile picture does affect sales.
This might indicate that people prefer more structured and formal merchants when they are buying in bulk.

Conclusion:
    1. For price range less than 50, people prefer website than a retail store. Above 50, retail store is considered.
    2. Interestingly, Ad-boosts have no effect on Sales.[might need to check facts!!]
    3. Expected: High rating of product high the sale is. More the number of times a product is rated, more sale.
    4. More number of badges [irrespective of type], more the sale. Quality badge is soo much important in comparison to others.
    5. 80% of sale happens in 20% color and 6 sizes. [Design wisely!!!] 
    6. Inventory of above 30 units of Product variation shows a sales boost. And total inventory should be above 50 for safer sales.
    7. Bulk of shipping uses one shipping option ('Livraison standard'). More than 90% of sales don't use express shipping. The preferred shipping price is in the range of 4. Shipping to 20-60 countries seems safe for good sales.
    8. The urgency banner shows a slightly negative correlation with units sold, i.e. products with a banner sold marginally less. The effect is very small.
    9. Most product's origin country is CN.
    10.Both volume and quality of rating has positive impact on sales. Most sales happens at 2500 rating counts and average rating of 4.
    11.For sales above 2000 units, people prefer a merchant with a profile picture. Don't worry about a picture if you are intending to sell below 2000.