# Step by step EDA and statistical analysis

This is work in progress...

If there are any questions, comments, suggestions feel free to point out :)

## Importing libraries

In [None]:
import numpy as np 
import pandas as pd 
import math

import matplotlib.pyplot as plt 
import seaborn as sb
from matplotlib import style
# style.use('fivethirtyeight')
style.use('ggplot')

import plotly.express as px
import plotly.graph_objects as go

import os 

## Data cleaning

In [None]:
DATADIR = '../input/summer-products-and-sales-in-ecommerce-wish'
os.listdir(DATADIR)

In [None]:
df = pd.read_csv(DATADIR + '/summer-products-with-rating-and-performance_2020-08.csv')

In [None]:
df.info()

In [None]:
df.iloc[1]

Theme, crawl month contains only one unique value

In [None]:
df.isna().sum()

In [None]:
def plot_missing_data(df):
    """Draw a horizontal bar chart of the percentage of missing values per column.

    Only columns of ``df`` that contain at least one NaN are plotted, ordered
    from the highest to the lowest missing percentage. Nothing is returned;
    the chart is drawn on a new 8x6 matplotlib figure.
    """
    na_counts = df.isna().sum()
    incomplete_cols = df.columns[na_counts > 0]
    # Share of rows that are NaN in each incomplete column.
    na_share = df[incomplete_cols].isna().sum() / df.shape[0]
    na_pct = na_share.sort_values(ascending=False) * 100
    plt.figure(figsize=(8, 6))
    sb.barplot(x=na_pct, y=na_pct.index, orient='h')
    plt.title('% Na values in dataframe by columns')

In [None]:
plot_missing_data(df)

**Lets take a look at the columns with more than 50% null values and check whether they are useful or not**

In [None]:
df['merchant_profile_picture'].value_counts()

merchant_profile_picture contains the url to merchants profile picture where more than 80% of data is missing, for now let's check the others

In [None]:
print("Unique values: ", df['has_urgency_banner'].unique())
print("Value counts: ", df['has_urgency_banner'].value_counts())

has_urgency_banner is a binary column which tells us whether the product has an urgency banner or not, so we can replace nan's with 0's to get rid of the nan's 

In [None]:
df['has_urgency_banner'] = df['has_urgency_banner'].replace(np.nan,0)
print("Unique values: ", df['has_urgency_banner'].unique())
print("Value counts: ", df['has_urgency_banner'].value_counts())

In [None]:
df['urgency_text'].unique()

'Quantité limitée !' translates to 'Quantity Limited' and 'Réduction sur les achats en gros' means 'discount on wholesale purchases' rest are all nan, let's make them right

In [None]:
df['urgency_text']=df['urgency_text'].replace({'Quantité limitée !':'QuantityLimited',
                                               'Réduction sur les achats en gros':'WholesaleDiscount',
                                               np.nan:'noText'})
print(df['urgency_text'][:5])
print(df['urgency_text'].value_counts())

Now let's process the rating columns. All the per-star rating count columns have the same number of missing values (45), but the rating_count column has no NaN values. Let's check rating_count for the rows where values are missing in the other rating count columns

In [None]:
rating_columns = ['rating_one_count','rating_two_count','rating_three_count','rating_four_count','rating_five_count']
df[rating_columns] = df[rating_columns].fillna(value=-1)

In [None]:
df.loc[df['rating_five_count']==-1,'rating_count'].value_counts()

all values in the rating_count column are 0 where there are na values in other rating count columns so lets fill 0 in place of the na values

In [None]:
df[rating_columns]=df[rating_columns].replace(-1,0)

Let's move on to the remaining columns: origin_country, product_color, product_variation_size_id, merchant_name, merchant_info_subtitle

In [None]:
print(df['origin_country'].unique())
print(df['product_color'].unique())
print(df['product_variation_size_id'].unique())
print(df['merchant_name'].unique())
print(df['merchant_info_subtitle'].unique())

All data is categorical, so we can replace the nan values with an unknown token 'Unknown'

In [None]:
nan_cat_cols = ['origin_country','product_color','product_variation_size_id','merchant_name','merchant_info_subtitle']
df[nan_cat_cols] = df[nan_cat_cols].replace(np.nan,'Unknown')

Now we can check if we handled the nan values properly

In [None]:
df.columns[df.isna().sum()>0]

All null values were handled except for the 'merchant_profile_picture' which we will look after later

Let's check for identical rows and eliminate them from the dataframe

In [None]:
df.duplicated().sum()

In [None]:
df= df.drop_duplicates()
df.duplicated().sum()

Now for products a unique identifier is product_id and same goes for merchant_id 

In [None]:
print("Duplicate product_id :",df['product_id'].duplicated().sum())

This shows that even after removing identical rows, duplicate 'product_id' values are still present. This is because the same product can be sold by different merchants at different prices, hence this seems to be a clear indicator that merchant_id influences the price of a product

## Exploratory data analysis

In [None]:
df.describe().T

In [None]:
plt.figure(figsize=(12,6))
sb.distplot(df['price'], color='red', label='Price')
sb.distplot(df['retail_price'], color='blue', label='Retail price')
plt.legend();

The plot indicates a right skewed distribution but not very Clear.

In [None]:
kwargs = {'cumulative':True}
f, axes = plt.subplots(1,2, figsize=(14,6))
f.suptitle('CDF of Price and Retail Price')
sb.distplot(df['price'].values,kde_kws=kwargs, hist_kws=kwargs, color='red', label='Price', ax=axes[0]);
sb.distplot(df['retail_price'].values,kde_kws=kwargs, hist_kws=kwargs, color='blue', label='Retail Price', ax=axes[1]);
axes[0].set(xlabel='Price');
axes[1].set(xlabel='Retail Price');

CDFs are more useful in order to visualize the data more efficiently.
* CDF of price reveals that 97% of products are listed for less than an approximate price of 19-20
* CDF of Price closely resembles the CDF curve of a Normal distribution, so it can be summarized efficiently except for the remaining 3% of the data
* In case of Retail price the distribution is not very smooth and contains price gaps

In [None]:
fig = go.Figure()
fig.add_trace(go.Box(x=df['retail_price'], name='Retail Price'))
fig.add_trace((go.Box(x=df['price'], name='Price')))
fig['layout']['title'] = 'Distribution of Price and Retail Price'
fig.show()

With Boxplots we can easily spot the outliers and quartiles
* The upper fence of Price is at 18, i.e. most of the data is priced less than 18
* There is an item with a price of 49, which is clearly an outlier as it is far away from the Inter Quartile Range (Q3 - Q1)
* Box plot of Retail price is much more spread out; there is a huge difference of 195 between the upper fence and the max data point

Let's now explore these outliers

In [None]:
df_outliers = df[df['price'] > 18]
print("Number of outliers: ",df_outliers.shape[0])
print("Outlier: ", df_outliers[df_outliers['price']==49])

This one is a premium item with a price much higher than the average price, and the number of units sold is very low compared to the average units sold (4422). It has an above average rating of 4.67 and a rating density of 6%, as only 6 people out of 100 posted a review

In [None]:
px.scatter(df, x='units_sold', y='price',marginal_x='box', title='Price vs Units Sold')

There is a clear relationship between price and units sold.
* The higher the price, the fewer the units sold
* There are some cases where the price is low but the units sold are still below average; possible reasons are that the product might not be up to the mark for buyers, or that there are some other factors affecting the price we haven't touched yet
* units sold seems to be a set of ranges, not continuous values
* The median of units sold is 1000, so we can consider products with units sold up to 1000 (inclusive) as below average and products with units sold above 1000 as very successful.
* It totally depends on your business goals which price range you want to focus on, for now I will take the unsupervised approach

In [None]:
#range for units sold
sorted(df['units_sold'].unique())

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans on units_sold for 1..7 clusters and record the
# inertia of each fit, keyed by the number of clusters.
clusters = {
    n_clusters: KMeans(n_clusters=n_clusters).fit(df[['units_sold']]).inertia_
    for n_clusters in range(1, 8)
}

cluster_counts = list(clusters.keys())
inertias = list(clusters.values())
plt.plot(cluster_counts, inertias);
plt.xlabel('no. of clusters');
plt.ylabel('kmeans inertia');

By performing clustering we can see that units_sold can be clustered in 3 categories (optimal) as the inertia curve smooths out after 3 clusters

In [None]:

#order cluster method
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel clusters so that label order follows the mean of a target column.

    The mean of ``target_field_name`` is computed for every distinct value of
    ``cluster_field_name``; clusters are then ranked by that mean and each row
    receives its cluster's rank (0, 1, 2, ...) as the new label.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the current (arbitrary) cluster labels.
    target_field_name : str
        Column whose per-cluster mean defines the ordering.
    df : pandas.DataFrame
        Input frame; it is not modified.
    ascending : bool
        If True, the cluster with the smallest mean becomes label 0.

    Returns
    -------
    pandas.DataFrame
        The result of merging ``df`` with the rank table: the original
        cluster column is dropped and the rank column, renamed to
        ``cluster_field_name``, is appended as the last column.
    """
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean()
    ordered_labels = cluster_means.sort_values(ascending=ascending).index

    # Pick a temporary column name that cannot clash with an existing column.
    # The previous hard-coded name 'index' collided with frames that already
    # had an 'index' column: the merge suffixed both columns, the final rename
    # became a no-op and the cluster column was silently lost.
    rank_col = '_cluster_rank'
    while rank_col in df.columns:
        rank_col += '_'

    rank_table = pd.DataFrame({
        cluster_field_name: ordered_labels,
        rank_col: range(len(ordered_labels)),
    })
    merged = pd.merge(df, rank_table, on=cluster_field_name)
    merged = merged.drop([cluster_field_name], axis=1)
    return merged.rename(columns={rank_col: cluster_field_name})

In [None]:
# Fit a 3-cluster KMeans on units_sold and label every row with its cluster.
units_sold_kmeans = KMeans(n_clusters=3).fit(df[['units_sold']])
df['units_sold_cluster'] = units_sold_kmeans.predict(df[['units_sold']])
# Relabel the clusters so that a higher label means higher mean units_sold.
df = order_cluster(cluster_field_name='units_sold_cluster', target_field_name='units_sold', df=df, ascending=True)
df.groupby(['units_sold_cluster'])['units_sold'].describe()

now we have a clear picture of top selling, and price range of products

In [None]:
px.scatter(df,x='units_sold',y='rating', color='units_sold_cluster', marginal_y ='box',title='Rating vs units sold')

* Median for rating is 3.85 and the products in top selling cluster has rating between 3.35 to 4.1 seems very reasonable
* Rating is very important to determine the potential of product
* Still there are some products with 5 star rating yet unable to cross the 100-1000 unit sold line
* there are some really bad performing products with rating below 3

In [None]:
px.scatter(df,x='rating',y='merchant_rating', color='units_sold_cluster', marginal_y ='box',title='Merchant Rating vs units sold', opacity=0.7)

In [None]:
px.scatter(df,x='rating', y='product_variation_inventory', color='units_sold_cluster', title='Product variation vs Rating')

In [None]:
fig = px.scatter(df,x='rating_count',y='rating', color='units_sold_cluster', title='Rating vs Rating count')
fig.add_trace(go.Scatter(x=np.ones((len(df)))*1103,y=df['rating'],name='Threshold 1'))
fig.add_trace(go.Scatter(x=np.ones((len(df)))*7773, y=df['rating'],name='Threshold 2'))
fig.update_layout(showlegend=False)

From above visualization we can conclude that products sold by merchants belonging to cluster 2 and 1 are Top selling,most liked and trusted by buyers
* There's some kind of thresholding that can be done on rating and rating count to separate the 3 categories of products
* still there are few overlapping data points

In [None]:
px.scatter(df,x='retail_price', y='price',color='units_sold_cluster',marginal_y='box')

Most of the top selling products seem to be concentrated on the left, where the price difference is much more significant

In [None]:
px.scatter(df, x='price', y='shipping_option_price', color= 'units_sold_cluster', title='Shipping price vs Price')

People always prefer paying less shipping charges 
we can see that most selling products has low shipping charges

In [None]:
features= ['price','retail_price','units_sold','rating','rating_count','shipping_option_price','product_variation_inventory','merchant_rating','merchant_rating_count']
corr = df[features].corr(method='spearman')

In [None]:
plt.figure(figsize=(15,8));
sb.heatmap(corr,annot=True);

Let's take a look at other binary attributes which might affect the sales of product

In [None]:
df['uses_ad_boosts'].value_counts()

In [None]:
df.groupby(['uses_ad_boosts'])['units_sold'].describe()

Consider these two groups of products: one uses ad boosts, the other doesn't
* There is a very small difference between the means of the two groups
* Does using ad boosts result in more successful products?
* How big is the difference between these two groups?
* Is the effect statistically significant?

In [None]:
data = df.query('uses_ad_boosts == 0')['units_sold'].values, df.query('uses_ad_boosts == 1')['units_sold'].values

Checking how big the effect is between two groups

![Cohen's d](https://tien-nguyen.github.io/images/cohen-d.png)

In [None]:
# calculating the effect size 

def EffectSize(group1, group2):
    """Return a Cohen's-d style effect size for the difference in group means.

    The difference ``group1.mean() - group2.mean()`` is divided by the square
    root of the size-weighted average of the two group variances.

    Both arguments must provide ``mean()``, ``var()`` and ``len()``. The
    variance is whatever ``.var()`` returns for the given type, so the default
    ``ddof`` of that type applies.
    """
    size1, size2 = len(group1), len(group2)
    mean_gap = group1.mean() - group2.mean()
    # Weight each group's variance by its number of observations.
    weighted_var_sum = size1 * group1.var() + size2 * group2.var()
    pooled_var = weighted_var_sum / (size1 + size2)
    return mean_gap / math.sqrt(pooled_var)

In [None]:
g1,g2 = data
print("Difference in means: ",g1.mean()-g2.mean())
EffectSize(g1,g2)

The effect size is too small to make any difference

Let's define a null hypothesis that there is no effect of using ad boosting on units sold with threshold of 0.05

In [None]:
class HypothesisTest(object):
    """Base class for simulation-based hypothesis tests.

    Subclasses must implement ``TestStatistic`` and ``RunModel`` and may
    override ``MakeModel``. On construction the observed data is stored in
    ``self.data``, ``MakeModel`` is called, and the test statistic of the
    observed data is stored in ``self.actual``.
    """

    def __init__(self, data):
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)

    def PValue(self, iters=1000):
        """Estimate the p-value from ``iters`` simulated data sets.

        Each simulated test statistic is kept in ``self.test_stats``. The
        returned value is the fraction of simulated statistics that are at
        least as large as the observed one.
        """
        self.test_stats = [self.TestStatistic(self.RunModel()) for _ in range(iters)]

        # ">=" rather than ">": a p-value counts outcomes at least as extreme
        # as the observed statistic, so ties must be included.
        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count/iters

    def TestStatistic(self, data):
        """Return the test statistic for ``data``; must be overridden."""
        # NotImplementedError is the built-in for abstract methods; the name
        # previously raised here was never defined and produced a NameError.
        raise NotImplementedError("subclasses must implement TestStatistic")

    def MakeModel(self):
        """Hook for building the null-hypothesis model; no-op by default."""
        pass

    def RunModel(self):
        """Return one simulated data set under the null; must be overridden."""
        raise NotImplementedError("subclasses must implement RunModel")

In [None]:
class DiffMeans(HypothesisTest):
    """Permutation test whose statistic is the absolute effect size of two groups.

    ``data`` is a pair ``(group1, group2)``. The null model pools both groups,
    shuffles the pool and splits it again into groups of the original sizes.
    """

    def TestStatistic(self, data):
        """Return the absolute value of ``EffectSize`` for the two groups."""
        first, second = data
        return abs(EffectSize(first, second))

    def MakeModel(self):
        """Record both group sizes and build the pooled array."""
        first, second = self.data
        self.n = len(first)
        self.m = len(second)
        self.pool = np.hstack((first, second))

    def RunModel(self):
        """Shuffle the pool in place and return it split at the first group's size."""
        np.random.shuffle(self.pool)
        return self.pool[:self.n], self.pool[self.n:]
    

In [None]:
test = DiffMeans(data)
test.PValue()

* The resulting P-value is much greater than the threshold we set, hence we fail to reject the null hypothesis: we found no statistically significant effect of ad boosting
* It is possible that there is a significant effect but we didn't see it here because of the small amount of data

Lets see discount applied

In [None]:
df['difference'] = df['retail_price'] - df['price']
df['discount'] = df['difference']/df['retail_price'] *100
plt.figure(figsize=(12,6))
sb.distplot(df['discount']);
plt.title('Distribution of Discount');

* Here we can see there are some products which were sold at a discount of 95% (that's some crazy sale going on)
* There are products which were sold at a price higher than the retail price; these must be products with huge demand, as people are buying them at greater prices

In [None]:
px.scatter(df,x='discount', y='rating_count', color='units_sold_cluster')

There seems to be no specific relation between units sold and discount provided 

In [None]:
df['rating_score'] = df['rating']*df['rating_count']
df['rating_score'] =df['rating_score']/df['rating_score'].max()
plt.figure(figsize=(12,6))
sb.distplot(df['rating_score']);
plt.title('Distribution of Rating Score');

In [None]:
px.scatter(df,x='rating_score',y='units_sold', color='units_sold', title='Units Sold vs Rating score')

* By combining the product rating and rating count we defined a metric rating score which seems to be very useful to understand the sales of a product

In [None]:
def make_clusters(df, column, max_clusters=7, random_state=None):
    """Plot the KMeans inertia ("elbow") curve for a single column.

    KMeans is fitted on ``df[[column]]`` once for every cluster count from 1
    to ``max_clusters`` and the inertia of each fit is plotted against the
    number of clusters on the current matplotlib figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to cluster on.
    max_clusters : int, default 7
        Largest number of clusters to try.
    random_state : int or None, default None
        Passed to ``KMeans``; give an integer to make the curve reproducible.

    Raises
    ------
    ValueError
        If ``max_clusters`` is smaller than 1.
    """
    if max_clusters < 1:
        raise ValueError("max_clusters must be at least 1")

    feature = df[[column]]
    inertia_by_k = {}
    for n_clusters in range(1, max_clusters + 1):
        model = KMeans(n_clusters=n_clusters, random_state=random_state).fit(feature)
        inertia_by_k[n_clusters] = model.inertia_

    plt.plot(list(inertia_by_k.keys()), list(inertia_by_k.values()))
    plt.title(f'{column} clusters')
    plt.xlabel('no. of clusters')
    plt.ylabel('kmeans inertia')

In [None]:
make_clusters(df,'rating_score')

3 clusters for rating score will be optimal

In [None]:
kmeans = KMeans(n_clusters=3).fit(df[['rating_score']])
df['rating_score_cluster'] = kmeans.predict(df[['rating_score']])
df= order_cluster(df=df,cluster_field_name='rating_score_cluster',target_field_name='rating_score',ascending=True)
df.groupby('rating_score_cluster')[['rating','rating_count','units_sold']].describe().T

Now we have an even more clear picture of whats selling, liked by people and what are some below average products

In [None]:
df['overall_score'] = df['rating_score_cluster'] + df['units_sold_cluster']
make_clusters(df,'overall_score');

here 2 clusters seems to be optimal 

In [None]:
kmeans= KMeans(n_clusters=2).fit(df[['overall_score']])
df['overall_score_cluster'] = kmeans.predict(df[['overall_score']])
df = order_cluster(df=df,target_field_name='overall_score', cluster_field_name='overall_score_cluster', ascending=True)
df.groupby('overall_score_cluster')[['rating_score','price','units_sold']].describe().T

* With this overall score we have identified the group of top selling, most liked products, which are the ones generating high revenue, and the group of products performing below average
* There are 213 successful products with units sold ranging from 10K to 100K at a mean price of 8.45
* In the other cluster the mean price is 8.34 but the mean units sold are much lower
* Another thing to notice is that people prefer a reasonable price: in the successful cluster the max price is 19, so products in this cluster must be worth the price.

In [None]:
df[['title_orig','units_sold','price','rating_score','units_sold_cluster','rating_score_cluster','overall_score_cluster']].sample(frac=.25).head(30)