In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in files):
        print(full_path)

# Path of the CSV that the next cell loads into `df`.
main_csv = '/kaggle/input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv'
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from collections import Counter

# Load the dataset; later cells work from `df`.
df = pd.read_csv(main_csv)

# Tally how often each value of the fast-shipping badge column occurs.
fast_shipping_flags = df['badge_fast_shipping']
Counter(fast_shipping_flags)

In [None]:
# Summary statistics of `df` (pandas' default describe() output).
df.describe()

# * Plotting histograms for several stats

In [None]:
%matplotlib inline
# Set the style before creating the figure: seaborn styles only apply to axes
# created after the call.
sns.set_style('whitegrid')

hist_columns = ['price', 'retail_price', 'shipping_option_price']

# One panel per column, stacked vertically.
fig, axs = plt.subplots(len(hist_columns), 1, figsize=(10, 10))
fig.suptitle("KDE and histogram of the several variables", fontsize=20)

# np.ravel keeps the loop valid even if `hist_columns` is cut down to a single
# entry (plt.subplots then returns a bare Axes instead of an array).
for ax, name in zip(np.ravel(axs), hist_columns):
    # histplot(kde=True, stat='density') is the documented replacement for the
    # deprecated distplot.
    sns.histplot(df[name], kde=True, stat='density', color='g', ax=ax)
    ax.set_title(name)
    # Label every panel explicitly; the previous pyplot-level labels were drawn
    # in white and only ever reached the last panel.
    ax.set_xlabel('Bins of values', fontsize=14)
    ax.set_ylabel('Density', fontsize=14)
    # Same window on every panel so the three distributions are comparable.
    ax.set_xlim(0, 30)

fig.tight_layout(pad=3.0)

In [None]:
# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated distplot.
ax = sns.histplot(df['units_sold'], kde=True, stat='density', color='g')
ax.set_title('Distribution of units_sold')
plt.show()

In [None]:
# Pairwise scatter plots of price vs. retail price, coloured by units sold.
pair_grid = sns.PairGrid(df, hue='units_sold', vars=['price', 'retail_price'], height=4)
pair_grid.map(plt.scatter)
pair_grid.add_legend()
plt.show()

In [None]:
# Joint KDE of discounted price against retail price, with fixed axis windows.
joint_kwargs = dict(kind='kde', height=10, xlim=(0, 20), ylim=(-5, 35))
sns.jointplot(data=df, x='price', y='retail_price', **joint_kwargs)
plt.show()

In [None]:
# Absolute discount: retail price minus the discounted price. The column is
# stored on `df`, so later cells that read `df` see it as well.
price_drop = df['retail_price'] - df['price']
df['price_drop'] = price_drop

plt.scatter(x=df['price_drop'], y=df['units_sold'], alpha=.3)
plt.xlabel('price drop')
plt.ylabel('units sold')

In [None]:
##### ONLY NUMERIC VALUES:
# The columns reported by describe() are used as the list of numeric features.
numeric_cols = df.describe().columns
df_numeric = df.loc[:, numeric_cols]
df_numeric

In [None]:
# Group size used when the ranked products are cut into quantile groups.
n = 100
# Rank products from most to least units sold.
df_sorted_units = df_numeric.sort_values('units_sold', ascending=False)
# The `n` best-selling rows.
top_n = df_sorted_units.iloc[:n]

# Min-max scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the ranked numeric frame with MinMaxScaler so features on different
# scales become comparable.
scaler = MinMaxScaler()

# Take the labels from the frame that is actually being scaled. Re-deriving them
# from `df` would silently mislabel columns if `df` has changed since
# `df_numeric` was built.
numeric_cols = df_sorted_units.columns

df_new = scaler.fit_transform(df_sorted_units)

# fit_transform returns a bare array, so rebuild the DataFrame. Row order (the
# units_sold ranking) is preserved; the index restarts at 0.
df_sorted_units = pd.DataFrame(df_new, columns=numeric_cols)

# Quantile Analysis

In [None]:
# Cut the ranked, min-max-scaled frame into consecutive groups of `n` rows:
# group 0 holds the best sellers, group 1 the next `n`, and so on. Leftover rows
# (fewer than `n`) at the bottom of the ranking are dropped.
#
# Every group comes from the same scaled frame. The previous version put the
# unscaled `top_n` next to scaled slices and sliced from the tail, which skipped
# the lowest `n` rows and overlapped the top group.
n_quantiles = [
    df_sorted_units.iloc[start:start + n]
    for start in range(0, len(df_sorted_units) - n + 1, n)
]

In [None]:
# Per-group statistics, one entry per quantile group in `n_quantiles`.
qms, qss = [], []        # mean / std of `price`
qms_r, qss_r = [], []    # mean / std of `retail_price`

for quan in n_quantiles:
    prices = quan[['price', 'retail_price']]
    quan_mean = prices.mean()
    # std() uses ddof=1, the same sample standard deviation describe() reports.
    quan_std = prices.std()

    qms.append(quan_mean['price'])
    qss.append(quan_std['price'])

    qms_r.append(quan_mean['retail_price'])
    # Fixed: this list used to receive the mean a second time instead of the std.
    qss_r.append(quan_std['retail_price'])

Price and retail price don't seem to differ much between the top-ranked and the lowest-ranked products.

In [None]:
# Report the number of groups actually summarised rather than a hard-coded
# count, so the text stays correct when the group size `n` changes.
n_groups = len(qms)
print(f'Mean of mean discounted price across {n_groups} quantiles:', np.mean(qms), '\n Standard deviation of the mean discounted price:', np.std(qms))
print('\n')
print(f'Mean of mean retailed price across {n_groups} quantiles:', np.mean(qms_r), '\n Standard deviation of the mean retailed price:', np.std(qms_r))

In [None]:
# Column labels that describe() reports for `df`; later cells use them to label
# the array returned by feature_search.
numeric_cols = df.describe().columns

Apply the previous test to figure out what differentiates the quantiles
- Such features have high stdev between means of the quantiles

Small note:
- For NumPy functions like np.mean and np.std, axis=0 -> the operation is applied down each column, and axis=1 -> across each row

In [None]:
import warnings
warnings.filterwarnings('ignore')

 #take only numeric cols

def feature_search(df, search_feature, n, with_qr=True):
    '''
    Score how strongly each numeric feature varies across groups of rows ranked
    by `search_feature`.

    Rows are sorted by `search_feature` (ascending) and cut into consecutive
    groups of `n` rows; leftover rows (fewer than `n`) are ignored. For every
    group the per-feature mean is computed, and the returned score is the
    standard deviation of those group means divided by their mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only the columns reported by `df.describe()` are scored.
    search_feature : str
        Column used to rank the rows.
    n : int
        Number of rows per group; must be at least 1.
    with_qr : bool, default True
        If True, each group mean is multiplied by that feature's range
        (max - min) within the group before the scores are computed.

    Returns
    -------
    numpy.ndarray
        1-D float32 array with one score per column of `df.describe()`, in that
        column order. All NaN when `df` has fewer than `n` rows.

    Raises
    ------
    ValueError
        If `n` is smaller than 1.

    Notes
    -----
    A group that contains NaN for a feature yields a NaN group mean (np.mean
    propagates NaN); np.nanstd / np.nanmean then skip that group for the
    feature. A feature whose group means average to zero gives inf or NaN.
    '''
    if n < 1:
        # The previous while-loop never terminated for n == 0.
        raise ValueError(f'n must be a positive group size, got {n!r}')

    features = df.describe().columns
    sorted_arr = df.sort_values(by=search_feature)[features].to_numpy()

    # Count full groups only. The previous `> n` loop condition dropped the last
    # full group whenever the row count was an exact multiple of n.
    n_groups = sorted_arr.shape[0] // n
    if n_groups == 0:
        return np.full(len(features), np.nan, dtype='float32')

    group_scores = []
    for start in range(0, n_groups * n, n):
        quan = sorted_arr[start:start + n, :]
        quan_feature_means = np.mean(quan, axis=0)
        if with_qr:
            # Weight each feature's mean by its spread inside this group.
            quan_range = np.max(quan, axis=0) - np.min(quan, axis=0)
            quan_feature_means = quan_feature_means * quan_range
        group_scores.append(quan_feature_means)

    # Rows = groups, columns = features. float32 matches the dtype this function
    # has always returned.
    fq_means = np.asarray(group_scores, dtype='float32')

    features_std = np.nanstd(fq_means, axis=0)
    features_mean = np.nanmean(fq_means, axis=0)
    return features_std / features_mean

In [None]:
# Raw std-to-mean ratios without range weighting, using groups of 10 rows
# ranked by units_sold.
feature_search(df_sorted_units, 'units_sold', 10, with_qr=False)

In [None]:
### WITHOUT QUAN RANGE
std_mean_ratio = feature_search(df_sorted_units, 'units_sold', 10, with_qr=False)
# Attach a column label to every ratio, by position.
fv_units_sold = {column: std_mean_ratio[position] for position, column in enumerate(numeric_cols)}
print(
    'Below is the measure of sensitivity (stdev to mean) for numeric features for quantiles created for the units_sold column: \n',
    fv_units_sold,
)

This shows the general variability of features across quantile groups of 10 products each, with products ranked by units sold.

It seems like discounted price exhibits the least amount of variability across units_sold quantiles ==> price doesn't separate the top sold products against other undersellers

Ratings are also mostly invariable with units_sold ==> doesn't separate top sellers with others

Merchant's rating also doesn't matter that much.

Most differentiating feature seems to be:
- Merchant getting badge for fast shipping
- Rating counts (maybe that the more rating the person sees, the more trustable it is)
    - What about the actual rating? Well, the mean rating across all products don't really vary that much (3.82 mean with 0.52 stdev)
- Express shipping

HOWEVER, some of these features are binary data, so it's easy to get variability => not trustable


Augmenting the variability to consider the range of the dataset

This is done by **multiplying the feature range in a quantile to its variability ratio**

*This is equivalent to multiplying each data point in the quantile by the range of that feature within the quantile. It down-weights features with little range, because for those a small change in the stdev is enough to produce high variability.*

In [None]:
#### WITH QUAN RANGE
std_mean_ratio_quan = feature_search(df_sorted_units, 'units_sold', 10, with_qr=True)

# Attach a column label to every range-weighted ratio, by position.
fv_units_sold_quan = {}
for position, column in enumerate(numeric_cols):
    fv_units_sold_quan[column] = std_mean_ratio_quan[position]

print(
    'Below is the measure of sensitivity (stdev to mean) for numeric features for quantiles created for the units_sold column: \n',
    fv_units_sold_quan,
)

# Top 5 most sale quantile-sensitive features (i.e products' features that varies the most with products' sale):

In [None]:
# Rank features by their range-weighted score, highest first. NaN scores are
# left out: NaN compares false against everything, so it would make the sort
# order unreliable.
ranked_features = sorted(
    (name for name, score in fv_units_sold_quan.items() if not np.isnan(score)),
    key=fv_units_sold_quan.get,
    reverse=True,
)

# Keep five entries, matching the variable name and the section heading (the
# slice used to take ten).
top_5 = ranked_features[:5]

for feature in top_5:
    print(feature, fv_units_sold_quan[feature])

# Feature ratio between the two feature variabilities, that with and without multiplying the quantile range (to see how much the quantile range mattered for each feature):

In [None]:
def range_matter_func(fv_units_sold_quan, fv_units_sold):
    '''
    Print the features whose variability score changed most under range weighting.

    For every feature in `fv_units_sold` the ratio
    `fv_units_sold_quan[name] / fv_units_sold[name]` is computed. The ten
    features with the largest ratio are printed, largest first, one per line.

    Parameters
    ----------
    fv_units_sold_quan : dict
        Feature name -> score computed with range weighting. Must contain every
        key of `fv_units_sold`.
    fv_units_sold : dict
        Feature name -> score computed without range weighting.

    Returns
    -------
    None
        The result is only printed.

    Notes
    -----
    'has_urgency_banner' is always excluded from the report, and so is any
    feature whose ratio is NaN.
    '''
    range_matter = {
        name: fv_units_sold_quan[name] / fv_units_sold[name]
        for name in fv_units_sold
    }

    # pop() instead of del: frames without this column no longer raise KeyError.
    range_matter.pop('has_urgency_banner', None)

    # NaN compares false against everything and would scramble the ranking.
    range_matter = {
        name: ratio for name, ratio in range_matter.items() if not np.isnan(ratio)
    }

    top10_by_rm = sorted(range_matter, key=range_matter.get, reverse=True)[:10]

    for feature in top10_by_rm:
        print(feature, "| range_matter: ", range_matter[feature])

In [None]:
# Print the range-weighted / unweighted score ratio for the features where it is largest.
range_matter_func(fv_units_sold_quan, fv_units_sold)

Conclusion:
- Range does have quite an effect on the variability of the data.

# Some more chart analysis plotting units_sold against all other features

In [None]:
# One scatter plot per numeric feature, always against the scaled units_sold.
units_sold_scaled = df_sorted_units['units_sold']
for feature in numeric_cols:
    _, ax = plt.subplots()
    ax.scatter(units_sold_scaled, df_sorted_units[feature], alpha=0.03)
    ax.set_xlabel('units_sold')
    ax.set_ylabel(feature)
    plt.show()

- Inventory total is more for the least unit sold ==> they couldn't sell all which makes sense
- A familiar pattern: 
    - ***Price, retail_price converges for top sellers*** ==> we see market price here, because top sellers don't compete about price, while ***bottom sellers place price that might be not beneficial to them.*** (THIS IS EXACTLY LIKE A DEMAND CURVE)
    - The number of countries shipped to: surprisingly, **top sellers home in on a few countries**, while bottom sellers tend to ship practically everywhere.
        - It would be nice to view which countries are these referring to, but dataset doesn't have it so...