In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

<h1>Background</h1>
<p> We are provided sales information for a Chinese online retailer, Wish. For those unfamiliar with Wish, think of it as a discounted price version of Amazon. For our analysis, we will be focusing on clothing sales during the month of August. Specifically, we will look to address the following questions:
    <ul>
        <li>Are sales sensitive to price drops?</li>
        <li>Do bad products sell?</li>
        <li>What is the relationship between product quality (as determined by ratings), price, and sales?</li>
    </ul>  
</p>

In [None]:
# Read the August 2020 Wish summer-products file and preview the first rows.
data_path = '/kaggle/input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv'
df = pd.read_csv(data_path)
df.head()

<h1>Data Exploration and Feature Engineering</h1>

Create a new column that extracts the gender from the title. We will further feature engineer this to a number later.

In [None]:
# Derive a Gender flag from the original title: 'F' when the title mentions
# women, otherwise 'M'.
#
# A single case-insensitive pattern covers "Women", "Women's", "WOMEN" and
# "Woman". The previous version built two lists, but the second one was never
# consulted because `test[i] or test2[i]` always returns the (non-empty) first
# string, and the `test_list` of men's keywords was unused.
# na=False makes a missing title count as "no match" instead of raising.
is_womens = df['title_orig'].str.contains(r"wom[ae]n", case=False, na=False)
df['Gender'] = np.where(is_womens, 'F', 'M')

In [None]:
# Remove the translated title, currency, merchant id/picture fields and
# product id/URL/picture columns in one step.
unused_columns = [
    'title',
    'currency_buyer',
    'merchant_id',
    'merchant_has_profile_picture',
    'merchant_profile_picture',
    'product_url',
    'product_picture',
    'product_id',
]
df = df.drop(columns=unused_columns)

The crawl month is just the month that the web crawler supplied the data. This isn't going to be useful for analysis since all months are 08/2020

In [None]:
# crawl_month is constant for this file, so it is removed.
df = df.drop(columns=['crawl_month'])

Feature engineer the urgency_text column to reflect whether or not the product had an urgent text or sale pitch associated with the sale.

In [None]:
# Show the distinct urgency messages and how many rows have none.
urgency = df['urgency_text']
print(urgency.value_counts())
print(urgency.isnull().sum())

In [None]:
# Collapse urgency_text into a Y/N flag: the two known banner messages become
# 'Y' and a missing message becomes 'N'.
#
# The result is assigned back to the column. The previous
# `df['urgency_text'].replace(..., inplace=True)` mutated an intermediate
# Series, which raises a FutureWarning in pandas 2.x and does not update `df`
# at all once Copy-on-Write is enabled.
urgency_flags = {
    'Quantité limitée !': 'Y',
    'Réduction sur les achats en gros': 'Y',
}
df['urgency_text'] = df['urgency_text'].replace(urgency_flags).fillna('N')

# has_urgency_banner is removed; urgency_text now holds the Y/N flag.
df = df.drop(columns='has_urgency_banner')

In [None]:
# Frequency of each origin country code.
df.origin_country.value_counts()

Let's bin all the other countries together, so that each product is either from CN or from elsewhere.

In [None]:
# Bin every non-Chinese origin into 'Other'.
#
# The replacement is restricted to the origin_country column: the previous
# DataFrame-wide `df.replace([...], 'Other')` would also rewrite any cell in
# any other column that happened to equal 'US', 'GB', etc.
# Missing origins are left as NaN rather than being labelled 'Other'.
origin = df['origin_country']
df['origin_country'] = origin.where(origin.isna() | origin.eq('CN'), 'Other')
df['origin_country'].value_counts()

In [None]:
# Label inventory as "Full" when it equals 50, otherwise "Not Full".
df['inventory_total'] = np.where(df['inventory_total'] == 50, "Full", "Not Full")

In [None]:
# Count of "Full" versus "Not Full" inventory labels.
df['inventory_total'].value_counts()

In [None]:
# Preview the first 15 merchant titles and names side by side.
df[['merchant_title', 'merchant_name']].head(15)

We can see that some merchants have repeat occurrences of several key products. Not sure if this is useful at the moment, but we will create a feature to track repeat merchants and drop the title and name columns.

In [None]:
# Flag products whose merchant_title appears more than once in the data.
#
# Mapping each title to its frequency is vectorized and avoids the previous
# per-row `counts.loc[ele][0]` lookup, which relied on positional `[]` access
# on a label-indexed Series (deprecated in pandas 2.x) and raised KeyError for
# a missing title. A missing title maps to NaN, which compares False -> 'N'.
merchant_counts = df['merchant_title'].value_counts()
df['repeat'] = np.where(df['merchant_title'].map(merchant_counts) > 1, 'Y', 'N')

# Remove the merchant text columns along with theme and product_color.
df = df.drop(columns=[
    'merchant_title',
    'merchant_name',
    'merchant_info_subtitle',
    'theme',
    'product_color',
])


Let's continue to scrape through the title_orig and tags features to extract the main type of item any given product is. Narrowing down the products to a finite grouping should prove more significant than sentence or list structures of words.

In [None]:
# Clothing keywords searched for in the product title, in priority order.
items = [
    'Shirt',
    'Dress',
    'Shorts',
    'Pants',
    'Skirt',
    'Sweater',
]

In [None]:
# Assign each product the first keyword from `items` (in list order) that
# occurs in its original title; rows with no match stay np.nan.
#
# The list is sized from the frame itself instead of the hard-coded 1573, so
# the cell keeps working if the number of rows changes.
clothes_test = [np.nan] * len(df)
for ind, ele in enumerate(df['title_orig']):
    # A missing title is a float NaN and cannot be searched; leave it unlabelled.
    if not isinstance(ele, str):
        continue
    clothes_test[ind] = next((item for item in items if item in ele), np.nan)

In [None]:
# Wrap the per-row labels in a one-column frame (column label 0).
clothes = pd.DataFrame({0: clothes_test})

In [None]:
# Per-category counts of labelled rows, followed by the total number labelled.
label_counts = clothes.value_counts()
print(label_counts)
print(label_counts.sum())

We've started creating a converted array that extracted the type of clothing from the original title. We still have ~500 NaN values remaining. Let's explore them and see if a common clothing article was missed while feature engineering.

In [None]:
# Store the extracted labels on the main frame (aligned on the row index).
df['Clothing'] = clothes.iloc[:, 0]

In [None]:
# Rows that still have no clothing label, with the text fields used for matching.
df.loc[df['Clothing'].isna(), ['title_orig', 'tags']]

I have clearly overlooked a few key categories. Let's re-create our items key list to include things like beachwear, swimsuit, romper, jumpsuit, t-shirts, and blouse. Let's also switch gears and try to iterate over the tags feature rather than the title this time.

In [None]:
# Second pass: for rows that are still unlabelled, take the first keyword
# (in list order) that occurs in the tags string.
items_round2 = ['beachwear', 'beach wear', 'swimsuit', 'romper', 'jumpsuit', 't-shirts', 'blouse']
for ind, ele in enumerate(df['tags']):
    if clothes_test[ind] is not np.nan:
        continue  # already labelled by an earlier pass
    for item in items_round2:
        if item in ele:
            clothes_test[ind] = item
            break

In [None]:
# Rebuild the label frame from the updated list and refresh the column.
clothes = pd.DataFrame({0: clothes_test})
df['Clothing'] = clothes.iloc[:, 0]

In [None]:
# Label frequencies, including the rows that are still NaN.
df.Clothing.value_counts(dropna=False)

In [None]:
# Remaining unlabelled rows after the second pass.
df.loc[df['Clothing'].isna(), ['title_orig', 'tags']]

We're definitely getting closer. There are a few more similar words or phrasings that we should be able to capture before we re-bin some of the categories.

In [None]:
# Third pass over the tags with additional keywords and capitalisations;
# only rows without a label are touched, and the first keyword in list order wins.
items_round3 = ['bikini', 'Bikini', 'T-shirt', 'Shorts', 'Vest', 'Tank', 'tank']
for ind, ele in enumerate(df['tags']):
    if clothes_test[ind] is not np.nan:
        continue  # already labelled by an earlier pass
    for item in items_round3:
        if item in ele:
            clothes_test[ind] = item
            break

In [None]:
# Rebuild the label frame, refresh the column and show the new frequencies.
clothes = pd.DataFrame({0: clothes_test})
df['Clothing'] = clothes.iloc[:, 0]
df.Clothing.value_counts(dropna=False)

Down to 86! Not too bad. Let's stop there for now, re-group some of the categories and see what we've got.

In [None]:
# Merge the raw keyword labels into the final clothing categories and label
# everything still missing as 'Other'.
#
# One mapping applied in a single pass and assigned back to the column; the
# previous sequence of `df['Clothing'].replace(..., inplace=True)` calls
# mutated an intermediate Series, which does not update `df` under pandas
# Copy-on-Write.
clothing_groups = {
    'T-shirt': 'Shirt', 't-shirts': 'Shirt',
    'Vest': 'Blouse', 'sweater': 'Blouse', 'Sweater': 'Blouse', 'blouse': 'Blouse',
    'beachwear': 'Swimsuit', 'Bikini': 'Swimsuit', 'bikini': 'Swimsuit',
    'beach wear': 'Swimsuit', 'swimsuit': 'Swimsuit',
    'romper': 'Romper', 'jumpsuit': 'Romper',
    'tank': 'Tank',
}
df['Clothing'] = df['Clothing'].replace(clothing_groups).fillna('Other')

df['Clothing'].value_counts(dropna=False)

In [None]:
# The free-text columns have been mined for Gender and Clothing; remove them.
text_columns = ['title_orig', 'tags']
df = df.drop(columns=text_columns)

Let's run a correlation matrix to see where we stand with the interrelations between some of the quantitative features.

Interesting to see that units sold has a higher correlation with the count of individual star reviews rather than with the overall rating. Let's go ahead and trim away a few more of the features that are likely leaks (such as shipping price to overall price).

In [None]:
# Merchant-rating, shipping and badge columns to remove from the frame.
drops = [
    'merchant_rating',
    'merchant_rating_count',
    'shipping_is_express',
    'shipping_option_price',
    'product_variation_inventory',
    'badge_fast_shipping',
    'badge_product_quality',
    'badge_local_product',
    'badges_count',
    'shipping_option_name',
]
df = df.drop(columns=drops)

In [None]:
# Percentage discount of the sale price relative to retail price
# (positive when the sale price is below retail).
discount = df['price'].sub(df['retail_price']).div(df['retail_price']).mul(-100)

In [None]:
# Attach the discount as a column and view it next to prices and units sold.
df = df.assign(Discount=discount)
df.loc[:, ['price', 'retail_price', 'Discount', 'units_sold']]

In [None]:
# Lower-triangle heatmap of pairwise correlations between numeric features.
#
# Numeric (and boolean) columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer silently skips object columns and raises instead.
corr = df.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(15, 10))
colormap = sns.diverging_palette(220, 10, as_cmap=True)
# Boolean mask that hides the diagonal and the redundant upper triangle.
dropSelf = np.triu(np.ones_like(corr, dtype=bool))
# Draw on the axes created above so the requested figure size is the one used.
sns.heatmap(corr, cmap=colormap, linewidths=.5, annot=True, fmt=".2f", mask=dropSelf, ax=ax)
ax.set_title('Correlation Analysis');

Interestingly, it would appear that ratings hold far stronger correlation to unit sales than price. Although this might be construed as leaked data, we will consider it a feature since people often look at stars and ratings when assessing whether or not to buy a product.

Additionally, the percent discount doesn't appear to drive sales as much as originally anticipated. Let's transition into some more visualizations to delve deeper into these hypotheses.

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Impute the remaining gaps: per-star rating counts get their column mean and
# origin_country gets its most frequent value.
#
# Results are assigned back to the frame; the previous
# `df[col].fillna(..., inplace=True)` calls mutated an intermediate Series,
# which does not update `df` under pandas Copy-on-Write.
rating_count_cols = [
    'rating_five_count',
    'rating_four_count',
    'rating_three_count',
    'rating_two_count',
    'rating_one_count',
]
# fillna with a Series of means fills each column with its own mean.
df[rating_count_cols] = df[rating_count_cols].fillna(df[rating_count_cols].mean())
df['origin_country'] = df['origin_country'].fillna(df['origin_country'].mode()[0])

# product_variation_size_id is removed rather than imputed.
df = df.drop(columns='product_variation_size_id')

<h1>Data Visualization</h1>

In [None]:
# Overlay the sale-price and retail-price distributions on one set of axes.
fig, ax = plt.subplots(figsize=(18, 10))
sns.distplot(df['price'], label="Sale Price", ax=ax)
sns.distplot(df['retail_price'], label="Retail Price", ax=ax)
ax.legend()
ax.set_xlabel("EUR")
ax.set_title("Retail and Sale Price Distributions")
plt.show()

Most items sold have a long tail of values for their retail price. However, notice that most of the sale prices seem rather normally distributed. Let's take a look at the relative amounts of outliers for each batch compared to their own means and standard deviations.

In [None]:
# One box plot per price column, each in its own figure.
boxplot_specs = (
    ('price', "Price Distribution"),
    ('retail_price', "Retail Price Distribution"),
)
for column, title in boxplot_specs:
    plt.figure(figsize=(15, 5))
    sns.boxplot(x=df[column])
    plt.xlabel("EUR")
    plt.title(title)
    plt.show()

Definitely confirms that there is a wide range of retail prices, but they are fairly well normalized once they are actually sold.

In [None]:
# Total units sold and mean discount per clothing category, each sorted ascending.
by_clothing = df.groupby("Clothing")
result_sold = by_clothing['units_sold'].sum().reset_index().sort_values(by='units_sold')
result_discount = by_clothing['Discount'].mean().reset_index().sort_values(by='Discount')

In [None]:
# Left: cumulative units sold per category. Right: discount per category,
# drawn from the row-level frame and ordered by the mean discount.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

sns.barplot(data=result_sold, x='Clothing', y='units_sold',
            order=result_sold['Clothing'], ax=ax1)
ax1.set(xlabel="Clothing Category", ylabel="Amount Sold",
        title="Cummulative Sales per Clothing Category")

sns.barplot(data=df, x='Clothing', y='Discount',
            order=result_discount['Clothing'], ax=ax2)
ax2.set(xlabel="Clothing Category", ylabel="Discount %")
ax2.set_title("Discount % per Clothing Category")



Overall, it appears that, with the exception of dresses, there is an overall inverse relationship between cumulative sales by clothing category and the discount percent for said category. In other words, the higher the percentage of discount (more negative per the above graph), then the larger overall volume of clothes sales!

In [None]:
# Bucket the rating into five right-closed intervals (0,1], (1,2], ... (4,5],
# labelled by the upper edge of each interval.
rating_edges = [0, 1, 2, 3, 4, 5]
star_labels = ['1*', '2*', '3*', '4*', '5*']
df['rating_bins'] = pd.cut(df['rating'], bins=rating_edges, labels=star_labels)

In [None]:
# Total units sold and mean discount per rating bin, each sorted ascending.
#
# rating_bins is categorical. observed=False is passed explicitly so that
# rating bins with no products stay in the result; relying on the implicit
# default is deprecated in pandas 2.1+ and the default is slated to change.
by_rating = df.groupby("rating_bins", observed=False)
ratings_sold = by_rating['units_sold'].sum().reset_index().sort_values(by='units_sold')
ratings_discount = by_rating['Discount'].mean().reset_index().sort_values(by='Discount')

In [None]:
f,(ax1,ax2) = plt.subplots(1,2, figsize=(20,8))
sns.barplot(x='rating_bins',y='units_sold',data=ratings_sold, order=ratings_sold['rating_bins'], ax = ax1)
ax1.set_xlabel("Ratings")
ax1.set_ylabel("Amount Sold")
ax1.set_title("Cummulative Sales by Ratings")
sns.barplot(x='rating_bins', y='Discount', data=ratings_discount, order=ratings_discount['rating_bins'], ax=ax2)
ax2.set_xlabel("Ratings")
ax2.set_ylabel("Discount %")
ax2.set_title("Discount % by Ratings")


Again we can see a largely inverse relationship between ratings and discount percentages as a function of sales. In other words, if a product is successfully selling, then it is unlikely to be discounted. Or at the very least, it will be discounted at a much lower percent.

Interestingly, middle to upper-middle rated products appear to sell the most items by cumulative counts. This may be a function of the percentage breakdown of product ratings, and won't be analyzed this time, but would certainly be a factor to consider.

Next, let's take a look at the relationship between ratings and cost. Do better rated products inherently cost more?

In [None]:
# Mean sale price and mean retail price per rating bin, each sorted ascending.
#
# rating_bins is categorical. observed=False is passed explicitly so that
# rating bins with no products stay in the result; relying on the implicit
# default is deprecated in pandas 2.1+ and the default is slated to change.
by_rating = df.groupby('rating_bins', observed=False)
price_bins = by_rating['price'].mean().reset_index().sort_values('price')
discount_bins = by_rating['retail_price'].mean().reset_index().sort_values('retail_price')

In [None]:
f,(ax1,ax2) = plt.subplots(1,2, figsize=(20,8))
sns.barplot(x='rating_bins', y='price', data=price_bins, order=price_bins['rating_bins'], ax=ax1)
ax1.set_xlabel('Ratings')
ax1.set_ylabel('Sale Price')
ax1.set_title("Sale Price vs Ratings")
sns.barplot(x='rating_bins', y='retail_price', data=discount_bins, order=discount_bins['rating_bins'],ax=ax2)
ax2.set_xlabel("Ratings")
ax2.set_ylabel("Retail Price")
ax2.set_title("Retail Price vs Ratings")

Interestingly, the actual sale price largely follows the expected relationship of paying more for a product with higher customer reviews. However, this doesn't seem to be the case with the original retail price. Instead, lower rated products retail significantly more than higher or medium ranked products.

This largely follows the previous chart of discount percent by rating. These originally low rated, high retail products are sold at some of the highest discount percentages. Ultimately this suggests a correlation between initial retail, ratings, and the end need to discount products to induce sales.

In [None]:
# Distribution of the discount percentage across all products.
sns.distplot(df.Discount)

The issue with our discount distribution is that it is essentially two sets of data merged into one. Let's create another feature that tracks whether or not there is a discount. We will use the mean of the data to establish our boundary. At ~25%, we will say anything above is on sale, and anything below is selling at normal retail price.

In [None]:
# Summary statistics of the discount column (the mean is used as the sale threshold).
df.loc[:, ['Discount']].describe()

In [None]:
# Binary sale flag: 1 when the discount is above the mean discount, else 0.
#
# The comparison is cast straight to int64 and assigned once. The previous
# `df['Sale'].replace({False: 0, True: 1}, inplace=True)` mutated an
# intermediate Series, which does not update `df` under pandas Copy-on-Write
# and would have left the column boolean.
df['Sale'] = (df['Discount'] > df['Discount'].mean()).astype('int64')

Now let's break down our previous graphs into categories based on using ads to boost sales and the binary assessment of product being on sale.

In [None]:
# Total units sold and mean discount per (rating bin, ad-boost flag) pair,
# each sorted ascending.
#
# rating_bins is categorical. observed=False is passed explicitly so that
# rating bins with no products stay in the result; relying on the implicit
# default is deprecated in pandas 2.1+ and the default is slated to change.
by_rating_ads = df.groupby(["rating_bins", "uses_ad_boosts"], observed=False)
ratings_sold2 = by_rating_ads['units_sold'].sum().reset_index().sort_values(by='units_sold')
ratings_discount2 = by_rating_ads['Discount'].mean().reset_index().sort_values(by='Discount')

In [None]:
f,(ax1,ax2) = plt.subplots(1,2, figsize=(20,8))
sns.barplot(x='rating_bins',y='units_sold', hue='uses_ad_boosts', data=ratings_sold2, ax = ax1)
ax1.set_xlabel("Ratings")
ax1.set_ylabel("Amount Sold")
ax1.set_title("Cummulative Sales by Ratings")
sns.barplot(x='rating_bins', y='Discount', hue='uses_ad_boosts', data=ratings_discount2, ax=ax2)
ax2.set_xlabel("Ratings")
ax2.set_ylabel("Discount %")
ax2.set_title("Discount % by Ratings")

The use of ads does not appear to correlate with cumulative sales or discount percentages with regard to user rating reviews. Let's see how it correlates with sale price and retail price.

In [None]:
# Mean sale price and mean retail price per (rating bin, ad-boost flag) pair,
# each sorted ascending.
#
# rating_bins is categorical. observed=False is passed explicitly so that
# rating bins with no products stay in the result; relying on the implicit
# default is deprecated in pandas 2.1+ and the default is slated to change.
by_rating_ads = df.groupby(['rating_bins', 'uses_ad_boosts'], observed=False)
price_bins2 = by_rating_ads['price'].mean().reset_index().sort_values('price')
discount_bins2 = by_rating_ads['retail_price'].mean().reset_index().sort_values('retail_price')

In [None]:
f,(ax1,ax2) = plt.subplots(1,2, figsize=(20,8))
sns.barplot(x='rating_bins', y='price', hue='uses_ad_boosts', data=price_bins2, ax=ax1)
ax1.set_xlabel('Ratings')
ax1.set_ylabel('Sale Price')
ax1.set_title("Sale Price vs Ratings")
sns.barplot(x='rating_bins', y='retail_price', hue='uses_ad_boosts', data=discount_bins2,ax=ax2)
ax2.set_xlabel("Ratings")
ax2.set_ylabel("Retail Price")
ax2.set_title("Retail Price vs Ratings")

Overall, there doesn't appear to be a major change in retail or sale price based upon the use of ads. Now let's try using the binary sale feature for evaluation.

In [None]:
# Total units sold and mean discount per (rating bin, Sale flag) pair,
# each sorted ascending.
#
# rating_bins is categorical. observed=False is passed explicitly so that
# rating bins with no products stay in the result; relying on the implicit
# default is deprecated in pandas 2.1+ and the default is slated to change.
by_rating_sale = df.groupby(["rating_bins", "Sale"], observed=False)
ratings_sold3 = by_rating_sale['units_sold'].sum().reset_index().sort_values(by='units_sold')
ratings_discount3 = by_rating_sale['Discount'].mean().reset_index().sort_values(by='Discount')

In [None]:
f,(ax1,ax2) = plt.subplots(1,2, figsize=(20,8))
sns.barplot(x='rating_bins',y='units_sold', hue='Sale', data=ratings_sold3, ax = ax1)
ax1.set_xlabel("Ratings")
ax1.set_ylabel("Amount Sold")
ax1.set_title("Cummulative Sales by Ratings")
sns.barplot(x='rating_bins', y='Discount', hue='Sale', data=ratings_discount3, ax=ax2)
ax2.set_xlabel("Ratings")
ax2.set_ylabel("Discount %")
ax2.set_title("Discount % by Ratings")

Obviously, a product that is not on sale will have a negative discount percent (price markup). So this graph isn't necessarily a great insight.

In [None]:
# Mean sale price and mean retail price per (rating bin, Sale flag) pair,
# each sorted ascending.
#
# rating_bins is categorical. observed=False is passed explicitly so that
# rating bins with no products stay in the result; relying on the implicit
# default is deprecated in pandas 2.1+ and the default is slated to change.
by_rating_sale = df.groupby(['rating_bins', 'Sale'], observed=False)
price_bins3 = by_rating_sale['price'].mean().reset_index().sort_values('price')
discount_bins3 = by_rating_sale['retail_price'].mean().reset_index().sort_values('retail_price')

In [None]:
f,(ax1,ax2) = plt.subplots(1,2, figsize=(20,8))
sns.barplot(x='rating_bins', y='price', hue='Sale', data=price_bins3, ax=ax1)
ax1.set_xlabel('Ratings')
ax1.set_ylabel('Sale Price')
ax1.set_title("Sale Price vs Ratings")
sns.barplot(x='rating_bins', y='retail_price', hue='Sale', data=discount_bins3,ax=ax2)
ax2.set_xlabel("Ratings")
ax2.set_ylabel("Retail Price")
ax2.set_title("Retail Price vs Ratings")

Here's an interesting observation: The retail prices for non-sale items are all around 8 EUR. In contrast, there is a wide range of retail prices corresponding to sale items. This intuitively makes sense though. If a product is initially listed at high retail and unable to sell, then ultimately it would be listed at a discounted price.

<h1>Model Construction</h1>
<p>Finally, let's try to predict unit sales based upon our feature engineering.</p>

In [None]:
# Feature columns for the model, split into numeric inputs and the
# categorical inputs that are integer-encoded via data_map.
num_cols = [
    'price',
    'retail_price',
    'uses_ad_boosts',
    'rating',
    'rating_count',
    'countries_shipped_to',
    'Discount',
    'Sale',
]
cat_cols = [
    'inventory_total',
    'urgency_text',
    'origin_country',
    'Gender',
    'repeat',
    'Clothing',
    'rating_bins',
]

In [None]:
# Feature matrix: categorical columns first, then numeric ones.
# .copy() makes X an independent frame so that encoding it later does not
# operate on a slice of df (which triggers SettingWithCopyWarning).
X = df[cat_cols + num_cols].copy()

In [None]:
# Integer codes for each categorical feature, keyed by column name.
clothing_order = ['Shirt', 'Dress', 'Swimsuit', 'Shorts', 'Romper',
                  'Blouse', 'Pants', 'Tank', 'Other', 'Skirt']
data_map = dict(
    inventory_total={'Full': 1, 'Not Full': 0},
    urgency_text={'Y': 1, 'N': 0},
    origin_country={'CN': 1, 'Other': 0},
    Gender={'F': 1, 'M': 0},
    repeat={'Y': 1, 'N': 0},
    # Clothing categories are numbered 1..10 in the order listed above.
    Clothing={name: code for code, name in enumerate(clothing_order, start=1)},
    # '1*' -> 1 ... '5*' -> 5
    rating_bins={'%d*' % stars: stars for stars in range(1, 6)},
)

In [None]:
# Names of the columns of X whose dtype is int64 or float64.
numeric_dtypes = ["int64", "float64"]
num_feats = X.select_dtypes(include=numeric_dtypes).columns

In [None]:
# Display the numeric feature names that will be passed to the preprocessor.
num_feats

In [None]:
# Encode the categorical columns with the integer codes from data_map and
# define the target.
#
# Work on an explicit copy: X was sliced out of df, and replacing values in
# place on that slice triggers SettingWithCopyWarning.
X = X.copy()
for col, mapping in data_map.items():
    # Cast to object first so categorical columns are mapped value by value;
    # labels without a code (or missing values) become NaN.
    X[col] = X[col].astype(object).map(mapping).astype('float64')

# Re-derive the numeric feature list now that every column is numeric. The
# list built before encoding excluded all categorical columns, so the
# ColumnTransformer (which drops unlisted columns by default) silently
# discarded every engineered categorical feature.
num_feats = X.select_dtypes(include=["int64", "float64"]).columns

# 1-D target: scikit-learn estimators expect y of shape (n_samples,), and a
# single-column DataFrame triggers a DataConversionWarning.
y = df['units_sold']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the reported scores, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric preprocessing: fill gaps with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
from sklearn.compose import ColumnTransformer

# Apply the numeric pipeline to the columns listed in num_feats.
numeric_branch = ('num', numeric_transformer, num_feats)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

In [None]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Fit each candidate classifier behind the shared preprocessing step and
# report its accuracy on the held-out split.
#
# Every stochastic estimator gets a fixed seed so the scores are reproducible.
# SVC no longer sets probability=True: only .score() is used below, which
# relies on predict(), so the extra internal cross-validation for probability
# calibration was wasted work.
seed = 42
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025),
    DecisionTreeClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    AdaBoostClassifier(random_state=seed),
    GradientBoostingClassifier(random_state=seed),
]

# Flatten the targets to 1-D; works whether y is a Series or a one-column
# DataFrame and avoids scikit-learn's column-vector DataConversionWarning.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train_flat)
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test_flat))

We will wrap up our model construction here, without getting too into the details of hyperparameter tuning at this point. As we can see from our initial runs, Random Forest Classification and Gradient Boosting Classification yielded similar accuracies of ~75%. Not too bad for an initial model and no further tuning.

<h1>Conclusion</h1>
<p>Let's recap on what we set out to accomplish:
    <ul>
        <li> Evaluate if humans are sensitive to discount percents: ✔</li>
        <li> Evaluate the success of bad product sales: ✔</li>
        <li> Evaluate the relationship between product quality (based on user rating), sales, and price: ✔</li>
    </ul>
Further areas of improvement include: 
<ul>
    <li>Further feature engineering</li>
    <li>Additional pipelines for hyperparameter tuning</li>
    <li>Further statistical analysis on the success of model selection</li>
</ul>

As a newer practitioner, please provide feedback and input below! Thanks!
    
</p>