In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)


In [None]:
from plotly.offline import iplot, init_notebook_mode

import cufflinks as cf
import plotly.graph_objs as go
# import chart_studio.plotly as py

init_notebook_mode(connected=True)
cf.go_offline(connected=True)

# Set global theme
cf.set_config_file(world_readable=True, theme='ggplot')

# Data Info

## Context

Studying top products requires more than just product listings. You also need to know what sells well and what does not.

## Content

This dataset contains product listings as well as products ratings and sales performance, which you would not find in other datasets.

With this, you can finally start to look for correlations and patterns regarding the success of a product and the various components.

## Inspiration

How about trying to validate the established idea of human sensitiveness to price drops ? (discounted price compared to original retail_price)

You may look for top categories of products so that you know what sells best

Do bad products sell ? 

How about the relationship between the quality of a product (ratings) and its success ? 

Does the price factor into this ?



## Infos on Columns

1. title : Title localized for European countries. May be the same as title_orig if the seller did not offer a translation.

2. title_orig : Original english title of the product.

3. price : price for the buyer

4. retail_price : Retail price, or reference price in other stores/places. Used by the seller to indicate a regular value or the price before discount.

5. currency_buyer : currency of the prices

6. units_sold : Number of units sold. Lower bound approximation by steps

7. uses_ad_boosts : Whether the seller paid to boost his product within the platform (highlighting, better placement or whatever).

8. rating : Mean product rating.

9. rating_count : Total number of ratings of the product

10. rating_five_count  : Number of 5-star ratings (there are also similar rating columns for four, three .. stars)

11. badges_count : Number of badges the product or the seller have.

12. badge_local_product : A badge that denotes the product is a local product. Conditions may vary (being produced locally, or something else). Some people may prefer buying local products rather than non-local ones. 1 means Yes, has the badge.

13. badge_product_quality : Badge awarded when many buyers consistently gave good evaluations 1 means Yes, has the badge

14. badge_fast_shipping : Badge awarded when this product's order is consistently shipped rapidly

15. tags : tags set by the seller

16. product_color : Product's main color

17. product_variation_size_id : One of the available size variation for this product

18. product_variation_inventory : Inventory the seller has. Max allowed quantity is 50

19. shipping_option_price : shipping price

20. shipping_is_express : whether the shipping is express or not. 1 for True

21. countries_shipped_to : Number of countries this product is shipped to. Sellers may choose to limit where they ship a product to

22. inventory_total : Total inventory for all the product's variations (size/color variations for instance)

23. has_urgency_banner : whether there was an urgency banner with an urgency text

24. merchant_rating : merchant's rating

*Note: Not all the columns are present in the above description.*


In [None]:
df = pd.read_csv("/kaggle/input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv")

In [None]:
# Both keyword tables are read from the same Kaggle input folder as the main dataset.
_categories_dir = "/kaggle/input/summer-products-and-sales-in-ecommerce-wish"
uniuqe_categories = pd.read_csv(_categories_dir + "/unique-categories.csv")
uniuqe_categories_count = pd.read_csv(_categories_dir + "/unique-categories.sorted-by-count.csv")

In [None]:
df.head(2)

# Explorations

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()

## Merchant Columns

In [None]:
# Show the names of all columns whose name starts with "merchant".
is_merchant_column = df.columns.str.startswith("merchant")
print(df.columns[is_merchant_column].values)

Let's drop everything related to the merchant except *merchant_id, merchant_rating_count and merchant_rating*.

In [None]:
df.drop(['merchant_has_profile_picture', 'merchant_profile_picture','merchant_title' ,'merchant_name', 'merchant_info_subtitle'], inplace=True, axis=1)

## Null Columns 

In [None]:
df.isnull().sum()

In [None]:
df.loc[:,df.isnull().sum()>0].columns


## Theme column

In [None]:
df.theme.value_counts()

The theme column has only one value, *summer*, so it is of no use for analysis or model prediction. Let's drop this column.


In [None]:
df.drop('theme', axis=1, inplace=True)

## Title and Title_orig column

The title and title_orig columns share the same value; for our case, let's use the one with the English title, i.e. title_orig, and drop the title column.


In [None]:
df.drop("title", axis=1, inplace=True)

## Urgency Text and Urgency Banner

Both columns have a very large number of null values, so let's drop them.


In [None]:

df.drop(['urgency_text','has_urgency_banner'], inplace=True,axis=1)

Fill the remaining null values with the placeholder "unknown".

In [None]:
# Only text (object-dtype) columns get the "unknown" placeholder. Filling the
# whole frame would also write that string into any numeric column that has
# nulls, silently converting the column to object dtype.
# NOTE(review): nulls in numeric columns are left as NaN here — confirm whether
# they should be imputed (e.g. with 0) before those columns are used.
text_columns = df.select_dtypes(include="object").columns
df[text_columns] = df[text_columns].fillna("unknown")

## Currency Column

In [None]:
df.currency_buyer.unique()

Since the data was only taken from France, currency is only in euros. Lets remember that and drop the columns

In [None]:
df.drop('currency_buyer', inplace=True, axis=1)

## Crawl Month

In [None]:
df.crawl_month.unique()

Looks like the crawl month is only August, so let's drop this column too.

In [None]:
df.drop('crawl_month', inplace=True, axis=1)

## Badges Columns

In [None]:
df.loc[:,df.columns.str.startswith('badge')].columns

Lets convert **'badge_local_product', 'badge_product_quality', 'badge_fast_shipping'** into categorical values.

In [None]:
# Store the 0/1 badge flags as strings so they are treated as categories, not numbers.
badge_flag_columns = ['badge_local_product', 'badge_product_quality', 'badge_fast_shipping']
df[badge_flag_columns] = df[badge_flag_columns].astype(str)

# EDA 

lets do some explorations via visualizations

## Origin Country

In [None]:
eda_df = df.copy()

## Rename country columns for clear meaning

In [None]:
# Map country codes to readable names using exact, whole-value matching.
# Chained .str.replace() calls perform substring (and, depending on the pandas
# version, regex) replacement, so a later call could rewrite part of a name
# produced by an earlier one; the "unknown" -> "unknown" step was also a no-op.
# Values that are not in the mapping (e.g. "unknown") are left unchanged.
country_names = {
    "CN": "China",
    "US": "United States of America",
    "VE": "Venezuela",
    "GB": "Great Britain",
    "SG": "Singapore",
    "AT": "Austria",
}
eda_df.origin_country = eda_df.origin_country.replace(country_names)
    

In [None]:
# Count products per origin country once; the index supplies the slice labels
# and the counts supply the slice sizes.
origin_counts = eda_df.origin_country.value_counts()
labels = origin_counts.index.values
values = origin_counts.values

# Create Pie Chart
fig = go.Figure()
fig.add_trace(go.Pie(labels=labels, values=values))
fig.update_layout(
    title="Country of Origin of Product in Wish",
    legend_title="Countries",
    template="plotly_dark",
)



Seems the products mostly originate from China.

In [None]:
# "discounted_price" is the absolute discount: retail_price minus the selling price.
eda_df['discounted_price'] = eda_df['retail_price'].sub(eda_df['price'])

# Average selling price, discount and retail price for each origin country.
price_columns = ['price', 'discounted_price', 'retail_price']
prices_by_country = eda_df.groupby('origin_country')[price_columns].mean()

In [None]:
# All three traces share the same x axis: one entry per origin country.
countries = prices_by_country.index.values

fig = go.Figure()
fig.add_trace(go.Bar(x=countries, y=prices_by_country.price, name="Price"))
fig.add_trace(go.Scatter(x=countries, y=prices_by_country.discounted_price, name="Discounted Price"))
fig.add_trace(go.Bar(x=countries, y=prices_by_country.retail_price, name="Retail Price"))
fig.update_layout(
    title="Prices Categories By Country",
    xaxis_title="Countries",
    yaxis_title="Avg Discount Prices",
    template="plotly_dark",
    legend_title="Legend",
)


The graph shows that there are heavy discounts on products from Venezuela, as displayed by the red line, with an average of around 27 euros.

Surprisingly, in August 2020, when the data was taken, selling prices were higher than retail prices for products from countries like **Austria, GB and Singapore**. Products from China, where most of the products in the data come from, are sold at an average of around 8.5 euros, with discounts averaging around 14 euros.



## Prices In China 

In [None]:
eda_df[eda_df.origin_country=="China"]['price'].describe()

About 75% of products coming from China are near 10 euros.

In [None]:
# Histogram of selling prices for products whose origin country is China.
layout = dict(title="Selling Price Ranges In China", xaxis_title="Prices", yaxis_title="Frequency")
china_prices = eda_df.loc[eda_df.origin_country == "China", 'price']
china_prices.iplot(kind="hist", bins=50, layout=layout)

In [None]:
eda_df[eda_df.origin_country=="China"]['retail_price'].describe()

In [None]:
# Histogram of retail (reference) prices for products whose origin country is China.
layout = dict(title="Original Price Ranges In China", xaxis_title="Prices", yaxis_title="Frequency")
china_retail_prices = eda_df.loc[eda_df.origin_country == "China", 'retail_price']
china_retail_prices.iplot(kind="hist", layout=layout)

## Shipping Options and Prices

In [None]:
eda_df.loc[:,eda_df.columns.str.startswith("shipping")].columns

In [None]:
eda_df['shipping_option_name'].value_counts()

**Livraison standard** is quite a popular shipping option. Let's check the prices of this option.

In [None]:

# Frequency of each shipping price among rows that use "Livraison standard".
is_standard_shipping = eda_df.shipping_option_name == 'Livraison standard'
standard_price_counts = eda_df.loc[is_standard_shipping, 'shipping_option_price'].value_counts()
livrasion_prices = standard_price_counts.index.values
livrasion_prices_frquency = standard_price_counts.values

fig = go.Figure()
fig.add_trace(go.Pie(labels=livrasion_prices, values=livrasion_prices_frquency))
fig.update_layout(
    title="Livrasion Standard Prices",
    legend_title="Prices In Euros",
    template="plotly_dark",
)



Most customers choose shipping options from 1-3 euros. 

In [None]:
eda_df['shipping_is_express'].value_counts()

Almost all the shipping is not express

In [None]:
eda_df.info()

## Products and Sales

Lets try and make a small df thats related to product and their sales.

In [None]:
product_cat_columns = eda_df.loc[:,eda_df.columns.str.startswith("product")].columns.values


In [None]:
eda_df[product_cat_columns].info()

In [None]:
eda_df[product_cat_columns].head()

Lets drop links 

In [None]:
# The picture and URL columns are plain links; remove them from both frames.
link_columns = ['product_picture', 'product_url']
for frame in (df, eda_df):
    frame.drop(link_columns, inplace=True, axis=1)

In [None]:
# Product/sales subset, best sellers first (ties broken by badge count).
product_sales_columns = [
    'tags', 'price', 'units_sold', 'rating', 'rating_count',
    'product_id', 'badges_count', 'badge_product_quality',
]
eda_df_products = (
    eda_df[product_sales_columns]
    .copy()
    .sort_values(['units_sold', 'badges_count'], ascending=False)
)

# Same rows, indexed by product id for plotting and lookups.
eda_df_products_by_id = eda_df_products.set_index('product_id')

In [None]:
eda_df_products_by_id.head()

If you look at the top-selling products and their respective badge counts, there does not seem to be a positive correlation; however, units_sold says nothing about the months covered, product release dates, and so on.

The top 6 products sold are 100k while others are at 50k, so thats a massive difference. 

In [None]:
# Top 10 products sold for women
eda_df_products.loc[eda_df_products.tags.str.contains('[Ww]omen')].head(10).index

In [None]:
# Top 10 products in general
eda_df_products.head(10).index 

The index is same for both in general and women products, so top buyers are ladies or for ladies in wish.

Due to the huge variation in units sold, plotting without normalizing was not very helpful. So, let's first normalize and then plot.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every column except the first one (tags) to the [0, 1] range so that
# features with very different magnitudes can share one plot.
scaler = MinMaxScaler()
eda_df_products_by_id_norm = eda_df_products_by_id.copy()
columns_to_scale = eda_df_products_by_id_norm.iloc[:, 1:]
eda_df_products_by_id_norm.iloc[:, 1:] = scaler.fit_transform(columns_to_scale)

In [None]:
# Take the 20 best-selling (normalised) products once and reuse them for every trace.
top20 = eda_df_products_by_id_norm.head(20)
top20_ids = top20.index.values

fig = go.Figure()
fig.add_trace(go.Bar(x=top20_ids, y=top20.units_sold, name="Units Sold"))
fig.add_trace(go.Scatter(x=top20_ids, y=top20.price, mode="lines+markers", name="Price"))
fig.add_trace(go.Scatter(x=top20_ids, y=top20.rating_count, mode="lines+markers", name="Rating Counts"))
fig.add_trace(go.Scatter(x=top20_ids, y=top20.rating, mode="lines+markers", name="Avg. Rating"))

fig.update_layout(title="Top 20 Products Sold", legend_title="Features")

In [None]:
eda_df_products_by_id.head(20).describe()

In [None]:
eda_df_products_by_id.head(10)

From the plot and description, 75% of the products cost less than 10 euros and the products have an average rating of 3.8.

## Discounts, Ratings and Sales

We'll first select some columns that seem more relevant to the context. After that, let's bin the units sold in steps of 1k to get a better grasp of the sales ranges, and then analyse them.

In [None]:
# Take an explicit copy: a later cell adds a column to this frame, and writing
# to a bare slice of eda_df triggers pandas' SettingWithCopyWarning.
dis_rat_slaes = eda_df[['rating', 'product_id', 'units_sold', 'price','discounted_price']].copy()
dis_rat_slaes.set_index('product_id').head()

In [None]:
# Bin edges every 1000 units from 0 up to and including 101000.
bins_per_1k = list(range(0, 101001, 1000))
# One label per bin, named after its upper edge in thousands (1000 -> "1k's").
labels_bins_per_1k = ["{}k's".format(str(upper_edge)[:-3]) for upper_edge in bins_per_1k[1:]]

In [None]:
bins_per_1k_units = pd.cut(dis_rat_slaes.units_sold,bins_per_1k, labels=labels_bins_per_1k )

In [None]:
dis_rat_slaes['bins_per_1k_units'] = bins_per_1k_units

In [None]:
dis_rat_slaes.head()

In [None]:
# Average only the numeric columns per sales bin. product_id is a string
# column, and taking its mean raises a TypeError on current pandas versions.
# observed=False keeps every bin, including empty ones (all-NaN rows).
numeric_columns = ['rating', 'units_sold', 'price', 'discounted_price']
dis_rat_slaes_per_1k_units_sold = (
    dis_rat_slaes
    .groupby('bins_per_1k_units', observed=False)[numeric_columns]
    .mean()
)

dis_rat_slaes_per_1k_units_sold

Let's drop the rows that are entirely NaN (bins with no products) since they are not very helpful.

In [None]:
dis_rat_slaes_per_1k_units_sold.dropna(how='all', inplace=True, axis=0)

In [None]:
dis_rat_slaes_per_1k_units_sold

In [None]:
# Plot the per-bin averages: price and rating as bars, discount as a line.
per_bin = dis_rat_slaes_per_1k_units_sold
bin_labels = per_bin.index.values

fig = go.Figure()
fig.add_trace(go.Bar(x=bin_labels, y=per_bin.price, name="Price"))
fig.add_trace(go.Scatter(x=bin_labels, y=per_bin.discounted_price, mode="lines+markers", name="Discounted Price"))
fig.add_trace(go.Bar(x=bin_labels, y=per_bin.rating, name="Avg. Rating"))

fig.update_layout(
    title="Product Sales Per 1k Bins",
    legend_title="Features",
    xaxis_title="Units Sold",
    yaxis_title="Avg Values per 1000",
)

Some important information like product's age on website, release date and so on are not available on data. Hence, assuming or ignoring those facts, it seems higher the discount price more likely is the sale of product to be higher. 

On average of 1000 (missing k's in xaxis were null, i.e no units sold in that range.), the avg, customer rating has not changed much, ranging around 3.8 mostly.

While as for price, item's sold from 50k-100k are cheaper  than lesser sold items on average by 2 euros.

## Correlation Heatmap

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def customized_heatmap(corr_df):
    """Draw an annotated heatmap of a correlation table, hiding duplicates.

    The first row and the last column of ``corr_df`` are discarded, the cells
    strictly above the diagonal of what remains are masked out, and the
    annotation of every cell whose value lies strictly between -0.4 and 0.4
    is blanked so that only the stronger associations stay labelled.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Table of pairwise correlation/association values.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    trimmed = corr_df.iloc[1:, :-1].copy()

    # Mask the upper triangle so each pair of columns is shown only once.
    upper_mask = np.triu(np.ones_like(trimmed), k=1)

    plt.figure(figsize=(20, 14))
    plt.title("Heatmap Corrleation")
    ax = sns.heatmap(trimmed, vmin=-1, vmax=1, cbar=False,
                     cmap='rainbow', mask=upper_mask, annot=True)

    # Blank weak values and round the rest to keep the plot readable.
    for label in ax.texts:
        value = float(label.get_text())
        is_weak = -0.4 < value < 0.4
        label.set_text('' if is_weak else round(value, 2))
        label.set_fontsize('x-large')

    plt.xticks(size='x-large')
    plt.yticks(rotation=0, size='x-large')
    plt.show()

In [None]:
!pip install dython

In [None]:
# Import dython to check correlations
from dython.nominal import associations


In [None]:
assoc = associations(eda_df,plot=False)
corr_eda_df_dython = assoc['corr']


In [None]:
customized_heatmap(corr_eda_df_dython)

# Preprocessing

In [None]:
preprocess_df = eda_df.copy()

Lets first deal with colinear columns 

## Ratings

As the heatmap shows, the rating columns are highly collinear with each other, as expected. Since the "rating" column is the average of the individual star-rating columns, let's drop the one-star to five-star count columns. And since rating is the column we want to predict, let's convert the average rating into a category with low, medium and high levels according to the following thresholds.

rating < 2.5 = low, 2.5 <= rating < 3.75 = medium, rating >= 3.75 = high

In [None]:
preprocess_df.loc[:,preprocess_df.columns.str.startswith("rating")].columns

In [None]:
preprocess_df.drop([ 'rating_five_count', 'rating_four_count','rating_three_count','rating_two_count', 'rating_one_count'], axis=1, inplace=True)

In [None]:
def five_rating_to_level_rating(val):
    """Bucket a mean star rating into "low", "medium" or "high".

    Thresholds:
        val < 2.5           -> "low"
        2.5 <= val < 3.75   -> "medium"
        otherwise           -> "high"

    The previous condition ``2.5 >= val < 3.75`` was a mis-written chained
    comparison that was only true for exactly 2.5, so almost every rating of
    2.5 or more was labelled "high".

    Parameters
    ----------
    val : float
        Mean rating of a product.

    Returns
    -------
    str
        One of "low", "medium", "high". A NaN input compares false against
        both thresholds and therefore falls through to "high".
    """
    if val < 2.5:
        return "low"
    if val < 3.75:
        return "medium"
    return "high"
    
    

In [None]:
ratings = preprocess_df.rating.apply(five_rating_to_level_rating)

In [None]:
ratings.value_counts()

More on the dealing with class imbalances later below.

In [None]:

preprocess_df.rating = ratings

## IDs

Columns with ids will mislead our algorithms so lets drop them

In [None]:
preprocess_df.drop(['merchant_id', 'product_id'],axis=1, inplace=True)

## Origin Country, Shipping Names

In each of these columns a single value accounts for about 98% of the rows. Moreover, they have very low correlation with the other columns. Let's drop these two columns.

In [None]:
preprocess_df.drop(['origin_country', 'shipping_option_name'],axis=1, inplace=True)

In [None]:
preprocess_df.columns

## Tags 

Lets refine tags column 

In [None]:
# Lets check the proportion of top 20 tags count 
(uniuqe_categories_count['count'].head(20).sum() / uniuqe_categories_count['count'].sum())*100

Since the top 20 tags make up 41% of all tags, let's build a bag-of-words list from those top 20 tags.


In [None]:
bag_of_words =uniuqe_categories_count.keyword.head(20).str.lower().tolist()
# bag_of_words_reg_pattern =["\\b{}\\b".format(word) for word in bag_of_words]
# bag_of_words_reg_pattern_str =  "|".join(bag_of_words_reg_pattern)

bag_of_words

1. First replace uppercases with lowercases
2. Create separate columns with top 20 tags we created earlier. Then drop tags columns, also title_orig

In [None]:
# Lower-case the tag strings once instead of on every loop iteration.
tags_lower = preprocess_df.tags.str.lower()

for word in bag_of_words:
    # Flag rows whose tag string contains the keyword: 1 if present, else 0.
    # regex=False makes this a literal substring test, so a keyword is never
    # interpreted as a regular-expression pattern.
    # The flag is stored as a string so it is treated as a categorical value
    # when dummy variables are created later.
    preprocess_df["tag_" + word] = (
        tags_lower.str.contains(word, regex=False).astype(int).astype(str)
    )

In [None]:
preprocess_df.drop(['title_orig','tags'],axis=1,inplace=True)

## Product Color

The product color has a positive correlation with total inventory and shipping price, but this correlation does not make sense. Let's not use this column for prediction.


In [None]:
preprocess_df.drop('product_color', axis=1, inplace=True)

The discounted_price column was created from the retail_price and price columns, so let's not use it.

In [None]:
preprocess_df.drop('discounted_price', axis=1, inplace=True)

# Modelling

## Classification

In [None]:
final_df = preprocess_df.copy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier as DC

In [None]:
# One-hot encode the predictors only. `rating` holds the low/medium/high
# target labels at this point; passing it through get_dummies would create
# rating_* indicator columns that stay in the feature matrix and leak the
# target to the model.
final_df_dummified = pd.get_dummies(final_df.drop(columns='rating'), drop_first=True)
# Re-attach the untouched target column.
final_df_dummified['rating'] = final_df['rating']

## Dealing With Dependent Class Imbalances

In [None]:
# Size of each target class, counted once and reused for labels and values.
rating_class_counts = preprocess_df.rating.value_counts()
dependent_classes_labels = rating_class_counts.index.values
dependent_classes_values = rating_class_counts.values

fig = go.Figure()
fig.add_trace(go.Pie(labels=dependent_classes_labels, values=dependent_classes_values))
fig.update_layout(
    title="Imbalances in Dependent Classes",
    legend_title="Target Classes",
    template="plotly_dark",
)

One of the classes is highly dominant. This can cause the model to be biased. Hence, let's try to fix this issue using oversampling. I am doing it last because SMOTE needs all-numeric or dummified data.

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Features are every column except the target; the target is the rating level.
X = final_df_dummified.drop(columns='rating')
y = final_df_dummified['rating']

In [None]:
sm = SMOTE(sampling_strategy= 'not majority', random_state=101,k_neighbors=2)

X_res,y_res = sm.fit_resample(X,y)

In [None]:
y_res.value_counts()

Now, the classes are balanced.

### Split 

In [None]:
# Split the SMOTE-resampled data into train and test parts (default test size).
# NOTE(review): the oversampling ran before this split, so synthetic rows
# interpolated from rows that land in the test part can end up in the training
# part and inflate the scores. Consider splitting first and resampling only
# the training data.
X_train, X_test, y_train, y_test = train_test_split(X_res,y_res,random_state=101)

In [None]:
X_holdout, X_test_final, y_holdout, y_test_final = train_test_split(X_test, y_test,random_state=101)

In [None]:
# Both estimators are randomised; fix their seed (same value as the split and
# SMOTE seeds) so accuracies and timings are reproducible between runs.
pipe = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier(random_state=101))])

pipe2 = Pipeline([('scaler_2', StandardScaler()), ('dc', DC(random_state=101))])

Lets calculate their time as well.

In [None]:
import time

In [None]:
# Time the random-forest fit. perf_counter() is a monotonic high-resolution
# clock meant for measuring intervals; time.time() follows the wall clock and
# can jump if the system clock is adjusted.
rf_start = time.perf_counter()
pipe.fit(X_train,y_train)
rf_end = time.perf_counter()
eval_time_rf = rf_end - rf_start


In [None]:
# Time the decision-tree fit. perf_counter() is a monotonic high-resolution
# clock meant for measuring intervals; time.time() follows the wall clock and
# can jump if the system clock is adjusted.
dc_start = time.perf_counter()
pipe2.fit(X_train,y_train)
dc_end = time.perf_counter()
eval_time_dc = dc_end - dc_start


In [None]:
# Time random-forest prediction on the test split. perf_counter() is a
# monotonic high-resolution clock meant for measuring intervals.
rf_start_pred = time.perf_counter()
pipe.predict(X_test)
rf_end_pred = time.perf_counter()
eval_time_rf_pred = rf_end_pred - rf_start_pred


In [None]:
# Time decision-tree prediction on the test split. perf_counter() is a
# monotonic high-resolution clock meant for measuring intervals.
dc_start_pred = time.perf_counter()
pipe2.predict(X_test)
dc_end_pred = time.perf_counter()
eval_time_dc_pred = dc_end_pred - dc_start_pred


In [None]:
# Accuracy (in percent) of both models on the validation part of the test split.
print(f"Accuracy For Random forest on Validation Set: {pipe.score(X_holdout, y_holdout) * 100}.")

print(f"Accuracy For Decision tree on Validation Set: {pipe2.score(X_holdout, y_holdout) * 100}.")

In [None]:
# Accuracy (in percent) of both models on the final test part.
print(f"Accuracy For Random forest on Test Set: {pipe.score(X_test_final, y_test_final) * 100}.")

print(f"Accuracy For Decision tree on Test Set : {pipe2.score(X_test_final, y_test_final) * 100}.")

In [None]:
# Fit durations measured above, rounded to two decimals.
print(f"Total time taken by RF to fit the model: {eval_time_rf:.2f} sec")
print(f"Total time taken by Decision Tree to fit the model: {eval_time_dc:.2f} sec")

In [None]:
# Prediction durations measured above, rounded to two decimals.
print(f"Total time taken by RF to predict the test set: {eval_time_rf_pred:.2f} sec")
print(f"Total time taken by Decision Tree to predict the test set: {eval_time_dc_pred:.2f} sec")

Decision Tree seems to be doing better than Random Forest both in accuracy and time taken.