## Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv('../input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv')
df.head()

In [None]:
product_cat = pd.read_csv('../input/summer-products-and-sales-in-ecommerce-wish/unique-categories.sorted-by-count.csv')
product_cat.head()

In [None]:
product_cat_results = product_cat.groupby('keyword')['count'].mean().sort_values(ascending=False)
product_cat_results.head(60)

In [None]:
df.info()

## Look for any null values

In [None]:
# Draw a missingness matrix for every column of df so gaps are visible at a glance.
import missingno as msno
msno.matrix(df)

In [None]:
# Summary statistics for the numeric columns.
df.describe()

## Features:
1. title: Title localized for European countries. May be the same as title_orig if the seller did not offer a translation
2. title_orig: Original English title of the product
3. price: price for buyer
4. retail_price: retail price, or reference price in other stores/places. Used by the seller to indicate a regular value or the price before discount.
5. currency_buyer
6. units_sold: Number of units sold. Lower bound approximation by steps
7. uses_ad_boosts: Whether the seller paid to boost his product within the platform (highlighting, better placement or whatever)
8. rating: Mean product rating
9. rating_count: Total number of ratings of the product
10. rating_five_count: Number of 5-star ratings
11. rating_four_count: Number of 4-star ratings
12. rating_three_count: Number of 3-star ratings
13. rating_two_count: Number of 2-star ratings
14. rating_one_count: Number of 1-star ratings
15. badges_count: number of badges the product or seller have
16. badge_local_product: A badge that denotes the product is a local product. Conditions may vary (being produced locally, or something else). Some people may prefer buying local products rather than imported ones. 1 means Yes, has the badge
17. badge_product_quality: Badge awarded when many buyers consistently gave good evaluations 1 means Yes, has the badge
18. badge_fast_shipping: Badge awarded when this product's order is consistently shipped rapidly
19. tags: tags set by the seller
20. product_color: Product's main color
21. product_variation_size_id: One of the available size variation for this product
22. product_variation_inventory: Inventory the seller has. Max allowed quantity is 50
23. shipping_option_name
24. shipping_option_price: shipping price
25. shipping_is_express: whether the shipping is express or not. 1 for True
26. countries_shipped_to: Number of countries this product is shipped to. Sellers may choose to limit where they ship a product to
27. inventory_total: Total inventory for all the product's variations (size/color variations for instance)
28. has_urgency_banner: Whether an urgency banner (with an urgency text) was shown for the product
29. urgency_text: A text banner that appears over some products in the search results.
30. origin_country
31. merchant_title: Merchant's displayed name (show in the UI as the seller's shop name)
32. merchant_name: Merchant's canonical name. A name not shown publicly. Used by the website under the hood as a canonical name. Easier to process since all lowercase without white space
33. merchant_info_subtitle: The subtitle text as shown on a seller's info section to the user. (raw, not preprocessed). The website shows this to the user to give an overview of the seller's stats to the user. Mostly consists of "% <positive_feedbacks> (<rating_count> reviews)" written in french
34. merchant_rating_count: Number of ratings of this seller
35. merchant_rating: merchant's rating
36. merchant_id: merchant unique id
37. merchant_has_profile_picture: Convenience boolean that says whether there is a "merchant_profile_picture" url
38. merchant_profile_picture: Custom profile picture of the seller (if the seller has one). Empty otherwise.
39. product_url: url to the product page. You may need to login to access it
40. product_picture
41. product_id: product identifier. You can use this key to remove duplicate entries if you're not interested in studying them.
42. theme: the search term used in the search bar of the website to get these search results.
43. crawl_month: meta for info only.

## Remove duplicate values

In [None]:
## Duplicate values

duplicate_series = df['product_id'].duplicated(keep='first') 
df[duplicate_series]

In [None]:
df.drop_duplicates(subset ="product_id", keep = 'first', inplace = True)

In [None]:
df.isnull().sum()

## Target Feature: Units sold

In [None]:
# Frequency of each distinct units_sold value, followed by its histogram.
print(df['units_sold'].value_counts())
df['units_sold'].hist();

#### Very few products have fewer than 10 units sold, so they will be combined with the "10" units-sold group.

## Separate Categorical and Numeric features

In [None]:
df_num = df[['price', 'retail_price', 'units_sold', 'rating_count', 'rating_five_count', 'rating_four_count',
             'rating_three_count', 'rating_two_count', 'rating_one_count', 'badges_count', 'product_variation_inventory', 
             'inventory_total', 'shipping_option_price', 'countries_shipped_to', 'merchant_rating_count']]

df_cat = df[['title', 'title_orig','currency_buyer','uses_ad_boosts', 'rating','badge_local_product', 
             'badge_product_quality', 'badge_fast_shipping','tags', 'product_color', 'product_variation_size_id',
             'shipping_option_name', 'shipping_is_express', 'countries_shipped_to','has_urgency_banner', 
             'urgency_text', 'origin_country', 'merchant_title', 'merchant_name', 'merchant_info_subtitle',
             'merchant_rating','merchant_id', 'merchant_has_profile_picture','product_url', 'product_picture',
             'product_id', 'theme', 'crawl_month']]

In [None]:
df_cat.head()

In [None]:
df_num.head()

## Visualize correlation with numerical features

In [None]:
corr_matrix = df_num.corr()
fig = plt.figure(figsize=(19, 15))
sns.heatmap(corr_matrix, annot=True);

In [None]:
sns.pairplot(data=df_num,
                  x_vars=['price', 'retail_price', 'rating_count', 'badges_count', 'inventory_total'],
                  y_vars=['units_sold']);

In [None]:
sns.pairplot(data=df_num,
                  x_vars=['product_variation_inventory', 'shipping_option_price',
                          'countries_shipped_to', 'merchant_rating_count' ],
                  y_vars=['units_sold']);

## Distribution of numerical features

In [None]:
df_num.columns

In [None]:
def hist_num(x):
    print(df_num[x].value_counts())
    df_num[x].hist()

In [None]:
# One histogram per numeric feature. units_sold is left out here because the
# target was already inspected in its own section above.
for numeric_column in ['price', 'retail_price', 'rating_count', 'rating_five_count',
                       'rating_four_count', 'rating_three_count', 'rating_two_count',
                       'rating_one_count', 'badges_count', 'product_variation_inventory',
                       'inventory_total', 'shipping_option_price', 'countries_shipped_to',
                       'merchant_rating_count']:
    hist_num(numeric_column)
    # Flush the current figure so the next histogram starts on fresh axes.
    plt.show()

## Distribution in categorical features

In [None]:
for i in df_cat.columns[2:8]:
    cat_num = df_cat[i].value_counts()
    print("graph for %s: total = %d" % (i, len(cat_num)))
    chart = sns.barplot(x=cat_num.index, y=cat_num)
    chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
    plt.show()

In [None]:
for i in df_cat[['product_color', 'product_variation_size_id','shipping_option_name', 'shipping_is_express', 
                'countries_shipped_to', 'origin_country','merchant_has_profile_picture','theme']]:
    cat_num = df_cat[i].value_counts()
    print("graph for %s: total = %d" % (i, len(cat_num)))
    chart = sns.barplot(x=cat_num.index, y=cat_num)
    chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
    plt.show()

In [None]:
df_cat.columns

## Correlation of categories with units sold

In [None]:
df.groupby('uses_ad_boosts')['units_sold'].mean()

In [None]:
rating_results = df.groupby('rating')['units_sold'].mean().sort_values(ascending=False)
rating_results.head(15)

In [None]:
df.groupby('badge_local_product')['units_sold'].mean().sort_values(ascending=False)

In [None]:
df.groupby('badge_product_quality')['units_sold'].mean().sort_values(ascending=False)

In [None]:
df.groupby('badge_fast_shipping')['units_sold'].mean().sort_values(ascending=False)

In [None]:
color_results = df.groupby('product_color')['units_sold'].mean().sort_values(ascending=False)
color_results.head(30)

In [None]:
variation_size_id_results = df.groupby('product_variation_size_id')['units_sold'].mean().sort_values(ascending=False)
variation_size_id_results.head(20)

In [None]:
df.groupby('shipping_option_name')['units_sold'].mean().sort_values(ascending=False)

In [None]:
for col in df[['countries_shipped_to','has_urgency_banner', 'urgency_text', 'origin_country','merchant_title', 
               'merchant_name', 'merchant_info_subtitle','merchant_rating', 'merchant_id', 'merchant_has_profile_picture',
               'merchant_profile_picture', 'product_url', 'product_picture','product_id']]:
    result = df.groupby(col)['units_sold'].mean().sort_values(ascending=False)
    print(result.head(20))

## Filling in the null values

In [None]:
df.isnull().sum()

In [None]:
df.loc[(df['rating_five_count'].isnull()) & (df['rating_four_count'].isnull()) 
       & (df['rating_three_count'].isnull()) & (df['rating_two_count'].isnull()) 
       & (df['rating_one_count'].isnull()), 'rating'] = 0

#### All the rows with missing rating counts are the same

In [None]:
df.loc[df['rating'] == 0, 'rating_five_count'] = 0
df.loc[df['rating'] == 0, 'rating_four_count'] = 0
df.loc[df['rating'] == 0, 'rating_three_count'] = 0
df.loc[df['rating'] == 0, 'rating_two_count'] = 0
df.loc[df['rating'] == 0, 'rating_one_count'] = 0

In [None]:
df['rating'].hist()

In [None]:
df.loc[df['has_urgency_banner'].isnull(), 'has_urgency_banner'] = 0

In [None]:
df.groupby('has_urgency_banner')['units_sold'].mean()

In [None]:
df['urgency_text'].value_counts()

In [None]:
df.loc[df['urgency_text'].isnull(), 'urgency_text'] = 'No Text'

In [None]:
df.isnull().sum()

In [None]:
df.groupby('urgency_text')['units_sold'].mean()

## Add new features (men's or women's fashion), simplified title, and color

In [None]:
# Most frequent tag keywords: candidates for the title categories defined below.
product_cat_results.head(60)

In [None]:
# Least frequent tag keywords, for comparison.
product_cat_results.tail(60)

In [None]:
# Ordered (category, keywords) rules. The first rule with a keyword occurring
# anywhere in the lower-cased title wins, so the order is significant.
# Keywords that were already implied by a shorter one in the same rule
# ('swimsuit' by 'swim', 'sweatshirt' by 'shirt', 'tank top' by 'top',
# 'sleepwear' by 'sleep', ...) are omitted; the result is the same.
_TITLE_RULES = (
    ('Dress', ('dress', 'halter')),
    ('Swimwear', ('swim', 'bikini', 'tankini', 'beach')),
    ('Pants', ('pant', 'legging', 'jean', 'trouser')),
    ('Shorts', ('short',)),
    ('Skirt', ('skirt',)),
    ('Top', ('top', 'blouse', 'shirt', 'sweater', 'vest')),
    ('Sportswear', ('sport', 'yoga', 'fitness', 'running', 'athletic')),
    ('Onepiece', ('romper', 'jumpsuit', 'overalls', 'bodysuit')),
    ('Footwear', ('shoe', 'slipper', 'sneaker')),
    ('Sleepwear', ('pajama', 'pyjama', 'sleep')),
)


def title_simplifier(title):
    """Map a free-text product title to a coarse garment category.

    Parameters
    ----------
    title : str
        Product title. Matching is case-insensitive and substring based.

    Returns
    -------
    str
        The category of the first matching rule in ``_TITLE_RULES``, or
        'Accessories' when nothing matches. Non-string input (e.g. NaN from a
        missing title) also returns 'Accessories' instead of raising.
    """
    if not isinstance(title, str):
        return 'Accessories'

    lowered = title.lower()  # lower-case once instead of once per keyword
    # NOTE(review): matching is by substring, so 'short' also matches titles
    # such as "short sleeve shirt" and classifies them as 'Shorts'. This is
    # kept as-is to preserve the existing feature values; confirm whether
    # that is intended.
    for category, keywords in _TITLE_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Accessories'

In [None]:
df['title_simple'] = df['title_orig'].apply(lambda x: title_simplifier(x))

In [None]:
df['title_simple'].value_counts()

In [None]:
df.groupby('title_simple')['units_sold'].mean().sort_values(ascending=False)

In [None]:
def fashion_category(title):
    """Label a product title as women's or men's fashion.

    A title containing 'women' (case-insensitive, anywhere in the text) is
    labelled "Women's Fashion"; every other title is labelled "Men's Fashion".
    """
    # "women's fashion" and "women fashion" both contain 'women', so a single
    # substring test covers all three original conditions.
    mentions_women = 'women' in title.lower()
    return "Women's Fashion" if mentions_women else "Men's Fashion"

In [None]:
df['product_category'] = df['title_orig'].apply(lambda x: fashion_category(x))

In [None]:
df['product_category'].value_counts()

In [None]:
color_results.tail(60)

In [None]:
# Ordered (colour family, keywords) rules. The first rule with a keyword
# occurring anywhere in the lower-cased colour name wins, so the order is
# significant (e.g. 'orange-red' is classified as 'red' because 'red' is
# tested before 'orange').
_COLOR_RULES = (
    ('two-colors', ('&',)),
    ('green', ('green', 'army')),
    ('blue', ('navy', 'blue')),
    ('red', ('burgundy', 'red', 'wine')),
    ('pink', ('rosegold', 'pink')),
    ('white', ('white',)),
    ('black', ('black',)),
    ('grey', ('grey', 'gray')),
    ('yellow', ('yellow',)),
    ('orange', ('orange',)),
    ('beige', ('khaki', 'beige')),
    ('multicolor', ('multicolor', 'rainbow')),
    ('brown', ('brown', 'tan', 'camel', 'coffee')),
    ('violet', ('violet',)),
)


def color_simplify(title):
    """Collapse a raw product colour name into a small set of colour families.

    Parameters
    ----------
    title : str
        Raw colour name. Matching is case-insensitive and substring based.

    Returns
    -------
    str
        The family of the first matching rule in ``_COLOR_RULES``, or 'others'
        when nothing matches. Non-string input (e.g. NaN for a missing colour)
        also returns 'others' instead of raising.
    """
    if not isinstance(title, str):
        return 'others'

    lowered = title.lower()  # lower-case once instead of once per keyword
    for family, keywords in _COLOR_RULES:
        if any(keyword in lowered for keyword in keywords):
            return family
    return 'others'

In [None]:
df.loc[df['product_color'].isnull(), 'product_color'] = 'others'

In [None]:
df['color_simple'] = df['product_color'].apply(lambda x: color_simplify(x))

In [None]:
df['color_simple'].value_counts()

In [None]:
df.groupby('color_simple')['units_sold'].mean().sort_values(ascending=False)

## Adjust the distribution of origin countries and units sold

In [None]:
df['origin_country'] = df['origin_country'].replace(np.nan, 'Other')
df['origin_country'] = df['origin_country'].replace('VE', 'Other')
df['origin_country'] = df['origin_country'].replace('SG', 'Other')
df['origin_country'] = df['origin_country'].replace('GB', 'Other')
df['origin_country'] = df['origin_country'].replace('AT', 'Other')

In [None]:
df['origin_country'].value_counts()

In [None]:
df['units_sold'] = df['units_sold'].replace(1, 10)
df['units_sold'] = df['units_sold'].replace(8, 10)
df['units_sold'] = df['units_sold'].replace(7, 10)
df['units_sold'] = df['units_sold'].replace(3, 10)
df['units_sold'] = df['units_sold'].replace(2, 10)
df['units_sold'] = df['units_sold'].replace(6, 10)

In [None]:
df['units_sold'].value_counts()

## Choose columns to fit into model

In [None]:
predictors = ['price', 'retail_price', 'rating', 'rating_count', 'rating_five_count',
              'rating_four_count','rating_three_count', 'rating_two_count', 'rating_one_count',
              'badges_count', 'product_variation_inventory', 'shipping_option_price',
              'countries_shipped_to', 'inventory_total', 'merchant_rating_count', 'merchant_rating',
              'units_sold', 'uses_ad_boosts','badge_local_product', 'badge_product_quality', 
              'badge_fast_shipping','shipping_is_express', 'has_urgency_banner',
              'origin_country', 'merchant_has_profile_picture','title_simple', 'product_category', 'color_simple']

In [None]:
df[predictors].isnull().sum()

In [None]:
df_model = df[predictors]

In [None]:
df_model.head()

## Feature Scaling 

In [None]:
df_model.iloc[:,0:16]

## Will use standard scaler to scale the continuous features

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_model_scaled = scaler.fit_transform(df_model.iloc[:,0:16])

In [None]:
df_model_scaled

In [None]:
df_model.iloc[:,0:16] = df_model_scaled

In [None]:
df_model.head()

## Model Building
1. Ridge Regression
2. Lasso Regression
3. Elastic Net Regression
4. Random Forest Regression
5. Gradient Boosted Regression

In [None]:
df_dum = pd.get_dummies(df_model, columns=['uses_ad_boosts', 'badge_local_product', 'badge_product_quality',
                                           'badge_fast_shipping', 'shipping_is_express',
                                           'has_urgency_banner', 'origin_country', 'merchant_has_profile_picture',
                                           'title_simple', 'product_category', 'color_simple'])

In [None]:
df_dum.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target as a plain array; everything else is the feature matrix.
y = df_dum['units_sold'].values
X = df_dum.drop(columns='units_sold')

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score

#Linear Regression
ridge_model = Ridge(alpha=1)
ridge_model.fit(X_train, y_train)

In [None]:
np.mean(cross_val_score(ridge_model,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))

In [None]:
#Lasso regression (normalizes sparse data)

lasso_model = Lasso(alpha=0.13)
lasso_model.fit(X_train,y_train)

In [None]:
np.mean(cross_val_score(lasso_model,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))

In [None]:
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train,y_train)

In [None]:
np.mean(cross_val_score(elastic_net,X_train,y_train, scoring = 'neg_mean_absolute_error', cv= 3))

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_model = RandomForestRegressor()

np.mean(cross_val_score(forest_model,X_train,y_train,scoring = 'neg_mean_absolute_error', cv= 3))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

GB_model = GradientBoostingRegressor()
np.mean(cross_val_score(GB_model,X_train,y_train,scoring = 'neg_mean_absolute_error', cv= 3))

Based on the cross-validation scores (negative mean absolute error, so values closer to zero are better), Random Forest performed the best.

## Fit training data into Random forest

In [None]:
# Fit the random forest on the whole training split.
forest_model.fit(X_train,y_train)

In [None]:
# Predictions for the held-out test rows.
test_pred_forest = forest_model.predict(X_test)

## Test Random Forest

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true and predicted units_sold on the test split.
mean_absolute_error(y_test, test_pred_forest)

In [None]:
from sklearn.metrics import r2_score

# R^2 (coefficient of determination) on the same test split.
r2_score(y_test,test_pred_forest)

## Feature Importance
Code from
https://github.com/mrdbourke/your-first-kaggle-submission/blob/master/kaggle-titanic-dataset-example-submission-workflow.ipynb

In [None]:
def feature_importance(model, data, top_n=30):
    """Rank and plot the most important features of a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    data : pandas.DataFrame
        Frame whose ``columns`` line up with ``model.feature_importances_``.
    top_n : int, default 30
        Number of highest-importance features to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'imp' and 'col', sorted by ascending importance (ties broken
        by descending column name) and limited to the ``top_n`` largest, so
        the most important feature is the last row and the top bar of the
        horizontal bar chart.
    """
    fea_imp = (
        pd.DataFrame({'imp': model.feature_importances_, 'col': data.columns})
        .sort_values(['imp', 'col'], ascending=[True, False])
        .tail(top_n)
    )
    _ = fea_imp.plot(kind='barh', x='col', y='imp', figsize=(20, 10))
    return fea_imp

In [None]:
feature_importance(forest_model, X_train)

#### Hope this helps!
I am very new to machine learning so please comment and provide any feedback on how to improve.