# Importing Libraries 

In [None]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

# Setting data frame display 
pd.set_option('display.max_columns', None)

# TASK #1: Reading & Cleaning the Data Frame 

### A. Reading the .csv Data

In [None]:
ecom_ram = pd.read_csv("../input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv")

### B. Analyzing Data Frame Columns

In [None]:
ecom_ram.head(10)

##### **Observation #1.1:** 'title', 'title_orig', 'product_url', 'product_picture' (URL), 'product_variation_size_id' are redundant as 'product_id' also represents the same data.  
##### **Observation #1.2:** 'merchant_name', 'merchant_title', 'merchant_info_subtitle' and 'merchant_profile_picture' (URL) are redundant as 'merchant_id' also represents the same data.

In [None]:
ecom_ram.shape

### C. Finding Unnecessary Columns

#### #1. Finding columns with only one unique value

In [None]:
# Print the name of every column that holds exactly one distinct value.
for col_name, column in ecom_ram.items():
    # nunique() ignores NaN by default, just like value_counts() does.
    distinct_values = column.nunique()
    if distinct_values == 1:
        print(col_name)

##### **Observation #1.3:** Above columns are unnecessary as they have only one unique value

#### #2. Finding columns with very few value counts in other indices (and less than 5 indices)

In [None]:
# Print low-cardinality columns (2 to 4 distinct values) in which the most
# frequent value outnumbers the runner-up by more than 1200 rows.
for col_name in ecom_ram.columns:
    # value_counts() is sorted by descending frequency and excludes NaN;
    # compute it once per column instead of once per use.
    counts = ecom_ram[col_name].value_counts()
    # Requiring at least two distinct values guarantees counts.iloc[1] exists
    # and skips all-NaN columns, which previously raised IndexError.
    if 2 <= len(counts) < 5 and (counts.iloc[0] - counts.iloc[1]) > 1200:
        print(col_name)

In [None]:
ecom_ram['badges_count'].value_counts()

In [None]:
ecom_ram['badge_local_product'].value_counts()

In [None]:
ecom_ram['badge_product_quality'].value_counts()

In [None]:
ecom_ram['badge_fast_shipping'].value_counts()

In [None]:
ecom_ram['shipping_is_express'].value_counts()

##### **Observation #1.4:** Above columns are unnecessary as they possess very few value counts in other indices.

<!-- ### B. Obtaining the Unique Values in Each Object Type Column  -->

### D. Removing Unnecessary Columns

In [None]:
 ecom_ram_clean = ecom_ram.drop(
     columns=[
         # Product descriptors that duplicate 'product_id' (Observation #1.1)
         'title', 'title_orig', 'product_url', 'product_picture',
         'product_variation_size_id',
         # Merchant descriptors that duplicate 'merchant_id' (Observation #1.2)
         'merchant_name', 'merchant_title', 'merchant_info_subtitle',
         'merchant_profile_picture',
         # Flags dominated by a single value (Observation #1.4)
         'badge_local_product', 'badge_product_quality',
         'badge_fast_shipping', 'shipping_is_express',
         # Remaining columns removed in this step
         'currency_buyer', 'shipping_option_name', 'theme', 'tags',
         'urgency_text', 'crawl_month',
     ],
 )

In [None]:
ecom_ram_clean.info()

##### **Observation #1.5:** Columns with object datatype can be converted to 'int' for convenience.
##### **Observation #1.6:** Some columns possess missing values. 

### E.  Converting Object Datatype Columns to Integers  

In [None]:
# Encode the string-valued columns as integer codes.
# fit_transform() learns the distinct values of a column and maps every row to
# its code in one call, replacing the unique()/fit()/transform() stanza that
# was copy-pasted once per column.
# NOTE: the resulting codes are arbitrary identifiers; their numeric order
# carries no meaning.
le = preprocessing.LabelEncoder()
for encoded_col in ['product_color', 'origin_country', 'merchant_id', 'product_id']:
    ecom_ram_clean[encoded_col] = le.fit_transform(ecom_ram_clean[encoded_col])
ecom_ram_clean.head()

##### **#Result:** All object values have been converted to integers.

### F. Looking for Negative Values

In [None]:
(ecom_ram_clean < 0).values.any()

##### **#Result:** There are no negative values in the data frame.

### G. Looking for NaNs (Missing Values)

In [None]:
ecom_ram_clean.isna().sum()

### H. Filling Missing Values

#### #1. Filling 'has_urgency_banner' column first. (with zeros where there is no urgency)

In [None]:
# A missing 'has_urgency_banner' means the product shows no banner: store 0.
fill_value = dict(has_urgency_banner=0)
ecom_ram_clean = ecom_ram_clean.fillna(fill_value)
banner_counts = ecom_ram_clean['has_urgency_banner'].value_counts()
banner_counts

#### #2. Filling rest of the columns with  "most frequent" values

In [None]:
# Replace every remaining NaN with the most frequent value of its column.
imputer = SimpleImputer(strategy="most_frequent")
X = imputer.fit_transform(ecom_ram_clean)
# The imputer hands back a bare array, so re-attach the column labels.
ecom_ram_tr = pd.DataFrame(data=X, columns=ecom_ram_clean.columns)
# Per-column count of values that are still missing.
ecom_ram_tr.isna().sum()

##### **#Result:** All NaNs are filled with suitable values.

# TASK #2: Analysing and Improving Column-Wise Value Distribution

### A. Plotting the column wise distribution of the values   

In [None]:
%matplotlib inline
ecom_ram_tr.hist(bins = 50, figsize=(20,16)) 
plt.show()

##### **Observation #2.1:** 'inventory_total' and 'origin_country' seem to have very few value counts in other indices.   
##### **Observation #2.2:** Many columns seem to possess outliers.

### B: Removing Additional Columns

In [None]:
ecom_ram_tr['inventory_total'].value_counts()

##### **#Observation 2.3:** Very few values in indices other than '50.0' (Total sum of other indices is just 10). This column can be dropped.

In [None]:
ecom_ram_tr['origin_country'].value_counts()

##### **#Observation 2.4:** Very few values in indices other than '1.0' (Total sum of other indices is just 47). This column can be dropped.

In [None]:
# Both columns are dominated by one value (Observations 2.3 and 2.4).
near_constant_columns = ['inventory_total', 'origin_country']
ecom_ram_tr = ecom_ram_tr.drop(columns=near_constant_columns)
ecom_ram_tr.info()

##### **#Result:** The 'inventory_total' and 'origin_country' columns are removed.

### C: Detection and Removal of Outliers from All Columns

#### #1. Declaring Grubbs Test Function for Outlier Detection

In [None]:
def outlier_cols_ram(x, col_name=None, alpha=0.05):
    """Run a two-sided Grubbs test on ``x`` and report whether it has an outlier.

    Parameters
    ----------
    x : pandas.Series or array-like of numbers
        The sample to test.
    col_name : hashable, optional
        Label to return when an outlier is detected. Defaults to ``x.name``,
        i.e. the column label when ``x`` is a DataFrame column.
    alpha : float, optional
        Significance level of the test (0.05 by default).

    Returns
    -------
    The label of ``x`` when the Grubbs statistic exceeds the critical value,
    otherwise the integer ``0``.

    Raises
    ------
    ValueError
        If an outlier is detected but no label can be determined (``x`` has no
        ``name`` and ``col_name`` was not given).
    """
    n = len(x)
    # The critical value needs n - 2 >= 1 degrees of freedom.
    if n < 3:
        return 0

    values = np.asarray(x, dtype=float)
    # Population SD (ddof=0), as np.std computes by default. The textbook
    # statistic uses the sample SD; the difference is negligible for large n.
    sd_x = np.std(values)
    # Zero (constant data) or NaN spread: the statistic is undefined.
    if not sd_x > 0:
        return 0

    g_calculated = np.max(np.abs(values - np.mean(values))) / sd_x
    t_value = stats.t.ppf(1 - alpha / (2 * n), n - 2)
    t_squared = np.square(t_value)
    g_critical = ((n - 1) * np.abs(t_value)) / (np.sqrt(n) * np.sqrt(n - 2 + t_squared))

    if not g_critical < g_calculated:
        return 0

    # Derive the label from the argument itself rather than from a global
    # loop variable that happens to exist at the call site.
    label = col_name if col_name is not None else getattr(x, "name", None)
    if label is None:
        raise ValueError("outlier detected, but x has no name; pass col_name explicitly")
    return label

#### #2. Finding all Columns with Outliers  

In [None]:
# Collect the label of every column flagged by the Grubbs helper.
cols_with_outliers = []
for col in ecom_ram_tr.columns:
    outlier_col = outlier_cols_ram(ecom_ram_tr[col])
    # The helper returns 0 for a clean column; keep only the flagged ones.
    if outlier_col != 0:
        cols_with_outliers.append(outlier_col)
cols_with_outliers

##### **#Result:** Above columns possess outliers.

#### #3. Reducing Outliers in all the Columns (Applying a Root Transformation with Exponent 1/3.7)

In [None]:
# Raise each flagged column to the power 1/3.7.
root_exponent = 1 / 3.7
for col in cols_with_outliers:
    ecom_ram_tr[col] = ecom_ram_tr[col].pow(root_exponent)

#### #4. Verifying the Outlier removal from all the Columns

In [None]:
# Re-run the Grubbs helper on the transformed columns and keep those that are
# still flagged.
any_outlier_col = []
for col in cols_with_outliers:
    outlier_col = outlier_cols_ram(ecom_ram_tr[col])
    # 0 means "no outlier"; anything else is the column label.
    if outlier_col != 0:
        any_outlier_col.append(outlier_col)
any_outlier_col

##### **#Result**: The above columns still possess outliers. 

#### #5. Individually removing outliers from 'any_outlier_col' columns

#### #5.1. Declaring function for outlier detection. 

In [None]:
def grubbs_test(x):
    """Print a two-sided Grubbs outlier test report (alpha = 0.05) for ``x``.

    Prints the calculated statistic, the critical value and a verdict line.
    Returns ``None``; the function is used only for its printed report.

    Parameters
    ----------
    x : pandas.Series or numpy array of numbers
        The sample to test.
    """
    n = len(x)
    mean_x = np.mean(x)
    sd_x = np.std(x)
    # With fewer than 3 values the t-distribution below has no valid degrees
    # of freedom, and with zero spread the statistic is 0/0. In both cases the
    # comparison against NaN used to fall through to the "outlier" verdict.
    if n < 3 or not sd_x > 0:
        print("Grubbs test is not applicable (fewer than 3 values or zero spread); no outlier can be concluded\n")
        return
    numerator = np.max(np.abs(x - mean_x))
    g_calculated = numerator / sd_x
    print("Grubbs Calculated Value:", g_calculated)
    t_value = stats.t.ppf(1 - 0.05 / (2 * n), n - 2)
    g_critical = ((n - 1) * np.sqrt(np.square(t_value))) / (np.sqrt(n) * np.sqrt(n - 2 + np.square(t_value)))
    print("Grubbs Critical Value:", g_critical)
    if g_critical > g_calculated:
        print("From grubbs_test we observe that calculated value is lesser than critical value, Accept null hypothesis and conclude that there is no outlier\n")
    else:
        print("From grubbs_test we observe that calculated value is greater than critical value, Reject null hypothesis and conclude that there is an outliers\n")

#### #5.2. Reverting the Root Transformation (Exponent 1/3.7) on Columns That Still Have Outliers

In [None]:
# Undo the 1/3.7 power on the columns that are still flagged, by raising them
# back to the power 3.7.
inverse_exponent = 3.7
for col in any_outlier_col:
    ecom_ram_tr[col] = ecom_ram_tr[col].pow(inverse_exponent)

#### #5.3 . Removing Outlier from 'price' Column (Using Imputing)

In [None]:
ecom_ram_tr['price'].value_counts()

In [None]:
%matplotlib inline
ecom_ram_tr['price'].hist(bins = 50, figsize=(10,7)) 
plt.show()

In [None]:
# Cap 'price' at 22: every value >= 22 becomes 22.
# One vectorised clip replaces the per-row Series.replace() loop, which
# rescanned the whole column for every qualifying row.
ecom_ram_tr['price'] = ecom_ram_tr['price'].clip(upper=22)
ecom_ram_tr['price'].value_counts()

In [None]:
grubbs_test(ecom_ram_tr['price'])

#### #5.4 . Removing Outlier from 'rating' Column (Using Imputing)

In [None]:
ecom_ram_tr['rating'].value_counts()

In [None]:
%matplotlib inline
ecom_ram_tr['rating'].hist(bins = 50, figsize=(10,7)) 
plt.show()

In [None]:
# Floor 'rating' at 1.8: every value <= 1.8 becomes 1.8.
# One vectorised clip replaces the per-row Series.replace() loop, which
# rescanned the whole column for every qualifying row.
ecom_ram_tr['rating'] = ecom_ram_tr['rating'].clip(lower=1.8)
ecom_ram_tr['rating'].value_counts()

In [None]:
grubbs_test(ecom_ram_tr['rating'])

#### #5.5. Removing Outlier from 'shipping_option_price' Column (Using Imputing)

In [None]:
ecom_ram_tr['shipping_option_price'].value_counts()

In [None]:
%matplotlib inline
ecom_ram_tr['shipping_option_price'].hist(bins = 50, figsize=(10,7)) 
plt.show()

In [None]:
# Replace every 'shipping_option_price' >= 7 with 6, in one vectorised step
# instead of a per-row Series.replace() loop.
# NOTE(review): the threshold (7) and the replacement (6) differ, unlike the
# other capping cells. mask() keeps that exact behaviour; confirm it is intended.
shipping_price = ecom_ram_tr['shipping_option_price']
ecom_ram_tr['shipping_option_price'] = shipping_price.mask(shipping_price >= 7, 6)
ecom_ram_tr['shipping_option_price'].value_counts()

In [None]:
grubbs_test(ecom_ram_tr['shipping_option_price'])

#### #5.6 . Removing Outlier from 'merchant_rating_count' Column (Using Imputing)

In [None]:
ecom_ram_tr['merchant_rating_count'].value_counts()

In [None]:
%matplotlib inline
ecom_ram_tr['merchant_rating_count'].hist(bins = 50, figsize=(10,7)) 
plt.show()

In [None]:
# Cap 'merchant_rating_count' at 160000: every value >= 160000 becomes 160000.
# One vectorised clip replaces the per-row Series.replace() loop, which
# rescanned the whole column for every qualifying row.
ecom_ram_tr['merchant_rating_count'] = ecom_ram_tr['merchant_rating_count'].clip(upper=160000)
ecom_ram_tr['merchant_rating_count'].value_counts()

In [None]:
grubbs_test(ecom_ram_tr['merchant_rating_count'])

#### #5.7. Removing Outlier from 'merchant_rating' Column (Using Imputing)

In [None]:
ecom_ram_tr['merchant_rating'].value_counts()

In [None]:
%matplotlib inline
ecom_ram_tr['merchant_rating'].hist(bins = 50, figsize=(10,7)) 
plt.show()

#### Applying the Lower and Upper Caps to the 'merchant_rating' Column (Using Imputing)

In [None]:
# Squeeze 'merchant_rating' into [3.3, 4.3]: values <= 3.3 become 3.3 and
# values >= 4.3 become 4.3.
# One vectorised clip replaces the per-row Series.replace() loop, which
# rescanned the whole column for every qualifying row.
ecom_ram_tr['merchant_rating'] = ecom_ram_tr['merchant_rating'].clip(lower=3.3, upper=4.3)

ecom_ram_tr['merchant_rating'].value_counts()

In [None]:
grubbs_test(ecom_ram_tr['merchant_rating'])

# TASK #3: Splitting Training and Testing Data 

### A.  Splitting the Dataframe to Train and Test Data 

#### #1. Splitting the data frame 

In [None]:
# Hold out 20 % of the rows, stratified on 'merchant_has_profile_picture'.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=65)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(
    split.split(ecom_ram_tr, ecom_ram_tr['merchant_has_profile_picture'])
)
# split() returns positional row numbers, so select by position (iloc).
# Label-based .loc only gives the same rows while the index is 0..n-1.
train_set = ecom_ram_tr.iloc[train_index]
test_set = ecom_ram_tr.iloc[test_index]

print(f"Rows in train set : {len(train_set)}\nRows in test set: {len(test_set)}\n")

###  B. Verifying Stratification 

#### #1. 'badges_count' 

In [None]:
train_set['badges_count'].value_counts()

In [None]:
test_set['badges_count'].value_counts()

#### #2. 'merchant_has_profile_picture'

In [None]:
train_set['merchant_has_profile_picture'].value_counts()

In [None]:
test_set['merchant_has_profile_picture'].value_counts()

###  C. Copying the training data and verifying correctness

In [None]:
ecom_ram_tr = train_set.copy()
ecom_ram_tr.shape

# TASK #4: Plotting Feature Correlations with 'units_sold'

### A. Finding Correlation of 'units_sold' with Other Features

In [None]:
corr_matrix = ecom_ram_tr.corr()
corr_matrix['units_sold'].sort_values()

### B. Plotting Correlation Among Top Correlated Features

In [None]:
# Pairwise scatter plots of the target and the two features singled out in
# the correlation step.
attributes = ['units_sold', 'rating_count', 'product_id']
scatter_matrix(ecom_ram_tr[attributes], figsize=(12, 8))
# plt.plot() with no arguments draws nothing; show the figure instead, as the
# other plotting cells of this notebook do.
plt.show()

In [None]:
ecom_ram_tr.plot(kind = "scatter", x = "units_sold", y="rating_count", alpha = 0.8)

##### **Observation #4.1:** 'units_sold' has the strongest positive correlation with 'rating_count' and the strongest negative correlation with 'product_id'.

# TASK #5: Forming Training Set and Training Labels  

In [None]:
# Separate the target from the predictors of the training split.
ecom_ram_labels = train_set["units_sold"].copy()       # target values
ecom_ram_tr = train_set.drop(columns=["units_sold"])   # predictors only

# TASK #6. Performing Feature Scaling  

In [None]:
# Preprocessing chain: median imputation followed by standardisation.
median_imputer = SimpleImputer(strategy="median")
standard_scaler = StandardScaler()
my_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('std_scaler', standard_scaler),
])

In [None]:
ecom_ram_num = my_pipeline.fit_transform(ecom_ram_tr)
ecom_ram_num

In [None]:
ecom_ram_num.shape

# TASK #7: Selecting the Desired Model for the E-Commerce Sales

In [None]:
# Compare several ensemble regressors by 10-fold cross-validated RMSE.
model_names = [ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor]

# Collect one dict per model and build the frame once at the end;
# DataFrame.append was removed in pandas 2.0.
comparison_rows = []
for model_name in model_names:
    model = model_name()

    # cross_val_score fits its own clone of the estimator on every fold, so a
    # separate model.fit() beforehand would be wasted work.
    scores = cross_val_score(model, ecom_ram_num, ecom_ram_labels, scoring="neg_mean_squared_error", cv=10)
    # The scorer returns negated MSE; flip the sign before taking the root.
    rmse_scores = np.sqrt(-scores)

    comparison_rows.append({
        # Store the readable class name rather than the class object.
        'model_name': model_name.__name__,
        'mean_rmse': rmse_scores.mean(),
        'rmse_std': rmse_scores.std(),
    })

model_comparison = pd.DataFrame(comparison_rows, columns=['model_name', 'mean_rmse', 'rmse_std'])
model_comparison = model_comparison.sort_values(by=['mean_rmse'])
print(model_comparison)

##### **#Result:** The best result is obtained by using ExtraTreesRegressor model. It is the most suitable model for the given dataset.

# TASK #8: Implementing ExtraTreesRegressor and Saving the Model

In [None]:
model = ExtraTreesRegressor()
model.fit(ecom_ram_num, ecom_ram_labels)

In [None]:
from joblib import dump, load
dump(model, 'EcomRam.joblib')

# TASK #9: Testing the Model in Test Data

In [None]:
model = load('EcomRam.joblib')

In [None]:
# Split the held-out rows into predictors and target.
Y_test = test_set['units_sold'].copy()
X_test = test_set.drop(columns=['units_sold'])

# Reuse the already-fitted preprocessing pipeline, then predict.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)

# Error of the predictions on the held-out data.
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

# Final Result

In [None]:
# Report the held-out error of the selected model.
print(f"Using ExtraTreesRegressor the model predicts units sold with MSE: {final_mse} and RMSE: {final_rmse}.")