### Importing Necessary Modules

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' display limits so long/wide frames are not truncated in output.
pd.options.display.max_rows = 150
pd.options.display.max_columns = 150

### Reading the csv file

In [None]:
from pathlib import Path

# Single place to repoint the notebook when it is run outside Kaggle.
DATA_DIR = Path('/kaggle/input/summer-products-and-sales-in-ecommerce-wish')

# Main product table; every later cell works on `product`.
product = pd.read_csv(DATA_DIR / 'summer-products-with-rating-and-performance_2020-08.csv')
# Category tables stored in the same dataset directory.
uni_cat = pd.read_csv(DATA_DIR / 'unique-categories.csv')
uni_cat_sort = pd.read_csv(DATA_DIR / 'unique-categories.sorted-by-count.csv')

In [None]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
product.head()

### Checking the basic info like shape of the DS and numerical parameters

In [None]:
# (rows, columns) of the raw product table.
product.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
product.describe()

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing data.
product.info()

### Checking Missing values before starting EDA analysis

In [None]:
# Number of missing values per column.
product.isna().sum()

### Dropping Unwanted Columns and Columns having high number of Missing values for further Analysis

In [None]:
# Show two sample rows so the column contents are visible while choosing what to drop.
product.head(2)

In [None]:
# Columns excluded from the rest of the analysis.
col_drop = ['title','title_orig','currency_buyer','rating_five_count', 'rating_four_count','rating_three_count',
            'rating_two_count', 'rating_one_count','has_urgency_banner', 'urgency_text', 'merchant_id',
            'merchant_has_profile_picture','merchant_profile_picture', 'product_url','product_picture','product_id',
            'theme', 'crawl_month']

# Reassign instead of dropping in place, and ignore already-missing labels,
# so re-running this cell does not raise KeyError.
product = product.drop(columns=col_drop, errors='ignore')

# Missing-value counts for the columns that remain.
product.isnull().sum()

### Deep diving into the columns with fewer missing values to decide whether to drop them or impute them with the mean, median or mode.

In [None]:
# Percentage of missing values per column.
product.isnull().sum().mul(100).div(len(product))

### Imputing the missing values

In [None]:
# Fill missing colours with the most frequent colour.
# Assign the result back to the column: calling fillna(inplace=True) on the
# attribute-accessed Series is a chained in-place update, which does not modify
# `product` under pandas Copy-on-Write.
product['product_color'] = product['product_color'].fillna(product['product_color'].mode()[0])

In [None]:
# Fill missing origin countries with the most frequent country.
# Assign the result back to the column: calling fillna(inplace=True) on the
# attribute-accessed Series is a chained in-place update, which does not modify
# `product` under pandas Copy-on-Write.
product['origin_country'] = product['origin_country'].fillna(product['origin_country'].mode()[0])

### Now let's analyze each column - Visualization and cleaning by removing outliers

In [None]:
# Preview the table again after dropping columns and imputing missing values.
product.head()

In [None]:
# Box plot (left) and density histogram with KDE overlay (right) of `price`.
# histplot replaces the deprecated sns.distplot; stat='density' keeps the
# normalised y-axis that distplot used.
fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(20, 6))
sns.boxplot(data=product, x='price', ax=ax_box)
sns.histplot(data=product, x='price', bins=15, kde=True, stat='density', ax=ax_hist)
plt.show()

In [None]:
# Box plot (left) and density histogram with KDE overlay (right) of `retail_price`.
# histplot replaces the deprecated sns.distplot; stat='density' keeps the
# normalised y-axis that distplot used.
fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(20, 6))
sns.boxplot(data=product, x='retail_price', ax=ax_box)
sns.histplot(data=product, x='retail_price', bins=15, kde=True, stat='density', ax=ax_hist)
plt.show()

### Creating new columns which will tell us whether a product is sold at a profit or a loss

- I am treating `price` as the cost price and `retail_price` as the selling price, following the basic rule of business: buy low and sell high. We will notice some products whose selling price is lower than their cost price. This may be because the seller wants to clear stock, or is trying to attract consumers by offering a few products at a loss, and there can be many other reasons.

In [None]:
# Per-unit margin: retail_price minus price (negative values mean a loss).
product['Profit_Loss'] = product['retail_price'].sub(product['price'])

In [None]:
# Total margin per product: per-unit Profit_Loss times units sold.
product['revenue'] = product['Profit_Loss'].mul(product['units_sold'])

- This gives us the list of top 10 products which have generated the maximum revenue for the company

In [None]:
# Ten rows with the largest `revenue`, highest first.
maxrevenue = product.sort_values(by='revenue', ascending=False).iloc[:10]
maxrevenue

- Similarly, we can generate a list of the top 10 loss-making products and consider removing them from the list to reduce the losses.

In [None]:
# Ten rows with the smallest `revenue` (largest losses first).
bottom10loss = product.sort_values(by='revenue', ascending=True).iloc[:10]
bottom10loss

In [None]:
# Ten rows with the highest `rating`.
top10rating = product.sort_values(by='rating', ascending=False).iloc[:10]
top10rating

In [None]:
# Ten rows with the lowest `rating`.
bottom10rating = product.sort_values(by='rating', ascending=True).iloc[:10]
bottom10rating

### Creating 2 separate DataFrames for profit-making and loss-making products

In [None]:
# Split on the sign of `revenue`; both frames are ordered by revenue, largest first.
df_profit = product.loc[product['revenue'] >= 0].sort_values(by='revenue', ascending=False)
df_loss = product.loc[product['revenue'] < 0].sort_values(by='revenue', ascending=False)

In [None]:
# Highest-revenue rows of the profitable subset (df_profit is sorted by revenue, descending).
df_profit.head()

In [None]:
# Loss-making products ordered by per-unit margin, largest per-unit loss first.
# NOTE(review): this displays the whole frame; consider .head() to keep the output short.
df_loss.sort_values('Profit_Loss')

In [None]:
# Row/column counts of the profit and loss subsets, one per line.
print(df_profit.shape, df_loss.shape, sep='\n')

In [None]:
# Twenty highest-rated products among the loss makers.
loss_rating = df_loss.sort_values(by='rating', ascending=False).iloc[:20]

In [None]:
# Of those high-rated loss makers, the five with the largest losses.
loss_rating.sort_values(by='revenue', ascending=True).head(5)

- Above is the list of the top 5 loss-making, high-rated products. As observed, high-rated products do not make high losses. We will now check the profit-making products with both high and low ratings.

In [None]:
# Summary statistics of revenue for the profitable products.
df_profit['revenue'].describe()

In [None]:
# Revenue values of the profitable products (sorted descending, as df_profit is).
df_profit.revenue

In [None]:
# Revenue buckets: (-1, 100], (100, 1000], (1000, 10000], (10000, inf).
# The lower edge is -1 so that a revenue of exactly 0 falls in 'Low'.
# The top edge is open-ended: with a fixed cap, any revenue above the cap
# would silently become NaN and drop out of the plots below.
bins = [-1, 100.00, 1000, 10000, np.inf]
label = ['Low', 'Medium', 'High', 'Very_High']

df_profit['revenue_cat'] = pd.cut(df_profit['revenue'], bins=bins, labels=label)

In [None]:
# Rating spread within each revenue bucket of the profitable products.
fig, ax = plt.subplots()
sns.boxplot(x='revenue_cat', y='rating', data=df_profit, ax=ax)
plt.show()

In [None]:
# Units sold within each revenue bucket of the profitable products.
fig, ax = plt.subplots()
sns.boxplot(x='revenue_cat', y='units_sold', data=df_profit, ax=ax)
plt.show()

In [None]:
# Units sold for products with and without ad boosts (all products).
fig, ax = plt.subplots()
sns.boxplot(x='uses_ad_boosts', y='units_sold', data=product, ax=ax)
plt.show()

In [None]:
# Keep only products that sold fewer than 20,000 units, with the two columns needed below.
df1 = product.loc[product['units_sold'] < 20000, ['units_sold', 'uses_ad_boosts']]

In [None]:
# How many of the filtered products do / do not use ad boosts.
df1['uses_ad_boosts'].value_counts()

In [None]:
# Same comparison as before, restricted to products under 20,000 units sold.
fig, ax = plt.subplots()
sns.boxplot(x='uses_ad_boosts', y='units_sold', data=df1, ax=ax)
plt.show()

- As we can see above, using ad boosts doesn't really increase the number of units sold.

In [None]:
# Density histogram with KDE overlay of product ratings.
# histplot replaces the deprecated sns.distplot; stat='density' keeps the
# normalised y-axis that distplot used.
sns.histplot(data=product, x='rating', kde=True, stat='density')
plt.show()

- Clearly, the ratings for the products are distributed around 4; however, there are a few products with a rating of 5 as well.

In [None]:
# List the remaining column names to pick the variables for the correlation analysis.
product.columns

In [None]:
# Subset of columns fed into the correlation heatmap.
corr_col = product.loc[:, [
    'price', 'retail_price', 'units_sold', 'uses_ad_boosts', 'rating',
    'rating_count', 'badges_count', 'badge_local_product', 'badge_product_quality',
    'badge_fast_shipping', 'product_variation_inventory',
    'shipping_option_price', 'shipping_is_express', 'countries_shipped_to',
    'inventory_total', 'merchant_rating_count',
    'merchant_rating',
]]

In [None]:
# Smaller set of variables for the pairwise scatter-plot grid.
var = product.loc[:, ['price', 'retail_price', 'units_sold', 'rating', 'merchant_rating']]

In [None]:
# Pairwise scatter plots (and per-variable distributions on the diagonal).
sns.pairplot(data=var)
plt.show()

In [None]:
# Product rating against the number of ratings it received.
fig, ax = plt.subplots()
sns.scatterplot(x='rating', y='rating_count', data=product, ax=ax)
plt.show()

In [None]:
# Product rating against the rating of its merchant.
fig, ax = plt.subplots()
sns.scatterplot(x='rating', y='merchant_rating', data=product, ax=ax)
plt.show()

In [None]:
# Annotated correlation matrix of the selected columns.
fig, ax = plt.subplots(figsize=(20, 10))
correlations = corr_col.corr()
sns.heatmap(correlations, annot=True, cmap='YlGnBu', ax=ax)
plt.show()

In [None]:
# Remove two columns before re-plotting the heatmap.
# Reassign instead of dropping in place, and ignore already-missing labels,
# so re-running this cell does not raise KeyError.
corr_col = corr_col.drop(columns=['rating_count', 'badge_product_quality'], errors='ignore')

In [None]:
# Correlation matrix again, now without the two dropped columns.
fig, ax = plt.subplots(figsize=(20, 10))
correlations = corr_col.corr()
sns.heatmap(correlations, annot=True, cmap='YlGnBu', ax=ax)
plt.show()

- From the above heatmap we can see that price of the product is highly correlated with shipping_option_price, retail_price and product_variation_inventory. Hence we will build a linear regression model to predict the price using these variables.

In [None]:
# Target (`price`) plus the three predictors chosen from the heatmap.
final_vars = product.loc[:, ['price', 'shipping_option_price', 'retail_price', 'product_variation_inventory']]

In [None]:
# Feature matrix and regression target.
X = final_vars.loc[:, ['shipping_option_price', 'retail_price', 'product_variation_inventory']]
y = final_vars.loc[:, 'price']

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split; the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [None]:
import statsmodels.api as sm

# sm.OLS does not add an intercept by itself, so add a constant column first.
X_train_sm = sm.add_constant(X_train)

# Fit ordinary least squares on the training split.
lr = sm.OLS(endog=y_train, exog=X_train_sm).fit()

# Fitted intercept and slopes.
lr.params

In [None]:
# Full regression report for the fitted OLS model.
print(lr.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of each predictor.
# The VIF regressions need an intercept in the design matrix; without it the
# values are uncentered and overstated. The constant is added here only for
# the computation and is not reported as a feature.
X_train_vif = sm.add_constant(X_train, has_constant='add')

vif = pd.DataFrame({'Features': X_train.columns})
vif['VIF'] = [
    variance_inflation_factor(X_train_vif.values, X_train_vif.columns.get_loc(col))
    for col in X_train.columns
]
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
# Add the constant column to the test features, as was done for training.
X_test_sm = sm.add_constant(X_test)

# Predicted prices for the held-out split.
y_test_pred = lr.predict(exog=X_test_sm)

In [None]:
# In-sample predictions, used below to inspect the residuals.
y_train_price = lr.predict(exog=X_train_sm)

In [None]:
# Distribution of the training residuals (actual minus fitted price).
# histplot replaces the deprecated sns.distplot; stat='density' keeps the
# normalised y-axis that distplot used.
residuals = y_train - y_train_price

fig, ax = plt.subplots()
sns.histplot(residuals, bins=20, kde=True, stat='density', ax=ax)
fig.suptitle('Error Terms', fontsize=20)   # Plot heading
ax.set_xlabel('Errors', fontsize=18)
plt.show()   # render the figure instead of echoing the label object's repr

In [None]:
# Regression report shown again at the end of the notebook (same fitted model as above).
print(lr.summary())