Goal: Find out where the company should focus its efforts, either on the Mobile App or on the Website.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Basic Imports

In [None]:
# Libraries for data analysis
import numpy as np
import pandas as pd
# Libraries for Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Use seaborn's 'whitegrid' theme for every figure drawn below
sns.set_style('whitegrid')
# Silence all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation warnings are hidden too.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw customer table from the Kaggle input directory and preview it
csv_path = '/kaggle/input/ecommerce-customers/Ecommerce Customers.csv'
data = pd.read_csv(csv_path)
data.head()

# EDA

In [None]:
# Print a concise summary of the raw frame (columns, dtypes, non-null counts)
data.info()

In [None]:
# Descriptive statistics of the raw frame
data.describe()

In [None]:
df = data.copy()  # Work on a copy so the raw frame stays untouched

# Extract the first space-delimited token that follows the first ', ' in
# 'Address' (stored under the existing 'Postal Code' column name).
# Addresses with no ', ' separator, or missing addresses, fall back to the
# literal string 'None'. The vectorised .str accessor replaces the previous
# positional loop with its bare `except`, and keeps the result aligned with
# df's index whatever that index is.
after_comma = df['Address'].str.split(', ').str[1]
df['Postal Code'] = after_comma.fillna('None').str.split(' ').str[0]

# Extract the email site (text between '@' and the next '.') from the 'Email'
# feature, and group every site other than the three listed below as 'others'.
email_site = df['Email'].str.split('@').str[1].str.split('.').str[0]
df['Email Site'] = email_site.where(email_site.isin(['gmail', 'hotmail', 'yahoo']), 'others')

# The raw text columns are no longer needed once the features are extracted
df = df.drop(columns=['Email', 'Address'])

df.head()

In [None]:
# One 30-bin histogram per column that DataFrame.hist can plot
df.hist(figsize=(10, 8), bins=30, edgecolor='black')
plt.show()

All the numeric features look normally distributed.

In [None]:
# Pairwise relationships between the columns of df
pair_grid = sns.pairplot(data=df)
plt.show()

'Time on App' and 'Length of Membership' look the most correlated with 'Yearly Amount Spent'.

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric columns are passed to .corr(): df still holds string columns
# here, and recent pandas releases raise on them instead of skipping them.
corr = df.select_dtypes(include='number').corr()
# Boolean mask hiding the upper triangle (diagonal included) so each pair is
# shown once; built from ones so a correlation of exactly 0 is still masked.
matrix = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, mask=matrix)
plt.show()

In [None]:
# Checking how 'Yearly Amount Spent' varies across the 'Email Site' values,
# with one panel per plot type (bar, box, violin)
fig, axs = plt.subplots(ncols=3, figsize=(20, 8))
for draw, panel in zip((sns.barplot, sns.boxplot, sns.violinplot), axs):
    draw(x='Email Site', y='Yearly Amount Spent', data=df, ax=panel)
plt.show()

The 'Yearly Amount Spent' looks the same across all the values of 'Email Site'. This feature doesn't look helpful.

In [None]:
# Checking the relationship between 'Postal Code' and 'Yearly Amount Spent' as a bar plot
bar_fig, bar_ax = plt.subplots(figsize=(25, 8))
sns.barplot(data=df, x='Postal Code', y='Yearly Amount Spent', ax=bar_ax)
plt.show()

In [None]:
# Checking the relationship between 'Postal Code' and 'Yearly Amount Spent' as a box plot
box_fig, box_ax = plt.subplots(figsize=(25, 8))
sns.boxplot(data=df, x='Postal Code', y='Yearly Amount Spent', ax=box_ax)
plt.show()

There is some variation in 'Yearly Amount Spent' across the 'Postal Code' values, but this feature will not help us because of the large number of distinct values in 'Postal Code'.

In [None]:
# Checking whether our target is normally distributed or not.
from scipy import stats
fig, axs = plt.subplots(ncols=2, figsize=(10,5))
# Left panel: density histogram with a KDE overlay. histplot replaces the
# deprecated distplot.
sns.histplot(df['Yearly Amount Spent'], kde=True, stat='density', ax=axs[0])
# Right panel: probability (Q-Q) plot against the normal distribution. The
# target axes is passed explicitly instead of relying on pyplot's current axes.
res = stats.probplot(df['Yearly Amount Spent'], plot=axs[1])
plt.show()

It seems like our target is normally distributed.

# Preprocessing for ML models

In [None]:
# Drop the 'Avatar' column (not used in modelling), then one-hot encode the
# remaining categorical columns, dropping the first level of each.
df = df.drop(columns='Avatar').pipe(pd.get_dummies, drop_first=True)
df.head()

# Model Building

In [None]:
# Target vector first, then the feature matrix (every column except the target)
y = df['Yearly Amount Spent']
X = df.drop(columns=['Yearly Amount Spent'])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold

# Choosing the best model among the tree based models.
# Each candidate is scored with the same shuffled 4-fold split and the mean of
# cross_val_score's per-fold scores is reported. Every estimator gets a fixed
# random_state so the comparison is reproducible from run to run.
kfold = KFold(n_splits=4, shuffle=True, random_state=42)
# Names and estimators live together so the report cannot get out of sync
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boost': GradientBoostingRegressor(random_state=42),
}
scores = [np.mean(cross_val_score(model, X, y, cv=kfold)) for model in models.values()]

print(pd.DataFrame({'model': list(models), 'score': scores}))

In [None]:
# Training both linear and tree based model on the full data.
# The boosted model is seeded so its importances are reproducible.
gboost = GradientBoostingRegressor(random_state=42).fit(X, y)
lr = LinearRegression().fit(X, y)

# Relevant features = everything except the many 'Postal Code' dummy columns.
# They are selected by name rather than by position, so the plot does not
# silently depend on column order or on the number of 'Email Site' dummies.
relevant = ~X.columns.str.startswith('Postal Code_')
# Getting the name of relevant features
feature_name = list(X.columns[relevant])
# Getting the coef for relevant features
coef = list(lr.coef_[relevant])
# Getting the feature importance for relevant features
fimp = list(gboost.feature_importances_[relevant])

# Plotting the coef and feature importance side by side
fig, axs = plt.subplots(ncols=2, figsize=(22,5))
axs[0].barh(feature_name, coef)
axs[0].set_title('Linear Regression Coef')

axs[1].barh(feature_name, fimp)
axs[1].set_title('Gradient Boost Feature Importance')
plt.show()

Conclusion: Both models and our data analysis show that the 'Mobile App' generates more sales than the 'Website'. So the company should enhance its user experience on the website.