# Imports


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# getting the path of the file
import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walking it recursively), then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# ignore warning for the distplot, which is deprecated, but allows for a better viz of the histogram and kde line

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the e-commerce customers CSV from the Kaggle input directory
# and preview its first rows.

ecomm_csv_path = '/kaggle/input/ecommerce-customers/Ecommerce Customers.csv'
ecomm = pd.read_csv(ecomm_csv_path)
ecomm.head()

### Description of each column, according to the dataset source

* **Avg. Session Length**: Average length of the in-store style advice sessions.
* **Time on App**: Average time spent on App in minutes
* **Time on Website**: Average time spent on Website in minutes
* **Length of Membership**: How many years the customer has been a member.

# Laying out the main ideas:
* Explore the dataset (EDA)
* Preprocessing and modeling
* Testing and evaluating the models
* Conclusion (insights)
* **Bonus**: Clustering with K-Means 

# Exploratory Data Analysis (EDA)

In [None]:
# Column dtypes, non-null counts and memory footprint of the frame.
# memory_usage='deep' also measures the contents of object (string) columns.
ecomm.info(memory_usage='deep')

In [None]:
# The data types look fine. Count the missing values per column to confirm
# that there are no nulls.

ecomm.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
ecomm.describe()

In [None]:
# Drop the 'Avatar' column. errors='ignore' keeps the cell re-runnable:
# without it, a second run raises KeyError because the column is already gone.

ecomm = ecomm.drop(columns='Avatar', errors='ignore')

# Split 'Address' on commas and look at a random sample of 10 rows:
# it looks like some records do not have the state available.

ecomm.Address.str.split(',', expand=True).sample(10)

In [None]:
# Bar chart of the total yearly spend for the top-spending states. For
# illustrative purposes only: not every address yields a usable state.

# The state is taken as the first token after the first comma of the address.
# NOTE(review): assumes a '<city>, <ST> <zip>' layout - confirm against the data.
ecomm['State'] = ecomm.Address.str.split(',', expand=True)[1].str.split(' ', expand=True)[1]
ecomm = ecomm[['Email', 'Address', 'State', 'Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership', 'Yearly Amount Spent']] # rearranging columns

# Aggregate only the spend column instead of summing every column of the
# frame (text columns included) and selecting one afterwards.
spend_by_state = (
    ecomm.groupby('State')['Yearly Amount Spent']
    .sum()
    .sort_values(ascending=False)
)

# Keep genuine two-letter codes only, then take the top 9. This replaces the
# positional slice [1:10], which dropped the top-ranked group unconditionally
# (presumably to hide a parsing artefact - TODO confirm) and would discard a
# real state whenever that artefact is not ranked first.
is_state_code = spend_by_state.index.str.match(r'^[A-Z]{2}$')
top_states = spend_by_state[is_state_code].head(9)

plt.figure(figsize=(10,8))
top_states.plot(kind='bar', color='blue');
plt.title('Amount Spent by US State', loc='Left', fontsize=17, pad=20);

In [None]:
# Pairwise relationships across the dataset (scatter plots between variables,
# per-variable distributions on the diagonal).

sns.set_style('whitegrid')
sns.pairplot(ecomm);

# Side note: the variables already look approximately normally distributed (a good sign for the regression analysis).
# 'Length of Membership' seems to have the highest correlation with the amount spent. 'Time on App' apparently comes second.

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# numeric_only=True is needed because the frame still holds text columns
# (e.g. 'Email', 'Address'): pandas >= 2.0 raises on them instead of
# silently dropping them as older versions did.

plt.figure(figsize=(10, 8))
sns.heatmap(ecomm.corr(numeric_only=True), annot=True, cmap='coolwarm');
plt.title('Correlation of the Dataset', loc='left', fontsize=15);

In [None]:
# Length of membership vs. yearly amount spent: scatter plot with a fitted
# linear regression line, drawn in red.

sns.regplot(data=ecomm, x='Length of Membership', y='Yearly Amount Spent', color='r');

In [None]:
# Yearly amount spent vs. time on website, drawn twice: first as hexbin
# densities, then as a scatter plot with a fitted regression line.
# The lack of a strong relationship between the variables is quite apparent.

sns.set_style('whitegrid')
website_minutes = ecomm['Time on Website']
yearly_spend = ecomm['Yearly Amount Spent']

for plot_kind in ('hex', 'reg'):
    sns.jointplot(x=website_minutes, y=yearly_spend, kind=plot_kind)

In [None]:
# Yearly amount spent vs. time on app, drawn twice in green: first as hexbin
# densities, then as a scatter plot with a fitted regression line.
# The correlation is much stronger than for the time on website.

app_minutes = ecomm['Time on App']
yearly_spend = ecomm['Yearly Amount Spent']

for plot_kind in ('hex', 'reg'):
    sns.jointplot(x=app_minutes, y=yearly_spend, kind=plot_kind, color='g')

# Preprocessing and Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Features: the four numeric behaviour columns. Target: the yearly amount spent.
X = ecomm[['Avg. Session Length', 'Time on App','Time on Website', 'Length of Membership']]
y = ecomm['Yearly Amount Spent']

# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# coefficients and metrics reported below are the same on every run; without
# it each "Restart & Run All" produces different numbers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Report the fitted coefficients together with the score (R^2) measured on
# the training data.

print(f'Coefficients: {reg.coef_}. \nScore: {reg.score(X_train, y_train)}.')

# Predicting and Evaluating the Linear Model

In [None]:
# Test-set metrics: MAE, MSE, RMSE and R^2, printed together.
# R^2 is computed with r2_score. explained_variance_score (used here before)
# is a different metric: it ignores any systematic offset of the predictions,
# so it only matches R^2 when the mean residual is exactly zero.

from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
predictions = reg.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rsqrt = r2_score(y_test, predictions)  # variable name kept for backward compatibility

print('MAE: {} \nMSE: {} \nRMSE: {} \nR-squared: {}'.format(mae, mse, rmse, rsqrt))

# the model explains approx. 98% of the variance, which is a very good fit

In [None]:
# Predicted vs. actual yearly spend on the test set, with a fitted trend line.
# The prediction is computed once and reused; the cell used to start with an
# extra reg.predict(X_test) call whose result was neither stored nor displayed.
y_pred = reg.predict(X_test)

ax = sns.regplot(x=y_test, y=y_pred,
                 scatter_kws={'color': 'blue'}, line_kws={'color':'red'});
ax.set(xlabel='y test', ylabel='predicted y');

In [None]:
# Predicted next to actual amount spent, shown for 10 randomly drawn test rows.
# The frame keeps the index of y_test; the predictions are attached by position.

samp = y_test.to_frame(name='Actual')
samp.insert(0, 'Prediction', predictions)
samp.sample(10)

In [None]:
# Overlay the actual and the predicted amounts of every test observation,
# plotted in test-set order on one wide figure.
fig, ax = plt.subplots(figsize=(18, 8))
x_ax = range(len(X_test))
ax.plot(x_ax, y_test, lw=1, color='blue', label="Original")
ax.plot(x_ax, predictions, lw=0.8, color='red', label="Predicted", marker="x", markersize=5)
ax.legend()
ax.set_title('Comparison between original and predicted amounts', loc='left', fontsize=15, pad=20);

# Residuals


In [None]:
# Distribution of the residuals (actual - predicted) on the test set.
# histplot with kde=True and stat='density' draws the same histogram-plus-KDE
# view as distplot, which is deprecated and scheduled for removal from seaborn.
sns.histplot(y_test - predictions, kde=True, stat='density');

# Conclusion and Insights of the Linear Model

In [None]:
# One row per feature column of X, holding its fitted regression coefficient.
coeff = pd.Series(reg.coef_, index=X.columns, name='Coefficients').to_frame()
coeff

### Insights:

* Based on the coefficients, and given that the model explains approx. 98% of the variance in the test data, we can estimate that, with all other features held fixed, **each one-unit (one-minute) increase in the time spent on the app is associated with a USD 38.10 increase in the 'Yearly Amount Spent'**.
* Considering that it would be significantly more difficult to increase the **'Avg. Session Length'** (and doing so may even lead to a worse customer experience), and that the **'Length of Membership'** cannot be directly stimulated (only indirectly, by focusing on customer service and retention) and only grows with time, **the company should focus on**:
    * Further developing their mobile application, which should lead to an increase in the time customers spend on it.
    * Creating a survey and performing a root cause analysis to better understand how to improve the website experience and why it does not drive sales as well as the mobile app does.

# Bonus: Clustering with K-Means

* The idea is to **find the best number of clusters** and separate them in a way that would allow for a different strategy to target each one.

In [None]:
from sklearn.cluster import KMeans

# Work on an independent copy of the feature frame. With a plain `clus = X`
# both names point to the same object, so the columns that later cells add to
# `clus` would also appear in X.
clus = X.copy()

# Elbow method: fit K-Means for k = 1..9 and record the inertia
# (within-cluster sum of squared distances) of each fit.

k_values = range(1, 10)
c = []
for i in k_values:
    # random_state pins the centroid initialisation so the curve is reproducible
    km = KMeans(n_clusters=i, init='k-means++', random_state=42)
    km.fit(clus)
    c.append(km.inertia_)
plt.plot(k_values, c);
plt.title('Elbow Method', loc='left', fontsize=15, pad=20);
plt.xlabel('Number of clusters');
plt.ylabel('CS');

In [None]:
# Trying out 5 clusters.

# Cluster on the feature columns only. 'Cluster' and 'Yearly Amount Spent' are
# columns that an earlier run of this section may have added to `clus`; they
# are dropped so that re-running the cell does not feed old labels or the
# regression target back into K-Means.
features = clus.drop(columns=['Cluster', 'Yearly Amount Spent'], errors='ignore')

# random_state pins the centroid initialisation so the labels are reproducible
km = KMeans(n_clusters=5, random_state=42)

# assign() returns a new frame, so the frame `clus` was created from is left
# untouched; the labels are stored as a categorical column for plotting.
clus = features.assign(Cluster=km.fit_predict(features)).astype({'Cluster': 'category'})
clus.head(10)

In [None]:
# Scatter plot of membership length against time on app, coloured by cluster.
cluster_scatter_style = dict(height=10, alpha=.8, s=100, palette='Paired')
sns.relplot(data=clus, x='Length of Membership', y='Time on App', hue='Cluster',
            **cluster_scatter_style);

In [None]:
# Distribution of the yearly amount spent within each cluster.
# The spend column is attached to a temporary frame (aligned on the row index)
# instead of being written into `clus`, so the frame used for clustering never
# picks up the regression target and the clustering cells stay re-runnable.
spend_by_cluster = clus.assign(**{'Yearly Amount Spent': ecomm['Yearly Amount Spent']})
sns.boxplot(x='Yearly Amount Spent', y='Cluster', data=spend_by_cluster);

##### That is it for this one. Thank you for reading! :)
If there's anything you believe could be improved, please, leave a comment. :)