# IMPORTING LIBRARIES AND DATASETS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
import cv2
import plotly.express as px
import tensorflow as tf
from tensorflow.python.keras import Sequential
from tensorflow.keras import layers, optimizers
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
from IPython.display import display
from tensorflow.keras import backend as K
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go

%matplotlib inline

In [None]:
sales_df = pd.read_csv('../input/sample-sales-data/sales_data_sample.csv', encoding = 'unicode_escape')

# Note: MSRP (manufacturer's suggested retail price), also called the sticker price, is the price at which the manufacturer recommends a product be sold.
# MSRP is used to standardize the price of products over multiple company store locations.

In [None]:
sales_df

In [None]:
# Let's view the types of data
sales_df.dtypes

In [None]:
# Convert order date to datetime format
sales_df['ORDERDATE'] = pd.to_datetime(sales_df['ORDERDATE'])

# Check the type of data
sales_df.dtypes

In [None]:
# Check the number of non-null values in the dataframe
sales_df.info()

In [None]:
# Check the number of Null values in the data
sales_df.isnull().sum()

In [None]:
# Since there are a lot of null values in 'ADDRESSLINE2', 'STATE', 'POSTALCODE' and 'TERRITORY', we can drop them.
# 'COUNTRY' will represent the order's geographical information.
# We can also drop 'CITY', 'ADDRESSLINE1', 'PHONE', 'CONTACTFIRSTNAME', 'CONTACTLASTNAME', 'CUSTOMERNAME' and 'ORDERNUMBER' since they are not required for the analysis.

# Columns removed before the analysis (see the list of names below)
df_drop = [
    'ADDRESSLINE1',
    'ADDRESSLINE2',
    'POSTALCODE',
    'CITY',
    'TERRITORY',
    'PHONE',
    'STATE',
    'CONTACTFIRSTNAME',
    'CONTACTLASTNAME',
    'CUSTOMERNAME',
    'ORDERNUMBER',
]
sales_df = sales_df.drop(columns = df_drop)
sales_df.head()

In [None]:
sales_df.isnull().sum()

In [None]:
# Obtain the number of unique values in each column
sales_df.nunique()

# PERFORM EXPLORATORY DATA ANALYSIS AND DATA CLEANING

In [None]:
sales_df['COUNTRY'].value_counts().index

In [None]:
sales_df['COUNTRY'].value_counts()

In [None]:
# Function to visualize the count of items in a given column
# Note that Plotly is a Python graphing library that makes interactive, publication-quality graphs.


def barplot_visualization(x):
  """Show an interactive bar chart of the value counts of one ``sales_df`` column.

  Args:
    x: Name of the column in the global ``sales_df`` whose distinct values
      are counted.

  Returns:
    None. The Plotly figure is displayed via ``fig.show()``.
  """
  # Count once and reuse the result for the x values, bar heights and colors
  counts = sales_df[x].value_counts()
  # Plotly builds its own figure, so no matplotlib figure is created here
  fig = px.bar(x = counts.index, y = counts, color = counts.index, height = 600)
  fig.show()

In [None]:
# Let's call this function for any given column such as 'COUNTRY'
barplot_visualization('COUNTRY')

In [None]:
# Pass status column to the function 
barplot_visualization('STATUS')

In [None]:
# Let's drop the 'STATUS' column from the dataframe in place (the CSV file itself is not modified)
sales_df.drop(columns= ['STATUS'], inplace = True)
sales_df

In [None]:
barplot_visualization('PRODUCTLINE')

In [None]:
barplot_visualization('DEALSIZE')

In [None]:
# Function to add dummy variables to replace categorical variables

def dummies(x):
  """Replace a categorical column of ``sales_df`` with one-hot indicator columns.

  The column ``x`` is removed from the global ``sales_df`` in place; the
  returned dataframe holds the remaining columns followed by the indicator
  columns produced by ``pd.get_dummies``.

  Args:
    x: Name of the categorical column to encode.

  Returns:
    A new dataframe: ``sales_df`` without ``x``, plus the indicator columns.
  """
  indicator_columns = pd.get_dummies(sales_df[x])
  # Side effect: the global frame loses the original column
  sales_df.drop(x, axis = 1, inplace = True)
  combined = pd.concat([sales_df, indicator_columns], axis = 1)
  return combined

In [None]:
# Let's obtain dummy variables for the column 'COUNTRY'
sales_df = dummies('COUNTRY')
sales_df

In [None]:
sales_df = dummies('PRODUCTLINE')

In [None]:
sales_df

In [None]:
sales_df = dummies('DEALSIZE')
sales_df

In [None]:
y = pd.Categorical(sales_df['PRODUCTCODE'])

y

In [None]:
y = pd.Categorical(sales_df['PRODUCTCODE']).codes
y

In [None]:
# Since the number of unique product codes is 109, adding one-hot variables would
# create 109 additional columns. We can avoid that by using categorical encoding.
# This is not the optimal way of dealing with it, but it's important to avoid the curse of dimensionality.

sales_df['PRODUCTCODE'] = pd.Categorical(sales_df['PRODUCTCODE']).codes

In [None]:
sales_df

In [None]:
sales_df

In [None]:
# Group data by order date
sales_df_group = sales_df.groupby(by = "ORDERDATE").sum()
sales_df_group

We try to discover when sales generally peak (which month).

In [None]:
fig = px.line(x = sales_df_group.index, y = sales_df_group.SALES, title = 'Sales')
fig.show()

In [None]:
# We can drop 'ORDERDATE' and keep the rest of the date-related data such as 'MONTH'

sales_df.drop("ORDERDATE", axis = 1, inplace = True)
sales_df.shape

# Plot the correlation matrix between variables

In [None]:
plt.figure(figsize = (20, 20))
corr_matrix = sales_df.iloc[:, :10].corr()
sns.heatmap(corr_matrix, annot = True, cbar = False)

In [None]:
# It looks like the Quarter ID and the monthly IDs are highly correlated
# Let's drop 'QTR_ID' (or 'MONTH_ID') 
sales_df.drop("QTR_ID", axis = 1, inplace = True)
sales_df.shape

#### Let's plot distplots
 - Distplot shows the (1) histogram, (2) kde plot and (3) rug plot.
  1. Histogram: it's a graphical display of data using bars with various heights. Each bar groups numbers into ranges and taller bars show that more data falls in that range.
  2. Kde Plot: Kernel Density Estimate is used for visualizing the Probability Density of a continuous variable.
  3. Rug plot: plot of data for a single quantitative variable, displayed as marks along an axis (one-dimensional scatter plot).

In [None]:
import plotly.figure_factory as ff

# Draw one interactive distplot per column among the first eight columns.
# Plotly creates its own figures, so no matplotlib figure is opened here.
for column in sales_df.columns[:8]:
  # All columns except 'ORDERLINENUMBER'
  if column != 'ORDERLINENUMBER':
    # A single group is plotted: one numeric series with the label 'distplot'
    fig = ff.create_distplot([sales_df[column].astype(float)], ['distplot'])
    fig.update_layout(title_text = column)
    fig.show()

In [None]:
# Visualize the relationship between variables using an interactive scatter matrix (pairplot).
# The figure size is set through update_layout below; Plotly does not draw
# into matplotlib figures, so none is created here.
fig = px.scatter_matrix(sales_df,
    dimensions = sales_df.columns[:8], color = 'MONTH_ID')

fig.update_layout(
    title = 'Sales Data',
    width = 1300,
    height = 1300,
)
fig.show()

- A trend exists between 'SALES' and 'QUANTITYORDERED'  
- A trend exists between 'MSRP' and 'PRICEEACH'  
- A trend exists between 'PRICEEACH' and 'SALES'
- It seems that sales growth exists as we move from 2013 to 2014 to 2015 ('SALES' vs. 'YEAR_ID')
- Zoom in on 'SALES' vs. 'QUANTITYORDERED' to see the monthly information color-coded on the graph

In [None]:
sales_df

In [None]:
# Scale the data
scaler = StandardScaler()
sales_df_scaled = scaler.fit_transform(sales_df)

In [None]:
sales_df_scaled.shape

In [None]:
# Elbow method: fit k-means for every k in range_values and record the inertia of each fit
scores = []

range_values = range(1, 15)

for i in range_values:
  kmeans = KMeans(n_clusters = i)
  kmeans.fit(sales_df_scaled)
  scores.append(kmeans.inertia_) # inertia is the sum of squared distances of samples to their closest cluster center

# Plot against the actual cluster counts so the x-axis reads k (1..14)
# instead of the list index (0..13)
plt.plot(range_values, scores, 'bx-')
plt.title('Finding right number of clusters')
plt.xlabel('Clusters')
plt.ylabel('scores') 
plt.show()

- From this we can observe that the elbow of the curve seems to form at around 5 clusters.
- Note that the curve will change every time we run the cell

# APPLY K-MEANS METHOD

In [None]:
# Cluster the data using k-means
kmeans = KMeans(5)
kmeans.fit(sales_df_scaled)
labels = kmeans.labels_

In [None]:
labels

In [None]:
kmeans.cluster_centers_.shape

In [None]:
# Let's take a look at the cluster centers (one row per cluster, one column per feature)
# Pass the column Index directly: wrapping it in a list would build a one-level MultiIndex header
cluster_centers = pd.DataFrame(data = kmeans.cluster_centers_, columns = sales_df.columns)
cluster_centers 

In [None]:
# In order to understand what these numbers mean, let's perform the inverse transformation
# to bring the centers back to the units of the unscaled features

cluster_centers = scaler.inverse_transform(cluster_centers)
# Use the column Index itself (not a list containing it) to keep a flat header
cluster_centers = pd.DataFrame(data = cluster_centers, columns = sales_df.columns)
cluster_centers

In [None]:
labels.shape

In [None]:
labels.max()

In [None]:
labels.min()

In [None]:
# Assign every sample to its nearest centroid of the already-fitted model.
# predict() is used rather than fit_predict() so the estimator is not re-trained
# from a new random start, which keeps y_kmeans consistent with `labels`.
y_kmeans = kmeans.predict(sales_df_scaled)
y_kmeans

In [None]:
y_kmeans.shape

In [None]:
# Add a label (which cluster) corresponding to each data point
sale_df_cluster = pd.concat([sales_df, pd.DataFrame({'cluster':labels})], axis = 1)
sale_df_cluster

In [None]:
# Cast the column to float with a vectorized conversion instead of a per-element lambda
sales_df['ORDERLINENUMBER'] = sales_df['ORDERLINENUMBER'].astype(float)

In [None]:
# One figure per feature (first eight columns), with one histogram panel per cluster

for feature in sales_df.columns[:8]:
  plt.figure(figsize = (30, 6))
  for position, cluster_id in enumerate(range(5), start = 1):
    plt.subplot(1, 5, position)
    # Rows of this cluster only, restricted to the current feature
    in_cluster = sale_df_cluster['cluster'] == cluster_id
    sale_df_cluster.loc[in_cluster, feature].hist()
    plt.title('{}    \nCluster - {} '.format(feature, cluster_id))

  plt.show()

# APPLY PRINCIPAL COMPONENT ANALYSIS AND VISUALIZE THE RESULTS

In [None]:
# Reduce the original data to 3 dimensions using PCA for visualizig the clusters

pca = PCA(n_components = 2)
principal_comp = pca.fit_transform(sales_df_scaled)
principal_comp

In [None]:
pca_df = pd.DataFrame(data = principal_comp, columns = ['pca1', 'pca2'])
pca_df.head()

In [None]:
# Concatenate the clusters labels to the dataframe

pca_df = pd.concat([pca_df, pd.DataFrame({'cluster':labels})], axis = 1)
pca_df

In [None]:
# Visualize clusters using 3D-Scatterplot
fig = px.scatter_3d(pca_df, x = 'pca1', y = 'pca2', z = 'pca3', 
                    color = 'cluster', symbol = 'cluster', size_max = 18, opacity = 0.7)

fig.update_layout(margin = dict(l = 0, r = 0, b = 0, t = 0))

In [None]:
# Lets visualize it In 2D Dimension
ax = sns.scatterplot(x = 'pca1', y = 'pca2', hue= 'cluster', data = pca_df, palette =['red', 'green', 'blue','pink','yellow'] )

# APPLY AUTOENCODERS (DIMENSIONALITY REDUCTION USING AUTOENCODERS)

In [None]:
sales_df.shape

In [None]:
# Glorot Uniform initializer: https://keras.rstudio.com/reference/initializer_glorot_uniform.html

# Take the input/output width from the data instead of hard-coding it, so the
# model keeps matching the feature matrix if the number of columns changes
n_features = sales_df_scaled.shape[1]

# Encoder: widen, then compress to an 8-unit bottleneck
input_df = Input(shape = (n_features,))
x = Dense(50, activation = 'relu')(input_df)
x = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)
x = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)
x = Dense(2000, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)
encoded = Dense(8, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)

# Decoder: expand the bottleneck back to the original number of features
x = Dense(2000, activation = 'relu', kernel_initializer = 'glorot_uniform')(encoded)
x = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(x)
decoded = Dense(n_features, kernel_initializer = 'glorot_uniform')(x)

# autoencoder
autoencoder = Model(input_df, decoded)

# encoder - used for dimensionality reduction
encoder = Model(input_df, encoded)

autoencoder.compile(optimizer = 'adam', loss='mean_squared_error')

In [None]:
# Train on the standardized features: the encoder is applied to sales_df_scaled
# afterwards, so the network must be trained on data with the same scaling
autoencoder.fit(sales_df_scaled, sales_df_scaled, batch_size = 128, epochs = 500, verbose = 3)

In [None]:
autoencoder.save_weights('autoencoder_1.h5')

In [None]:
pred = encoder.predict(sales_df_scaled)

In [None]:
# Elbow method on the encoded data: fit k-means for every k in range_values and record the inertia
scores = []

range_values = range(1, 15)

for i in range_values:
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(pred)
    scores.append(kmeans.inertia_)

# Plot against the actual cluster counts so the x-axis reads k (1..14)
# instead of the list index (0..13)
plt.plot(range_values, scores, 'bx-')
plt.title('Finding right number of clusters')
plt.xlabel('Clusters')
plt.ylabel('scores') 
plt.show()

In [None]:
# Cluster the encoder output (`pred`) with k-means, using 3 as the KMeans argument
kmeans = KMeans(3)
kmeans.fit(pred)
labels = kmeans.labels_  # cluster assignments obtained from the encoded data
# NOTE(review): fit_predict re-fits this same estimator on sales_df_scaled, so from
# here on kmeans.cluster_centers_ belong to that second fit, not to the clustering
# stored in `labels` — confirm this is intended
y_kmeans = kmeans.fit_predict(sales_df_scaled)

In [None]:
df_cluster_dr = pd.concat([sales_df, pd.DataFrame({'cluster':labels})], axis = 1)
df_cluster_dr.head()

In [None]:
# Cluster centers expressed in the standardized feature space.
# `labels` were obtained by clustering the encoded data, so the centers that
# match them are the per-cluster means of the scaled features; the centers
# stored on `kmeans` come from a separate fit on sales_df_scaled.
cluster_centers = pd.DataFrame(data = sales_df_scaled, columns = sales_df.columns).groupby(labels).mean()
cluster_centers 

In [None]:
# Bring the centers back to the units of the unscaled features
cluster_centers = scaler.inverse_transform(cluster_centers)
# Use the column Index itself (not a list containing it) to keep a flat header
cluster_centers = pd.DataFrame(data = cluster_centers, columns = sales_df.columns)

cluster_centers

In [None]:

# plot histogram for each feature based on cluster 
for i in sales_df.columns[:8]:
  plt.figure(figsize = (30, 6))
  for j in range(3):
    plt.subplot(1, 3, j+1)
    cluster = df_cluster_dr[df_cluster_dr['cluster'] == j]
    cluster[i].hist()
    plt.title('{}    \nCluster - {} '.format(i,j))

  # Show once per feature, after all three panels are drawn; showing inside the
  # inner loop would render every panel as a separate figure
  plt.show()

In [None]:
# Reduce the original data to 3 dimensions using PCA to visualize the clusters
pca = PCA(n_components = 3)
prin_comp = pca.fit_transform(sales_df_scaled)
pca_df = pd.DataFrame(data = prin_comp, columns = ['pca1', 'pca2','pca3'])
pca_df.head()

In [None]:
pca_df = pd.concat([pca_df, pd.DataFrame({'cluster':labels})], axis = 1)
pca_df.head()

In [None]:
# Visualize clusters using 3D-Scatterplot
fig = px.scatter_3d(pca_df, x = 'pca1', y = 'pca2', z = 'pca3',
              color='cluster', symbol = 'cluster', size_max = 10, opacity = 0.7)
fig.update_layout(margin = dict(l = 0, r = 0, b = 0, t = 0))

In [None]:
# Lets visualize it In 2D Dimension
ax = sns.scatterplot(x = 'pca1', y = 'pca2', hue= 'cluster', data = pca_df, palette =['red', 'green', 'blue'] )