In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [None]:
# Read the transactions workbook into a DataFrame and preview the first rows
excel_path = 'Online Retail.xlsx'
df = pd.read_excel(excel_path)
df.head()

In [None]:
# (rows, columns) of the frame as loaded
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Convert the invoice timestamp column to pandas datetime values
invoice_dates = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = invoice_dates
df['InvoiceDate']

In [None]:
# Split the invoice timestamp into calendar parts through the .dt accessor
invoice_dt = df['InvoiceDate'].dt
df['Invoice_Year'] = invoice_dt.year
df['Invoice_Month'] = invoice_dt.month
df['Invoice_Day'] = invoice_dt.day
df['Invoice_WeekDay'] = invoice_dt.weekday  # integer day of week, Monday=0 ... Sunday=6

In [None]:
# Line total: quantity multiplied by unit price
df['Total_Price'] = df['Quantity'].mul(df['UnitPrice'])
df.shape

In [None]:
# Preview the frame including the derived date-part and Total_Price columns
df.head()

In [None]:
# Keep only rows whose quantity and unit price are both strictly positive.
# Filtering on Total_Price > 0 alone is not equivalent: it would also keep
# rows where quantity and unit price are both negative.
has_positive_quantity = df['Quantity'] > 0
has_positive_price = df['UnitPrice'] > 0
df = df[has_positive_quantity & has_positive_price]
df.shape

In [None]:
# Feature frame: drop the invoice/stock/customer identifiers and the raw timestamp
unused_columns = ['InvoiceNo','StockCode','InvoiceDate','CustomerID']
X = df.drop(columns=unused_columns)

In [None]:
# Display the feature frame
X

# EDA 

In [None]:
# Correlation between the numeric features.
# X still contains the object-dtype columns (Description, Country) at this point;
# pandas >= 2.0 raises on them in corr() instead of silently dropping them, so
# only the numeric columns are passed in.
plt.figure(figsize=(9,5),dpi=150)
numeric_corr = X.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr,annot=True,cmap='viridis',vmin=-1,vmax=1)

Total price and quantity are correlated. Let's look into it.

In [None]:
# Scatter of quantity against line total to inspect the correlated pair
sns.scatterplot(
    x='Quantity',
    y='Total_Price',
    data=X,
)

# Numerical Features

In [None]:
# Every non-object column of the feature frame is treated as numerical
numerical_features = X.select_dtypes(exclude='O').columns
print(numerical_features)

# Distinct-value count per numerical column, read from X (the frame the column
# list was derived from) rather than from df. dropna=False counts NaN as a
# value, matching len(Series.unique()).
numerical_features_count = {column: X[column].nunique(dropna=False) for column in numerical_features}
numerical_features_count

## Discrete features

In [None]:
# Numerical columns with fewer than 10 distinct values are treated as discrete.
# Counts are read from X (where numerical_features came from) rather than df;
# dropna=False counts NaN as a value, matching len(Series.unique()).
discrete_features = [column for column in numerical_features if X[column].nunique(dropna=False) < 10]
discrete_features

In [None]:
# Total_Price by invoice year (sns.barplot aggregates with the mean by default)
sns.barplot(
    data=X,
    x='Invoice_Year',
    y='Total_Price',
)

In [None]:
# Total_Price by weekday (sns.barplot aggregates with the mean by default).
# Tick labels are derived from the weekday codes actually present in X, so a
# missing or additional weekday cannot shift labels onto the wrong bars.
day_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
weekday_codes = sorted(X['Invoice_WeekDay'].unique())  # dt.weekday codes: Monday=0 ... Sunday=6
plot = sns.barplot(x = 'Invoice_WeekDay',y = 'Total_Price',data=X,order=weekday_codes)
plot.set_xticks(range(len(weekday_codes)))  # pin tick positions before relabelling
plot.set_xticklabels([day_names[code] for code in weekday_codes])
plt.show()

# Observation 

1. 2011 has more sales than 2010
2. On Friday we have more sales

# Continuous Variables

In [None]:
# Numerical columns that were not classified as discrete, in their original order
discrete_lookup = set(discrete_features)
continous_features = [name for name in numerical_features if name not in discrete_lookup]
continous_features

In [None]:
# Total_Price by invoice month (sns.barplot aggregates with the mean by default).
# Tick labels are derived from the month numbers actually present in X, so the
# labels stay aligned with the bars even if some month has no rows.
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
month_codes = sorted(X['Invoice_Month'].unique())  # dt.month codes: January=1 ... December=12
plot = sns.barplot(x = 'Invoice_Month',y = 'Total_Price',data=X,ci=None,order=month_codes)
plot.set_xticks(range(len(month_codes)))  # pin tick positions before relabelling
plot.set_xticklabels([month_names[code - 1] for code in month_codes])
plt.show()

In [None]:
# Total_Price by day of the month, without error bars
plt.figure(dpi=100, figsize=(9,5))
sns.barplot(
    data=X,
    x='Invoice_Day',
    y='Total_Price',
    ci=None,
)

# Observation

1. Sales are highest in December
2. Sales are highest on the 14th day of the month

# Categorical Variables

In [None]:
# Object-dtype columns of the feature frame are treated as categorical
categorical_features = X.select_dtypes(include='O').columns
print(categorical_features)

# Distinct-value count per categorical column (NaN counted as a value)
categorical_features_count = X[categorical_features].nunique(dropna=False).to_dict()
categorical_features_count

In [None]:
# Ten product descriptions with the highest mean Total_Price.
# The aggregation is computed once, and x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize=(7,3),dpi=150)
top_descriptions = (
    X.groupby('Description')['Total_Price']
    .mean()
    .sort_values(ascending=False)[:10]
)
sns.barplot(x=top_descriptions.index,y=top_descriptions.values,palette='Set2')
plt.xticks(rotation=90);

In [None]:
# Total_Price by country (mean per row, no error bars), with rotated tick labels
plt.figure(dpi=100, figsize=(9,5))
sns.barplot(
    data=X,
    x='Country',
    y='Total_Price',
    ci=None,
)
plt.xticks(rotation=90)
plt.show()

# Observation

1. The Netherlands has the highest sales
2. Paper Craft Little Birdie is the bestseller

In [None]:
# Re-display the distinct-value counts of the categorical columns before encoding
categorical_features_count

In [None]:
# Description has too many distinct values for one-hot encoding, so it is
# replaced by integer codes from LabelEncoder (values are cast to str first).
# NOTE(review): the integer codes carry no meaningful order or distance,
# which matters for the clustering done later - confirm this is intended.
label_encoder = LabelEncoder()
description_text = X['Description'].astype(str)
X['Description'] = label_encoder.fit_transform(description_text)
X

In [None]:
# One-hot encode Country.
# The column is read from X itself (not df), so the result does not depend on
# the two frames sharing an index, and Country is dropped without mutating X
# in place. Resulting column order is unchanged: remaining features, then one
# indicator column per country.
dummy = pd.get_dummies(X['Country'])
X = pd.concat([X.drop(columns='Country'),dummy],axis=1)
X

In [None]:
# Confirm the encoded feature frame has no missing values left
X.isna().sum()

In [None]:
# Standardise every feature column.
# fit_transform returns a bare array, so the frame is rebuilt with X's column
# names AND X's index. X was row-filtered earlier, so without index=X.index the
# scaled rows would get a fresh 0..n-1 index and no longer line up with X / df.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled,columns=X.columns,index=X.index)
X_scaled

In [None]:
# Elbow method: fit KMeans for k = 1..19 and record the within-cluster sum of
# squares (inertia_) of each fit. KMeans is imported in the notebook's import cell.
wcss = []
for i in range(1,20):
    model = KMeans(n_clusters=i,init='k-means++',random_state=42)
    model.fit(X_scaled)
    wcss.append(model.inertia_)

In [None]:
# Elbow curve: WCSS against the number of clusters.
# The x values are derived from len(wcss) (k started at 1) so the plot cannot
# drift out of sync with the range used when wcss was filled.
k_values = range(1, len(wcss) + 1)
plt.plot(k_values,wcss,marker='o',mfc='r')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (inertia)')
plt.title('Elbow curve')
plt.show()

In [None]:
# Fit the final 5-cluster model and keep the cluster label assigned to each row
model = KMeans(random_state=101, n_clusters=5)
model.fit(X_scaled)
pred = model.labels_

In [None]:
# Silhouette score of the final clustering.
# The score needs pairwise distances between all scored points, which is
# quadratic in the number of rows, so it is computed on a fixed-seed random
# subsample instead of the full frame. silhouette_score is imported in the
# notebook's import cell.
silhouette_sample_size = min(10000, len(X_scaled))
silhouette_score(X_scaled,pred,sample_size=silhouette_sample_size,random_state=42)