In [1]:
import numpy as np
import pandas as pd

### Import dataset

In [2]:
df_raw = pd.read_csv('../Data/walmart_cleaned.csv')
df_raw.drop('Unnamed: 0', axis=1, inplace=True)
df_raw.head(5)

Unnamed: 0,Store,Date,IsHoliday,Dept,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,2010-02-05,0,1.0,24924.5,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3,151315
1,1,2010-02-05,0,26.0,11737.12,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3,151315
2,1,2010-02-05,0,17.0,13223.76,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3,151315
3,1,2010-02-05,0,45.0,37.44,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3,151315
4,1,2010-02-05,0,28.0,1085.29,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3,151315


### pre-processing

In [3]:
print(df_raw.shape)
print(df_raw.isnull().sum())

df = df_raw.copy()

(421570, 16)
Store           0
Date            0
IsHoliday       0
Dept            0
Weekly_Sales    0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
Type            0
Size            0
dtype: int64


##### Basic stats

In [4]:
all_columns = df.columns
print(f'All columns: {all_columns}')

unwanted_columns = ['Store', 'Date', 'Dept']
feature_columns = [col for col in all_columns if col not in unwanted_columns]
print(f'Feature Columns: {feature_columns}')

df.loc[:, feature_columns].describe()

All columns: Index(['Store', 'Date', 'IsHoliday', 'Dept', 'Weekly_Sales', 'Temperature',
       'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
       'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size'],
      dtype='object')
Feature Columns: ['IsHoliday', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size']


Unnamed: 0,IsHoliday,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
count,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0
mean,0.070358,15981.258123,60.090059,3.361027,2590.074819,879.974298,468.087665,1083.132268,1662.772385,171.201947,7.960289,2.410088,136727.915739
std,0.25575,22711.183519,18.447931,0.458515,6052.385934,5084.538801,5528.873453,3894.529945,4207.629321,39.159276,1.863296,0.666337,60980.583328
min,0.0,-4988.94,-2.06,2.472,0.0,-265.76,-29.1,0.0,0.0,126.064,3.879,1.0,34875.0
25%,0.0,2079.65,46.68,2.933,0.0,0.0,0.0,0.0,0.0,132.022667,6.891,2.0,93638.0
50%,0.0,7612.03,62.09,3.452,0.0,0.0,0.0,0.0,0.0,182.31878,7.866,3.0,140167.0
75%,0.0,20205.8525,74.28,3.738,2809.05,2.2,4.54,425.29,2168.04,212.416993,8.572,3.0,202505.0
max,1.0,693099.36,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,227.232807,14.313,3.0,219622.0


In [5]:
print(df['Store'].nunique())
print(df['Size'].nunique())


45
40


##### Correlation

In [6]:
df_corr = df.loc[:, feature_columns].corr()
df_corr.style.applymap(lambda x: 'background-color: yellow' if x > 0.7 else '')

Unnamed: 0,IsHoliday,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
IsHoliday,1.0,0.012774,-0.155949,-0.078281,-0.003521,0.207604,0.266471,0.011565,-0.015235,-0.001944,0.01046,0.000798,0.000593
Weekly_Sales,0.012774,1.0,-0.002312,-0.00012,0.047172,0.020716,0.038562,0.037467,0.050465,-0.020921,-0.025864,0.182242,0.243828
Temperature,-0.155949,-0.002312,1.0,0.143859,-0.026415,-0.179672,-0.056026,-0.050281,-0.014752,0.182112,0.09673,-0.042981,-0.058313
Fuel_Price,-0.078281,-0.00012,0.143859,1.0,0.297056,0.029153,0.018615,0.166622,0.21542,-0.16421,-0.033853,-0.029687,0.003361
MarkDown1,-0.003521,0.047172,-0.026415,0.297056,1.0,0.174868,-0.014411,0.838904,0.41505,0.010915,-0.105168,0.126345,0.169788
MarkDown2,0.207604,0.020716,-0.179672,0.029153,0.174868,1.0,-0.00608,0.11325,0.131735,-0.003554,-0.041427,0.055318,0.078372
MarkDown3,0.266471,0.038562,-0.056026,0.018615,-0.014411,-0.00608,1.0,-0.01202,0.042471,-0.005839,-0.018078,0.024013,0.033641
MarkDown4,0.011565,0.037467,-0.050281,0.166622,0.838904,0.11325,-0.01202,1.0,0.30337,-0.002047,-0.076513,0.08977,0.127334
MarkDown5,-0.015235,0.050465,-0.014752,0.21542,0.41505,0.131735,0.042471,0.30337,1.0,0.067906,-0.120406,0.130125,0.153011
CPI,-0.001944,-0.020921,0.182112,-0.16421,0.010915,-0.003554,-0.005839,-0.002047,0.067906,1.0,-0.299953,0.065812,-0.003314


Correlation is there between Type and Size mainly because Type is derived from Size

### Split into train-test set

In [7]:
n = df.shape[0] // 10

df_population = df.loc[:, all_columns].copy()

df_test = df_population.sample(n=n)
df_train = df_population.drop(df_test.index)

df_train.reset_index(drop=True, inplace=True)

print(f'Train size: {df_train.shape}')
print(f'Test size: {df_test.shape}')

Train size: (379413, 16)
Test size: (42157, 16)


### Setup

In [8]:
from pycaret.anomaly import *
exp_ano = setup(df, normalize = True, session_id = 301)

Unnamed: 0,Description,Value
0,session_id,301
1,Original Data,"(421570, 16)"
2,Missing Values,False
3,Numeric Features,13
4,Categorical Features,2
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(421570, 35)"
9,CPU Jobs,-1


In [9]:
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


### Create IForest and KNN models

In [10]:
iforest = create_model('iforest')


In [11]:
knn = create_model('knn')

### Assign and plot both models

In [12]:
assigned_if = assign_model(iforest)
assigned_knn = assign_model(knn)

In [13]:
assigned_if.head(5)
assigned_if['Anomaly'].value_counts()

0    400494
1     21076
Name: Anomaly, dtype: int64

In [14]:
assigned_knn.head(5)
assigned_knn['Anomaly'].value_counts()

0    400491
1     21079
Name: Anomaly, dtype: int64

### Predict with test set

In [None]:
predict_model(iforest, data=df_test)