In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

In [2]:
# Folder that collects every clustering result produced by this notebook
directory = "clustering"

# Report whether the folder was already there; create it only when missing
if os.path.exists(directory):
    print("Directory already exists")
else:
    os.mkdir(directory)
    print("Directory created successfully")

Directory already exists


In [None]:
# Read the job-order tables of group A and group B from the working directory
jobOrder_A, jobOrder_B = (
    pd.read_csv(csv_name) for csv_name in ("jobOrder_A.csv", "jobOrder_B.csv")
)

In [4]:
# Columns that are removed from both groups before clustering
not_relevantAB = [
    "LAUNCH_DATE",
    "PRIMARY_KEY",
    "MATERIAL_CODE",
    "CodFam",
    "CodSubFam",
]

# Remove them from each frame in place (the frames themselves are modified)
for frame in (jobOrder_A, jobOrder_B):
    frame.drop(columns=not_relevantAB, inplace=True)

In [5]:
# These 2 columns do not apply to group A, so they are removed from it as well
not_relevantA = ["CYLINDER", "POWER_AXIS"]
jobOrder_A.drop(labels=not_relevantA, axis="columns", inplace=True)

In [6]:
# Scaler used to standardise the features so they work better with the algorithm
scaler = StandardScaler()

# DBSCAN configuration; the same estimator object is used for both groups
dbscan = DBSCAN(min_samples=100, eps=0.4)

In [7]:
# Standardise the features of each group independently. The same scaler object
# is re-fitted on every call, so after this cell it holds group B's statistics.
# NOTE: both names are rebound to the scaled output (a NumPy array with
# scikit-learn's default output settings), so the original DataFrames are no
# longer reachable under these names; later cells reload the CSVs to get them back.
jobOrder_A = scaler.fit_transform(jobOrder_A)
jobOrder_B = scaler.fit_transform(jobOrder_B)

In [8]:
# Fit DBSCAN on the scaled group A features and keep one cluster label per row.
# DBSCAN marks rows it cannot assign to any cluster (noise) with the label -1.
clusters = dbscan.fit_predict(jobOrder_A)

In [9]:
# The final file must contain the original data so the company can interpret it.
# We therefore reload the CSV and drop the columns that are not part of group A
# together with the encoded category columns that were created for clustering.
# The encoded column list gets its own name: rebinding `not_relevantAB` here
# would silently change which columns the earlier drop cell removes if that
# cell is ever run again.
encoded_cols = ["category_material", "category_fam", "category_subfam"]
jobOrder_A = pd.read_csv("jobOrder_A.csv").drop(
    columns=["CYLINDER", "POWER_AXIS"] + encoded_cols
)

# Finally, add the cluster to which each row belongs. `clusters` is assigned by
# position, so it must come from the fit on this same file, in the same row order.
jobOrder_A['Cluster'] = clusters
jobOrder_A.head()

Unnamed: 0,PRIMARY_KEY,MATERIAL_CODE,BASE_RADIUS,POWER,DIAMETER,CENTER_THK,LAUNCH_DATE,CodFam,CodSubFam,month,Cluster
0,FO8973_1_0,3201.0,8.0,-3.5,14.5,0.1,2020-01-02 10:07:00,1.0,5.0,1,0
1,FO9775_2_0,3201.0,8.3,-1.25,14.5,0.12,2020-01-02 05:57:00,1.0,5.0,1,1
2,FO9799_2_0,3201.0,8.3,-2.25,14.5,0.1,2020-01-02 05:25:00,1.0,5.0,1,1
3,FO9799_3_0,3201.0,8.3,-2.25,14.5,0.1,2020-01-02 05:57:00,1.0,5.0,1,1
4,FO9958_2_0,3201.0,8.9,-3.75,15.0,0.1,2020-01-02 09:25:00,1.0,5.0,1,2


In [10]:
# Number of group A rows per cluster label, most frequent first
# (label -1 is DBSCAN's noise label, i.e. rows assigned to no cluster)
jobOrder_A["Cluster"].value_counts()

-1      219630
 227     68883
 222     61494
 232     40211
 224     36506
         ...  
 292        64
 213        62
 365        54
 379        53
 373        30
Name: Cluster, Length: 383, dtype: int64

In [11]:
# Persist the labelled group A rows; this file is later used to find
# "made to stock" combinations
clusterA_path = "clustering/clusterA_dbscan.csv"
jobOrder_A.to_csv(clusterA_path, header=True)

In [12]:
# Fit DBSCAN on the scaled group B features and keep one cluster label per row.
# This overwrites the `clusters` array that held the group A labels.
clusters = dbscan.fit_predict(jobOrder_B)

In [13]:
# The final file must contain the original data so the company can interpret it.
# We therefore reload the CSV and drop the encoded category columns that were
# created for clustering.
# The encoded column list gets its own name: rebinding `not_relevantAB` here
# would silently change which columns the earlier drop cell removes if that
# cell is ever run again.
encoded_cols = ["category_material", "category_fam", "category_subfam"]
jobOrder_B = pd.read_csv("jobOrder_B.csv").drop(columns=encoded_cols)

# Finally, add the cluster to which each row belongs. `clusters` is assigned by
# position, so it must come from the fit on this same file, in the same row order.
jobOrder_B['Cluster'] = clusters
jobOrder_B.head()

Unnamed: 0,PRIMARY_KEY,MATERIAL_CODE,BASE_RADIUS,POWER,CYLINDER,POWER_AXIS,DIAMETER,CENTER_THK,LAUNCH_DATE,CodFam,CodSubFam,month,Cluster
0,CH6997_5_0,1001.0,8.7,9.0,-0.75,180.0,14.4,0.4268,2020-11-09 15:00:00,1.0,3.0,11,16
1,CH7603_9_0,1001.0,8.7,4.0,-3.25,50.0,14.4,0.3041,2020-11-09 15:00:00,1.0,3.0,11,2
2,CI1456_9_0,1001.0,8.7,0.5,-4.25,180.0,14.4,0.1901,2020-11-09 15:00:00,1.0,3.0,11,0
3,CI1456_A_0,1001.0,8.7,0.5,-4.25,180.0,14.4,0.1901,2020-11-09 15:00:00,1.0,3.0,11,0
4,CI1456_B_0,1001.0,8.7,0.5,-4.25,180.0,14.4,0.1901,2020-11-09 15:00:00,1.0,3.0,11,0


In [14]:
# Number of group B rows per cluster label, most frequent first
# (label -1 is DBSCAN's noise label, i.e. rows assigned to no cluster)
jobOrder_B["Cluster"].value_counts()

-1      1776380
 324     100843
 319      80187
 8        66335
 321      64484
         ...   
 427         31
 519         30
 516         29
 511         27
 402         14
Name: Cluster, Length: 588, dtype: int64

In [15]:
# Persist the labelled group B rows; this file is later used to find
# "made to stock" combinations
clusterB_path = "clustering/clusterB_dbscan.csv"
jobOrder_B.to_csv(clusterB_path, header=True)