## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('/Users/shreyas/Desktop/Maersk/Maersk/data/train.csv')
test = pd.read_csv('/Users/shreyas/Desktop/Maersk/Maersk/data/test.csv')
df = pd.concat([train, test], axis=0)

In [3]:
df.head()

Unnamed: 0,ProductType,Manufacturer,Area Code,Sourcing Channel,Product Size,Product Type,Month of Sourcing,Sourcing Cost
0,NTM3,X1,A28,WHOLESALE,Large,Powder,May-21,10.16
1,NTM2,X1,A9,DIRECT,Large,Powder,Oct-20,134.28
2,NTM3,X2,A20,DIRECT,Large,Powder,Dec-20,12.46
3,NTM3,X1,A18,WHOLESALE,Small,Powder,Feb-21,107.22
4,NTM2,X1,A28,DIRECT,Large,Liquid,Nov-20,197.76


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 550272 entries, 0 to 95
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ProductType        550272 non-null  object 
 1   Manufacturer       550272 non-null  object 
 2   Area Code          550272 non-null  object 
 3   Sourcing Channel   550272 non-null  object 
 4   Product Size       550272 non-null  object 
 5   Product Type       550272 non-null  object 
 6   Month of Sourcing  550272 non-null  object 
 7   Sourcing Cost      550272 non-null  float64
dtypes: float64(1), object(7)
memory usage: 37.8+ MB


In [5]:
df.nunique()

ProductType             3
Manufacturer            3
Area Code              45
Sourcing Channel        4
Product Size            3
Product Type            2
Month of Sourcing      12
Sourcing Cost        4596
dtype: int64

In [6]:
df.duplicated().sum()

542796

In [7]:
df_unique = df.drop_duplicates(keep='first')
df_unique.count()

ProductType          7476
Manufacturer         7476
Area Code            7476
Sourcing Channel     7476
Product Size         7476
Product Type         7476
Month of Sourcing    7476
Sourcing Cost        7476
dtype: int64

In [8]:
df.columns

Index(['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel',
       'Product Size', 'Product Type', 'Month of Sourcing', 'Sourcing Cost'],
      dtype='object')

## LabelEncoding columns

In [9]:
from sklearn.preprocessing import LabelEncoder
lenc = LabelEncoder()

In [10]:
df['ProdType_enc'] = lenc.fit_transform(df['ProductType'])

In [11]:
df['Manufacturer_enc'] = lenc.fit_transform(df['Manufacturer'])

In [12]:
df.rename(columns={'Area Code': 'AreaCode'}, inplace=True)
df['AreaCode_enc'] = lenc.fit_transform(df['AreaCode'])

In [13]:
df.rename(columns={'Sourcing Channel': 'SourcingChannel'}, inplace=True)
df['SrcChnl_enc'] = lenc.fit_transform(df['SourcingChannel'])

In [14]:
df.rename(columns={'Product Size': 'ProductSize'}, inplace=True)
df['ProdSize_enc'] = lenc.fit_transform(df['ProductSize'])

In [15]:
df.rename(columns={'Month of Sourcing': 'MonthofSourcing'}, inplace=True)
month_mapping = {
    "Jul-20": 0,
    "Aug-20": 1,
    "Sep-20": 2,
    "Oct-20": 3,
    "Nov-20": 4,
    "Dec-20": 5,
    "Jan-21": 6,
    "Feb-21": 7,
    "Mar-21": 8,
    "Apr-21": 9,
    "May-21": 10,
    "Jun-21": 11
}

df['SrcMonth_enc'] = df['MonthofSourcing'].map(month_mapping)

In [16]:
df.rename(columns={'Sourcing Cost': 'SourcingCost'}, inplace=True)

In [17]:
df.columns

Index(['ProductType', 'Manufacturer', 'AreaCode', 'SourcingChannel',
       'ProductSize', 'Product Type', 'MonthofSourcing', 'SourcingCost',
       'ProdType_enc', 'Manufacturer_enc', 'AreaCode_enc', 'SrcChnl_enc',
       'ProdSize_enc', 'SrcMonth_enc'],
      dtype='object')

In [18]:
df.head()

Unnamed: 0,ProductType,Manufacturer,AreaCode,SourcingChannel,ProductSize,Product Type,MonthofSourcing,SourcingCost,ProdType_enc,Manufacturer_enc,AreaCode_enc,SrcChnl_enc,ProdSize_enc,SrcMonth_enc
0,NTM3,X1,A28,WHOLESALE,Large,Powder,May-21,10.16,2,0,19,3,1,10
1,NTM2,X1,A9,DIRECT,Large,Powder,Oct-20,134.28,1,0,44,0,1,3
2,NTM3,X2,A20,DIRECT,Large,Powder,Dec-20,12.46,2,1,12,0,1,5
3,NTM3,X1,A18,WHOLESALE,Small,Powder,Feb-21,107.22,2,0,9,3,2,7
4,NTM2,X1,A28,DIRECT,Large,Liquid,Nov-20,197.76,1,0,19,0,1,4


In [19]:
label_mappings = {}

for col in df.columns:
    if df[col].dtype == 'object':
        if col != 'MonthofSourcing':
            unique_values = df[col].unique()
            encoded_labels = lenc.fit_transform(unique_values)
        else:
            unique_values = ["Jul-20", "Aug-20", "Sep-20", "Oct-20", "Nov-20", "Dec-20", "Jan-21", "Feb-21", "Mar-21", "Apr-21", "May-21", "Jun-21"]
            encoded_labels = labels = list(range(len(unique_values)))
        label_mappings[col] = []
        for i, j in zip(unique_values, encoded_labels):
            label_mappings[col].append([i, j])

print("Label Mappings:")
print(label_mappings)

Label Mappings:
{'ProductType': [['NTM3', 2], ['NTM2', 1], ['NTM1', 0]], 'Manufacturer': [['X1', 0], ['X2', 1], ['X3', 2]], 'AreaCode': [['A28', 19], ['A9', 44], ['A20', 12], ['A18', 9], ['A10', 1], ['A19', 10], ['A29', 20], ['A7', 42], ['A2', 11], ['A8', 43], ['A4', 32], ['A6', 41], ['A30', 22], ['A35', 27], ['A44', 37], ['A45', 38], ['A31', 23], ['A25', 17], ['A37', 29], ['A32', 24], ['A34', 26], ['A46', 39], ['A11', 2], ['A39', 31], ['A41', 34], ['A17', 8], ['A38', 30], ['A5', 40], ['A22', 14], ['A3', 21], ['A12', 3], ['A24', 16], ['A36', 28], ['A42', 35], ['A14', 5], ['A43', 36], ['A33', 25], ['A15', 6], ['A40', 33], ['A21', 13], ['A16', 7], ['A13', 4], ['A1', 0], ['A23', 15], ['A26', 18]], 'SourcingChannel': [['WHOLESALE', 3], ['DIRECT', 0], ['RETAIL', 2], ['ECOM', 1]], 'ProductSize': [['Large', 1], ['Small', 2], ['ExtraLarge', 0]], 'Product Type': [['Powder', 1], ['Liquid', 0]], 'MonthofSourcing': [['Jul-20', 0], ['Aug-20', 1], ['Sep-20', 2], ['Oct-20', 3], ['Nov-20', 4], ['Dec-2

In [20]:
import json
import pandas as pd

label_mappings = {
    key: [[val[0], int(val[1])] for val in values]
    for key, values in label_mappings.items()
}

filename = 'label_mapping.json'

with open(filename, 'w') as f:
    json.dump(label_mappings, f, indent=4)

print("Label Mapping Saved to:", filename)

Label Mapping Saved to: label_mapping.json


## Encoded Dataset

In [21]:
df_enc = df[['ProdType_enc', 'Manufacturer_enc', 'AreaCode_enc', 'SrcChnl_enc', 'ProdSize_enc', 'SrcMonth_enc', 'SourcingCost']]

In [22]:
df_enc.head()

Unnamed: 0,ProdType_enc,Manufacturer_enc,AreaCode_enc,SrcChnl_enc,ProdSize_enc,SrcMonth_enc,SourcingCost
0,2,0,19,3,1,10,10.16
1,1,0,44,0,1,3,134.28
2,2,1,12,0,1,5,12.46
3,2,0,9,3,2,7,107.22
4,1,0,19,0,1,4,197.76


In [23]:
df_enc.to_csv('df_enc.csv', index=False)