## Data Preprocessing & Exploratory Data Analysis

In [151]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [152]:
# Location of the raw training data.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of train.csv.
TRAIN_CSV_PATH = '/Users/shreyas/Desktop/Maersk/Maersk/data/train.csv'

df = pd.read_csv(TRAIN_CSV_PATH)

In [153]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df.head()

Unnamed: 0,ProductType,Manufacturer,Area Code,Sourcing Channel,Product Size,Product Type,Month of Sourcing,Sourcing Cost
0,NTM3,X1,A28,WHOLESALE,Large,Powder,May-21,10.16
1,NTM2,X1,A9,DIRECT,Large,Powder,Oct-20,134.28
2,NTM3,X2,A20,DIRECT,Large,Powder,Dec-20,12.46
3,NTM3,X1,A18,WHOLESALE,Small,Powder,Feb-21,107.22
4,NTM2,X1,A28,DIRECT,Large,Liquid,Nov-20,197.76


In [154]:
# Summarise column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550176 entries, 0 to 550175
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ProductType        550176 non-null  object 
 1   Manufacturer       550176 non-null  object 
 2   Area Code          550176 non-null  object 
 3   Sourcing Channel   550176 non-null  object 
 4   Product Size       550176 non-null  object 
 5   Product Type       550176 non-null  object 
 6   Month of Sourcing  550176 non-null  object 
 7   Sourcing Cost      550176 non-null  float64
dtypes: float64(1), object(7)
memory usage: 33.6+ MB


In [155]:
# Count distinct values per column to gauge the cardinality of each feature.
df.nunique()

ProductType             3
Manufacturer            3
Area Code              45
Sourcing Channel        4
Product Size            3
Product Type            2
Month of Sourcing      11
Sourcing Cost        4529
dtype: int64

In [156]:
# Number of rows that exactly repeat an earlier row (every column equal).
df.duplicated().sum()

542796

In [157]:
# Keep only the first occurrence of each distinct row, then show how many rows remain per column.
# NOTE(review): df_unique is not referenced again in the cells visible here; the
# encoding below keeps working on df with its duplicates — confirm that is intended.
df_unique = df.drop_duplicates(keep='first')
df_unique.count()

ProductType          7380
Manufacturer         7380
Area Code            7380
Sourcing Channel     7380
Product Size         7380
Product Type         7380
Month of Sourcing    7380
Sourcing Cost        7380
dtype: int64

In [158]:
# List the column names as loaded, before any renaming.
df.columns

Index(['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel',
       'Product Size', 'Product Type', 'Month of Sourcing', 'Sourcing Cost'],
      dtype='object')

## Label-encoding the categorical columns

In [159]:
from sklearn.preprocessing import LabelEncoder

# Strip the spaces from column names so they are easier to reference.
# 'Product Type' is left untouched: without its space it would collide with
# the existing 'ProductType' column.
COLUMN_RENAMES = {
    'Area Code': 'AreaCode',
    'Sourcing Channel': 'SourcingChannel',
    'Product Size': 'ProductSize',
    'Month of Sourcing': 'MonthofSourcing',
    'Sourcing Cost': 'SourcingCost',
}

# Categorical source column -> name of the integer-coded column derived from it.
# Insertion order determines the order in which the *_enc columns are appended.
ENCODED_COLUMNS = {
    'ProductType': 'ProdType_enc',
    'Manufacturer': 'Manufacturer_enc',
    'AreaCode': 'AreaCode_enc',
    'SourcingChannel': 'SrcChnl_enc',
    'ProductSize': 'ProdSize_enc',
    'MonthofSourcing': 'SrcMonth_enc',
}

# rename() ignores keys that are absent, so re-running this cell is harmless.
df = df.rename(columns=COLUMN_RENAMES)

# Fit one encoder per column and keep it, so codes can be mapped back to labels
# later with encoders[col].inverse_transform(...). Re-fitting a single shared
# encoder would discard every fitted state except the last one.
# NOTE(review): LabelEncoder numbers labels in sorted order, so the month
# strings (e.g. 'May-21', 'Oct-20') get alphabetical rather than chronological
# codes — confirm that is acceptable for the downstream model.
encoders = {}
for source_col, encoded_col in ENCODED_COLUMNS.items():
    encoder = LabelEncoder()
    df[encoded_col] = encoder.fit_transform(df[source_col])
    encoders[source_col] = encoder

# Shared scratch encoder: a later cell re-fits `lenc` to build the label mappings.
lenc = LabelEncoder()

In [167]:
# Confirm the renamed columns and the newly added *_enc columns.
df.columns

Index(['ProductType', 'Manufacturer', 'AreaCode', 'SourcingChannel',
       'ProductSize', 'Product Type', 'MonthofSourcing', 'SourcingCost',
       'ProdType_enc', 'Manufacturer_enc', 'AreaCode_enc', 'SrcChnl_enc',
       'ProdSize_enc', 'SrcMonth_enc'],
      dtype='object')

In [168]:
# Inspect the original categorical columns alongside their integer codes.
df.head()

Unnamed: 0,ProductType,Manufacturer,AreaCode,SourcingChannel,ProductSize,Product Type,MonthofSourcing,SourcingCost,ProdType_enc,Manufacturer_enc,AreaCode_enc,SrcChnl_enc,ProdSize_enc,SrcMonth_enc
0,NTM3,X1,A28,WHOLESALE,Large,Powder,May-21,10.16,2,0,19,3,1,7
1,NTM2,X1,A9,DIRECT,Large,Powder,Oct-20,134.28,1,0,44,0,1,9
2,NTM3,X2,A20,DIRECT,Large,Powder,Dec-20,12.46,2,1,12,0,1,2
3,NTM3,X1,A18,WHOLESALE,Small,Powder,Feb-21,107.22,2,0,9,3,2,3
4,NTM2,X1,A28,DIRECT,Large,Liquid,Nov-20,197.76,1,0,19,0,1,8


In [169]:
# Build a {column: [[label, code], ...]} lookup for every object-dtype column.
# LabelEncoder numbers labels by their sorted order, so fitting on the distinct
# values alone reproduces the codes obtained from fitting on the full column.
label_mappings = {}

for col in df.columns:
    if df[col].dtype != 'object':
        continue  # numeric columns (the cost and the *_enc codes) need no lookup
    unique_values = df[col].unique()
    encoded_labels = lenc.fit_transform(unique_values)
    # Store built-in ints rather than NumPy integer scalars: json.dump rejects
    # the latter, and they print as np.int64(...) under NumPy 2.
    label_mappings[col] = [
        [label, int(code)] for label, code in zip(unique_values, encoded_labels)
    ]

print("Label Mappings:")
print(label_mappings)

Label Mappings:
{'ProductType': [['NTM3', 2], ['NTM2', 1], ['NTM1', 0]], 'Manufacturer': [['X1', 0], ['X2', 1], ['X3', 2]], 'AreaCode': [['A28', 19], ['A9', 44], ['A20', 12], ['A18', 9], ['A10', 1], ['A19', 10], ['A29', 20], ['A7', 42], ['A2', 11], ['A8', 43], ['A4', 32], ['A6', 41], ['A30', 22], ['A35', 27], ['A44', 37], ['A45', 38], ['A31', 23], ['A25', 17], ['A37', 29], ['A32', 24], ['A34', 26], ['A46', 39], ['A11', 2], ['A39', 31], ['A41', 34], ['A17', 8], ['A38', 30], ['A5', 40], ['A22', 14], ['A3', 21], ['A12', 3], ['A24', 16], ['A36', 28], ['A42', 35], ['A14', 5], ['A43', 36], ['A33', 25], ['A15', 6], ['A40', 33], ['A21', 13], ['A16', 7], ['A13', 4], ['A1', 0], ['A23', 15], ['A26', 18]], 'SourcingChannel': [['WHOLESALE', 3], ['DIRECT', 0], ['RETAIL', 2], ['ECOM', 1]], 'ProductSize': [['Large', 1], ['Small', 2], ['ExtraLarge', 0]], 'Product Type': [['Powder', 1], ['Liquid', 0]], 'MonthofSourcing': [['May-21', 7], ['Oct-20', 9], ['Dec-20', 2], ['Feb-21', 3], ['Nov-20', 8], ['Sep-2

In [173]:
import json
import pandas as pd

# Coerce every code to a built-in int before serialising; the json module
# cannot encode NumPy integer scalars.
label_mappings = {
    column: [[pair[0], int(pair[1])] for pair in pairs]
    for column, pairs in label_mappings.items()
}

# Output file is created in the current working directory.
filename = 'label_mapping.json'

with open(filename, 'w') as f:
    json.dump(label_mappings, f, indent=4)

print("Label Mapping Saved to:", filename)

Label Mapping Saved to: label_mapping.json


## Encoded Dataset

In [174]:
# Model-ready frame: the integer-coded features followed by the target cost.
# NOTE(review): 'Product Type' (Powder/Liquid) has no *_enc column and is
# therefore absent from this frame — confirm the omission is intentional.
df_enc = df[[
    'ProdType_enc',
    'Manufacturer_enc',
    'AreaCode_enc',
    'SrcChnl_enc',
    'ProdSize_enc',
    'SrcMonth_enc',
    'SourcingCost',
]]

In [175]:
# Preview the encoded frame before writing it to disk.
df_enc.head()

Unnamed: 0,ProdType_enc,Manufacturer_enc,AreaCode_enc,SrcChnl_enc,ProdSize_enc,SrcMonth_enc,SourcingCost
0,2,0,19,3,1,7,10.16
1,1,0,44,0,1,9,134.28
2,2,1,12,0,1,2,12.46
3,2,0,9,3,2,3,107.22
4,1,0,19,0,1,8,197.76


In [176]:
# Write the encoded frame to the current working directory.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed first column that reloads as 'Unnamed: 0'. Pass index=False here only
# after checking that no consumer of df_enc.csv relies on that column.
df_enc.to_csv('df_enc.csv')