In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [14]:
df=pd.read_csv("../../data/external/advertisement.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1000 non-null   int64  
 1   gender            1000 non-null   object 
 2   income            1000 non-null   float64
 3   education         1000 non-null   object 
 4   married           1000 non-null   bool   
 5   children          1000 non-null   int64  
 6   city              1000 non-null   object 
 7   occupation        1000 non-null   object 
 8   purchase_amount   1000 non-null   float64
 9   most bought item  1000 non-null   object 
 10  labels            1000 non-null   object 
dtypes: bool(1), float64(2), int64(2), object(6)
memory usage: 79.2+ KB


In [15]:
df.describe()

Unnamed: 0,age,income,children,purchase_amount
count,1000.0,1000.0,1000.0,1000.0
mean,40.836,49349.796167,1.508,101.09817
std,13.786848,9894.479148,1.129253,20.348736
min,18.0,21908.867759,0.0,23.482179
25%,28.0,42577.352034,0.0,87.699577
50%,41.0,48993.757137,1.0,101.58756
75%,53.0,56566.795992,3.0,114.718926
max,64.0,79459.294416,3.0,168.978628


In [16]:
labels_raw=np.array(df['labels'])
labels = []
for i in range(len(labels_raw)):
    words=labels_raw[i].split()
    labels.append(words)

labels


[['electronics', 'clothing', 'sports'],
 ['furniture', 'beauty'],
 ['clothing', 'electronics', 'food', 'sports'],
 ['food'],
 ['home'],
 ['sports', 'electronics', 'books'],
 ['beauty', 'furniture', 'clothing'],
 ['books', 'beauty'],
 ['electronics', 'food', 'home', 'sports'],
 ['furniture', 'food'],
 ['clothing', 'home'],
 ['food', 'books'],
 ['home'],
 ['sports', 'furniture', 'food', 'home'],
 ['beauty', 'clothing', 'food'],
 ['books', 'clothing', 'sports'],
 ['electronics', 'furniture', 'food', 'books'],
 ['furniture', 'electronics', 'home', 'sports'],
 ['clothing', 'food', 'beauty'],
 ['food', 'furniture'],
 ['home', 'food'],
 ['sports', 'clothing', 'home'],
 ['beauty', 'sports'],
 ['books', 'electronics', 'food'],
 ['electronics', 'books'],
 ['furniture', 'food', 'sports'],
 ['clothing', 'furniture', 'food', 'books'],
 ['food', 'home'],
 ['home', 'food', 'beauty'],
 ['sports', 'electronics', 'furniture', 'books'],
 ['beauty', 'clothing', 'home'],
 ['books', 'clothing', 'sports', 'b

In [17]:
features=df.drop(columns='labels')
cols=features.select_dtypes(include=['int','float']).columns
df[cols] = features[cols].apply(lambda x: (x - x.mean()) / x.std())
df

Unnamed: 0,age,gender,income,education,married,children,city,occupation,purchase_amount,most bought item,labels
0,0.302027,Male,1.204930,Master,False,1.321228,Lake Sheila,Doctor,-0.658569,monitor,electronics clothing sports
1,-1.221164,Female,0.392068,High School,False,-0.449855,Crystalburgh,Businessman,0.689842,lipstick,furniture beauty
2,0.302027,Female,-1.948940,Bachelor,True,1.321228,Margaretburgh,Engineer,0.029308,biscuits,clothing electronics food sports
3,-1.583828,Male,-0.040381,PhD,False,-1.335397,Williamshaven,Lawyer,-0.153979,maggi,food
4,-0.858499,Female,-0.460577,Master,False,-1.335397,New Paul,Businessman,-0.700333,carpet,home
...,...,...,...,...,...,...,...,...,...,...,...
995,1.462553,Male,-1.526409,Master,True,1.321228,Solisfurt,HR,0.111028,bed,food furniture
996,1.099889,Female,-0.389669,Bachelor,False,-1.335397,Dawsonmouth,Engineer,-0.783299,biscuits,home clothing food
997,1.680152,Female,1.959744,PhD,True,-1.335397,Lake Garyport,Salesman,0.016536,bat,sports clothing
998,-1.656361,Female,-0.505469,Bachelor,True,-1.335397,Ericfurt,Retired,-0.169454,perfume,beauty


In [25]:
flattened_list = [item for sublist in labels for item in sublist]
unique_list = list(set(flattened_list))
unique_list

['electronics',
 'beauty',
 'home',
 'food',
 'sports',
 'clothing',
 'books',
 'furniture']

In [19]:
cities, count=np.unique(df["most bought item"],return_counts=True)
print(cities)
print(count)

['ball' 'bat' 'bed' 'biscuits' 'carpet' 'chair' 'chips' 'cream' 'curtains'
 'dictionary' 'encyclopedia' 'gloves' 'laptop' 'lipstick' 'maggi' 'mobile'
 'monitor' 'novel' 'pants' 'perfume' 'shirt' 'shoes' 'sofa' 'table']
[50 49 35 32 51 35 40 37 34 46 36 47 36 51 42 42 35 49 48 44 47 47 25 42]


In [20]:
columns_to_encode = ['gender', 'education', 'married', 'city', 'occupation', 'most bought item']

# Initialize LabelEncoder
le = LabelEncoder()

# Apply LabelEncoder to each column
for col in columns_to_encode:
    df[col] = le.fit_transform(df[col])

In [21]:
unique_list

['electronics',
 'beauty',
 'home',
 'food',
 'sports',
 'clothing',
 'books',
 'furniture']

In [22]:
label_dict = {word: idx for idx, word in enumerate(unique_list)}

y_encoded=[]
for l in labels:
    y_encoded.append([label_dict[word] for word in l])

y_encoded

[[0, 5, 4],
 [7, 1],
 [5, 0, 3, 4],
 [3],
 [2],
 [4, 0, 6],
 [1, 7, 5],
 [6, 1],
 [0, 3, 2, 4],
 [7, 3],
 [5, 2],
 [3, 6],
 [2],
 [4, 7, 3, 2],
 [1, 5, 3],
 [6, 5, 4],
 [0, 7, 3, 6],
 [7, 0, 2, 4],
 [5, 3, 1],
 [3, 7],
 [2, 3],
 [4, 5, 2],
 [1, 4],
 [6, 0, 3],
 [0, 6],
 [7, 3, 4],
 [5, 7, 3, 6],
 [3, 2],
 [2, 3, 1],
 [4, 0, 7, 6],
 [1, 5, 2],
 [6, 5, 4, 1],
 [0, 7, 3],
 [7, 1],
 [5, 4, 6],
 [3, 0, 5, 6],
 [2, 4, 6],
 [4, 0, 7, 3],
 [1, 3, 2, 6],
 [6, 5, 4],
 [0, 5, 2, 4],
 [7, 5],
 [5, 1],
 [3, 7],
 [2, 0, 5, 6],
 [4, 7, 5],
 [1, 6],
 [6],
 [0, 1, 6],
 [7, 0, 6],
 [5, 3, 4, 1],
 [3, 2, 4],
 [2, 1],
 [4],
 [1, 7, 5, 2, 4],
 [6, 0],
 [0, 5, 3, 6],
 [7, 0, 5, 4],
 [5, 2],
 [3, 6],
 [2, 0, 1],
 [4, 7, 5, 3, 2, 1, 6],
 [1, 4],
 [6, 2],
 [0, 7],
 [7, 3],
 [5],
 [3, 7, 1],
 [2, 0, 7, 1, 6],
 [4, 0],
 [1, 6],
 [6, 0, 3, 2, 4],
 [0],
 [7, 3, 2],
 [5, 0, 3, 4, 1],
 [3, 7, 6],
 [2, 0, 7, 6],
 [4, 6],
 [1, 2],
 [6, 0, 7, 1],
 [0, 1],
 [7, 0],
 [5, 1],
 [3, 6],
 [2],
 [4, 2, 6],
 [1, 0, 2],
 [6],
 

In [23]:
num_labels=len(unique_list)
y_onehot=[]
for l in y_encoded:
    zeros=[0]*num_labels
    for i in l:
        zeros[i]=1
    y_onehot.append(zeros)

y_onehot

[[1, 0, 0, 0, 1, 1, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 1, 1, 1, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 1, 0, 1, 0],
 [0, 1, 0, 0, 0, 1, 0, 1],
 [0, 1, 0, 0, 0, 0, 1, 0],
 [1, 0, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 1],
 [0, 0, 1, 0, 0, 1, 0, 0],
 [0, 0, 0, 1, 0, 0, 1, 0],
 [0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 1, 1, 1, 0, 0, 1],
 [0, 1, 0, 1, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 1, 1, 0],
 [1, 0, 0, 1, 0, 0, 1, 1],
 [1, 0, 1, 0, 1, 0, 0, 1],
 [0, 1, 0, 1, 0, 1, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 1],
 [0, 0, 1, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 1, 1, 0, 0],
 [0, 1, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 1, 0, 0, 1, 0],
 [1, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 1, 1, 0, 0, 1],
 [0, 0, 0, 1, 0, 1, 1, 1],
 [0, 0, 1, 1, 0, 0, 0, 0],
 [0, 1, 1, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 1, 0, 1, 1],
 [0, 1, 1, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 1, 1, 1, 0],
 [1, 0, 0, 1, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1, 1, 1, 0],
 [1, 0, 0, 1, 0, 1, 1, 0],
 [0, 0, 1, 0, 1, 0, 1, 0],
 

In [24]:
df

Unnamed: 0,age,gender,income,education,married,children,city,occupation,purchase_amount,most bought item,labels
0,0.302027,1,1.204930,2,0,1.321228,366,2,-0.658569,16,electronics clothing sports
1,-1.221164,0,0.392068,1,0,-0.449855,98,1,0.689842,13,furniture beauty
2,0.302027,0,-1.948940,0,1,1.321228,403,3,0.029308,3,clothing electronics food sports
3,-1.583828,1,-0.040381,3,0,-1.335397,959,6,-0.153979,14,food
4,-0.858499,0,-0.460577,2,0,-1.335397,521,1,-0.700333,4,home
...,...,...,...,...,...,...,...,...,...,...,...
995,1.462553,1,-1.526409,2,1,1.321228,763,4,0.111028,2,food furniture
996,1.099889,0,-0.389669,0,0,-1.335397,111,3,-0.783299,3,home clothing food
997,1.680152,0,1.959744,3,1,-1.335397,341,8,0.016536,1,sports clothing
998,-1.656361,0,-0.505469,0,1,-1.335397,195,7,-0.169454,19,beauty
