In [1]:
import pandas as pd
import numpy as np

In [6]:
# Load only the six categorical feature columns from the CSV and preview them.
data = pd.read_csv(
    "mercedes.csv",
    header=0,
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [9]:
# Report how many distinct levels/categories each column contains.
for column_name in data.columns:
    n_levels = len(data[column_name].unique())
    print(column_name, " : ", n_levels, "levels")

X1  :  27 levels
X2  :  44 levels
X3  :  7 levels
X4  :  4 levels
X5  :  29 levels
X6  :  12 levels


In [10]:
# One-hot encode every column (dropping the first level of each) and look at
# the resulting shape to see how many columns a full encoding would generate.
pd.get_dummies(data=data, drop_first=True).shape

(4209, 117)

In [11]:
# ^^ from the above we got 4209 rows and 117 columns
# Since 117 columns is a lot, we will instead keep only the 10 most frequently occurring levels/categories of each column.
# This will reduce the number of columns

In [17]:
# Take the 10 most frequently occurring levels/categories of the X2 column.
top_10 = list(data["X2"].value_counts().head(10).index)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [22]:
# Add one 0/1 indicator column per frequent X2 level, named after the level itself.
for level in top_10:
    data[level] = np.where(data["X2"].eq(level), 1, 0)

In [25]:
# Show X2 side by side with its indicator columns for the first ten rows.
data[["X2", *top_10]].head(n=10)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [None]:
## we got ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e'] as binary variables after one hot encoding

In [26]:
# Now we will do the same thing for X1,X2,X3,X4,X5,X6 variables and apply one hot encoding

In [39]:
# find top 10 categories from each column
def find_top_10(df,col_name,n=10):
    """Return the most frequent levels of ``df[col_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    col_name : label
        Name of the column whose levels are counted.
    n : int, default 10
        Maximum number of levels to return. If the column has fewer than
        ``n`` distinct levels, all of them are returned.

    Returns
    -------
    list
        The levels, taken from the index of ``value_counts()`` (which orders
        by descending frequency and, by default, does not count missing
        values).
    """
    return df[col_name].value_counts().head(n).index.tolist()

In [40]:
# creating dummy variables
def create_dummy(df,col_name,top_10):
    """Add one 0/1 indicator column to ``df`` for each level in ``top_10``.

    For every ``label`` in ``top_10`` a column named ``<col_name>_<label>``
    is written; it holds 1 where ``df[col_name] == label`` and 0 elsewhere.
    Rows whose level is not in ``top_10`` get 0 in every indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    col_name : label
        Column whose values are compared against each level.
    top_10 : iterable
        Levels to create indicator columns for.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new columns.
    """
    for label in top_10:
        # Build the name with format() rather than ``+`` so that non-string
        # levels or column labels (e.g. integer codes) do not raise TypeError.
        dummy_name = "{}_{}".format(col_name, label)
        df[dummy_name] = np.where(df[col_name]==label,1,0)
    return df

In [41]:
# Re-read the six categorical columns so we start again from a frame that
# holds only X1..X6, then preview the first rows.
data = pd.read_csv(
    "mercedes.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
    header=0,
)
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [42]:
# Most frequent levels of X1 (at most ten).
top_10 = find_top_10(df=data, col_name="X1")

In [43]:
# Create the indicator columns for X1 from the levels found above.
create_dummy(df=data, col_name="X1", top_10=top_10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1
4206,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0


In [44]:
# X2: find its most frequent levels, then add one indicator column per level.
top_10 = find_top_10(df=data, col_name="X2")
create_dummy(df=data, col_name="X2", top_10=top_10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [45]:
# X3: it has only 7 levels, so all of them receive an indicator column.
top_10 = find_top_10(df=data, col_name="X3")
create_dummy(df=data, col_name="X3", top_10=top_10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X2_s,X2_f,X2_e,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [46]:
# X4: it has only 4 levels, so all of them receive an indicator column.
top_10 = find_top_10(df=data, col_name="X4")
create_dummy(df=data, col_name="X4", top_10=top_10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b,X4_d,X4_a,X4_b,X4_c
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [47]:
# X5: keep the ten most frequent of its 29 levels and add their indicator columns.
top_10 = find_top_10(df=data, col_name="X5")
create_dummy(df=data, col_name="X5", top_10=top_10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X5_w,X5_v,X5_q,X5_r,X5_d,X5_s,X5_n,X5_p,X5_m,X5_i
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# X6: keep the ten most frequent of its 12 levels and add their indicator columns.
top_10 = find_top_10(df=data, col_name="X6")
create_dummy(df=data, col_name="X6", top_10=top_10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [49]:
# Preview the frame now that every column has its indicator columns.
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
# from 117 we reduced it to 57 columns (51 dummy columns plus the 6 original X1..X6 columns) by using the top 10 categories from each variable
# we can drop the X1....X6 columns

In [50]:
# Drop the original categorical columns now that their indicator columns exist.
# DataFrame.drop returns a new frame and leaves `data` unchanged, so the result
# is bound to its own name; without the assignment the encoded frame would be
# lost after being displayed. A separate name also keeps this cell re-runnable.
data_encoded = data.drop(columns=["X1","X2","X3","X4","X5","X6"])
data_encoded

Unnamed: 0,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4206,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4207,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
