## Data Transformation Project

#### Objective

To transform categorical data to facilitate visualization and cluster analysis using k-modes clustering algorithm.

#### Method

Preprocess categorical variables using scikit-learn's LabelEncoder and merge transformed variables into existing dataset.

#### Datasource

Proprietary survey, n = 1,200
    
Variables for transformation: var9, var11, var12, var13, var16, var217, var230, var231, var232, var233, var234, var235, var236, var246

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [101]:
# Create dataframe
df = pd.read_csv("OpioidsMerged.csv")
df.head()

Unnamed: 0,Vrid,Vdatesub,Vstatus,Vcid,Vcomment,Vlanguage,Vreferer,Vsessionid,Vuseragent,Vip,...,var251rec,var252rec,var253rec,var254rec,var255rec,GEOID,HD01_VD01,ALAND_SQMI,HD,CT
0,17,3/20/2018,Complete,,,English,,1521528719_5ab0af8fe318c0.83350522,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,47.40.144.98,...,2,1,1,1,1,49048,10782.0,37.908,284.425451,1
1,18,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/9ed49688-a2...,1521528831_5ab0afff9dad82.74757954,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,24.99.168.150,...,1,1,2,1,1,30022,24854.0,25.444,976.811822,1
2,22,3/20/2018,Complete,,,English,,1521528941_5ab0b06d95d276.07495051,Mozilla/5.0 (Linux; Android 7.0; Moto G (4) Bu...,47.151.21.204,...,1,1,2,2,2,92683,28291.0,9.993,2831.081757,0
3,23,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/ed2140dc-95...,1521528964_5ab0b084821f35.43447059,Mozilla/5.0 (X11; CrOS x86_64 8872.73.0) Apple...,98.200.10.6,...,1,1,1,2,2,77036,30892.0,7.155,4317.540182,0
4,24,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/0529624f-1a...,1521528989_5ab0b09d6dc659.59506533,Mozilla/5.0 (iPhone; CPU iPhone OS 8_4 like Ma...,174.210.7.12,...,1,1,2,2,2,90026,26958.0,4.224,6382.102273,0


In [76]:
# View dataset shape
df.shape

(1074, 240)

In [100]:
# Select variables for transformation
ds = df[['Vrid','var9', 'var11', 'var12', 'var13', 'var16', 'var217', 'var230', 'var231', 'var232', 'var233', 'var234', 
        'var235', 'var236', 'var246']]
ds.head()

Unnamed: 0,Vrid,var9,var11,var12,var13,var16,var217,var230,var231,var232,var233,var234,var235,var236,var246
0,17,10001,10023,10026,10031,10052,10657,10723,10726,10729,10736,10739,10743,10749,10789
1,18,10006,10024,10027,10031,10055,10657,10724,10726,10729,10733,10740,10744,10745,10787
2,22,10001,10023,10025,10034,10056,10656,10724,10727,10730,10733,10740,10744,10749,10788
3,23,10004,10024,10025,10036,10052,10656,10724,10726,10730,10734,10740,10744,10749,10787
4,24,10001,10024,10029,10034,10056,10656,10724,10727,10730,10733,10740,10744,10749,10787


In [107]:
# View value counts of variables for transformation
ds.iloc[:, 1:].apply(pd.value_counts)

Unnamed: 0,var9,var11,var12,var13,var16,var217,var230,var231,var232,var233,var234,var235,var236,var246
10001,175.0,,,,,,,,,,,,,
10002,81.0,,,,,,,,,,,,,
10003,91.0,,,,,,,,,,,,,
10004,175.0,,,,,,,,,,,,,
10005,62.0,,,,,,,,,,,,,
10006,94.0,,,,,,,,,,,,,
10007,88.0,,,,,,,,,,,,,
10008,101.0,,,,,,,,,,,,,
10009,104.0,,,,,,,,,,,,,
10010,103.0,,,,,,,,,,,,,


In [99]:
# Create new dataframe with transformed variables
dsr = 0
for i in ds:
    if i=="Vrid":
        dsr=ds
    if i != "Vrid":
        dsr= ds.apply(preprocessing.LabelEncoder().fit_transform)
dsr.head()

Unnamed: 0,Vrid,var9,var11,var12,var13,var16,var217,var230,var231,var232,var233,var234,var235,var236,var246
0,0,0,0,1,0,0,1,1,0,0,3,2,2,4,2
1,1,5,1,2,0,3,1,2,0,0,0,3,3,0,0
2,2,0,0,0,3,4,0,2,1,1,0,3,3,4,1
3,3,3,1,0,5,0,0,2,0,1,1,3,3,4,0
4,4,0,1,4,3,4,0,2,1,1,0,3,3,4,0


In [109]:
# View value counts of transformed variables
dsr.iloc[:, 1:].apply(pd.value_counts)

Unnamed: 0,var9,var11,var12,var13,var16,var217,var230,var231,var232,var233,var234,var235,var236,var246
0,175,526.0,54.0,754.0,571.0,274.0,175.0,541.0,364.0,592.0,116.0,103.0,144.0,603.0
1,81,548.0,280.0,100.0,149.0,588.0,389.0,447.0,558.0,330.0,291.0,293.0,145.0,346.0
2,91,,234.0,119.0,154.0,113.0,325.0,86.0,124.0,114.0,54.0,52.0,66.0,125.0
3,175,,128.0,58.0,65.0,12.0,185.0,,28.0,38.0,613.0,626.0,217.0,
4,62,,229.0,12.0,71.0,38.0,,,,,,,323.0,
5,94,,149.0,31.0,22.0,49.0,,,,,,,179.0,
6,88,,,,42.0,,,,,,,,,
7,101,,,,,,,,,,,,,
8,104,,,,,,,,,,,,,
9,103,,,,,,,,,,,,,


In [71]:
# Validate transformation by checking that each transformed variable contains the right number of values and the same variance
for i in ds:
    count = ds[i].value_counts()
    rcount = dsr[i].value_counts()
    if len(count) == len(rcount) and np.var(count) == np.var(rcount):
        print(i, "True")
    else:
        print(i, "False")

Vrid True
var9 True
var11 True
var12 True
var13 True
var16 True
var217 True
var230 True
var231 True
var232 True
var233 True
var234 True
var235 True
var236 True
var246 True


In [72]:
# Merge transformed variables with original dataset based on Vrid
df_merged = pd.merge(df, dsr, how='left', on='Vrid')
df_merged.head()

Unnamed: 0,Vrid,Vdatesub,Vstatus,Vcid,Vcomment,Vlanguage,Vreferer,Vsessionid,Vuseragent,Vip,...,var16_y,var217_y,var230_y,var231_y,var232_y,var233_y,var234_y,var235_y,var236_y,var246_y
0,17,3/20/2018,Complete,,,English,,1521528719_5ab0af8fe318c0.83350522,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,47.40.144.98,...,0.0,1.0,2.0,2.0,1.0,1.0,3.0,3.0,0.0,0.0
1,18,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/9ed49688-a2...,1521528831_5ab0afff9dad82.74757954,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,24.99.168.150,...,1.0,0.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0
2,22,3/20/2018,Complete,,,English,,1521528941_5ab0b06d95d276.07495051,Mozilla/5.0 (Linux; Android 7.0; Moto G (4) Bu...,47.151.21.204,...,0.0,1.0,2.0,1.0,1.0,1.0,3.0,3.0,4.0,0.0
3,23,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/ed2140dc-95...,1521528964_5ab0b084821f35.43447059,Mozilla/5.0 (X11; CrOS x86_64 8872.73.0) Apple...,98.200.10.6,...,0.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0
4,24,3/20/2018,Complete,,,English,https://s.cint.com/Consent/Collect/0529624f-1a...,1521528989_5ab0b09d6dc659.59506533,Mozilla/5.0 (iPhone; CPU iPhone OS 8_4 like Ma...,174.210.7.12,...,0.0,1.0,1.0,0.0,1.0,0.0,2.0,2.0,1.0,0.0


In [103]:
# List new variables
df_merged.iloc[:, -(len(dsr.columns)-1):].columns.tolist()

['var9_y',
 'var11_y',
 'var12_y',
 'var13_y',
 'var16_y',
 'var217_y',
 'var230_y',
 'var231_y',
 'var232_y',
 'var233_y',
 'var234_y',
 'var235_y',
 'var236_y',
 'var246_y']

In [91]:
# Validate merge by checking shape of merged dataset
if (len(df_merged) == len(df)) and (len(df_merged.columns) == len(df.columns)+len(dsr.columns)-1):
    print("True")
else:
    print("False", df.shape, df_merged.shape)

True


In [90]:
# Save df_merged as a new csv file
df_merged.to_csv("OpioidsRecodes.csv", index=False)