In [1]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


# Seed passed as `random_state` to train_test_split so the train/test split is reproducible.
RANDOM_STATE = 42

Using TensorFlow backend.


In [2]:
#!pip install imblearn

In [3]:
# Load the raw promo-trial results and discard the file's own "index" column,
# which is not kept as a feature.
df = pd.read_csv("original_data/trialPromoResults.csv").drop("index", axis=1)

In [4]:

# Report the (rows, columns) of the freshly loaded frame.
print('Original dataset shape {0}, {1}'.format(*df.shape))

Original dataset shape 1000, 10


In [5]:
# Keep the raw column labels (they still carry the leading spaces from the CSV
# header) and show them as the cell output.
# The former leading `df.head()` was removed: it was not the last expression of
# the cell, so its result was computed and silently discarded.
headers = df.columns.values
headers

array([' sex', ' mstatus', ' age', ' children', ' occupation',
       ' education', ' income', ' avbal', ' avtrans', ' decision'],
      dtype=object)

In [6]:
# The CSV header labels are padded with spaces; trim every label so columns
# can be addressed by their plain names.
df = df.rename(columns=str.strip)
df.columns.values

array(['sex', 'mstatus', 'age', 'children', 'occupation', 'education',
       'income', 'avbal', 'avtrans', 'decision'], dtype=object)

### Encode the categorical features (label encoding first, then one-hot encoding)

In [7]:
# Columns that are treated as categorical by the encoding cells below.
categorical_features = ["sex", "mstatus", "occupation", "education", "children"]
# "children" is handled as a category rather than a number, so store it as text
# like the other categorical columns.
# (A stray `df[categorical_features].head()` used to sit here; being mid-cell it
# was never displayed, so it has been dropped.)
df["children"] = df["children"].astype(str)

In [8]:
# One independent LabelEncoder per categorical feature, keyed by feature name.
label_encoders = {name: preprocessing.LabelEncoder() for name in categorical_features}
# Feature name -> array of the original labels learned by that feature's encoder.
label_mappings = {}
for name in categorical_features:
    encoder = label_encoders[name]
    # The integer codes go into a sibling "<feature>Num" column; the original
    # text column is left in place for now.
    df[name + "Num"] = encoder.fit_transform(df[name])
    label_mappings[name] = encoder.classes_
df.head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,childrenNum
0,F,married,56.82,1,legal,secondary,3105.39,33003.48,1776.81,,0,1,5,2,1
1,M,widowed,87.35,3,retired,tertiary,4874.08,18941.99,863.56,,1,3,8,3,3
2,M,single,28.75,0,manuf,professional,14232.37,30013.32,3231.14,B,1,2,6,1,0
3,F,married,35.71,0,education,postgrad,3214.93,15423.24,1996.09,,0,1,2,0,0
4,M,single,20.53,0,construct,tertiary,3214.93,15423.24,1996.09,,1,2,1,3,0


In [9]:
# Display, per categorical feature, the `classes_` array learned by its LabelEncoder.
label_mappings

{'sex': array(['F', 'M'], dtype=object),
 'mstatus': array(['divorced', 'married', 'single', 'widowed'], dtype=object),
 'occupation': array(['IT', 'construct', 'education', 'finance', 'government', 'legal',
        'manuf', 'medicine', 'retired'], dtype=object),
 'education': array(['postgrad', 'professional', 'secondary', 'tertiary'], dtype=object),
 'children': array(['0', '1', '2', '3', '4'], dtype=object)}

In [10]:
# Display the fitted LabelEncoder objects, keyed by feature name.
label_encoders

{'sex': LabelEncoder(),
 'mstatus': LabelEncoder(),
 'occupation': LabelEncoder(),
 'education': LabelEncoder(),
 'children': LabelEncoder()}

### Add one indicator (one-hot) column for each distinct value of every categorical column

In [11]:
# Build one 0/1 indicator column per (feature, class) pair, named
# "<feature>_<class>", using the class labels recorded in `label_mappings`.
for feature in categorical_features:
    for label in label_mappings[feature]:
        # Comparing the column with the scalar label broadcasts element-wise, so
        # there is no need to materialise a full-length array of repeated labels;
        # the boolean mask is cast to int in the same assignment.
        df[feature + "_" + label] = (df[feature] == label).astype(int)

df.head()

Unnamed: 0,sex,mstatus,age,children,occupation,education,income,avbal,avtrans,decision,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
0,F,married,56.82,1,legal,secondary,3105.39,33003.48,1776.81,,...,0,0,0,1,0,0,1,0,0,0
1,M,widowed,87.35,3,retired,tertiary,4874.08,18941.99,863.56,,...,1,0,0,0,1,0,0,0,1,0
2,M,single,28.75,0,manuf,professional,14232.37,30013.32,3231.14,B,...,0,0,1,0,0,1,0,0,0,0
3,F,married,35.71,0,education,postgrad,3214.93,15423.24,1996.09,,...,0,1,0,0,0,1,0,0,0,0
4,M,single,20.53,0,construct,tertiary,3214.93,15423.24,1996.09,,...,0,0,0,0,1,1,0,0,0,0


### Split the data into train and test sets

In [12]:
# The original text columns are now represented by their encoded counterparts,
# so remove them before splitting.
df = df.drop(categorical_features, axis=1)

# Hold out 15% of the rows for testing; stratifying on "decision" keeps the
# class proportions the same in both splits.
train_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df["decision"],
    random_state=RANDOM_STATE,
)
train_df.head()

Unnamed: 0,age,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
412,40.0,10653.49,30033.14,4103.15,,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
887,65.52,4009.23,20013.48,1811.16,,0,1,8,0,2,...,1,1,0,0,0,0,0,1,0,0
478,42.64,6406.75,39372.53,1783.25,,1,0,4,3,2,...,0,0,0,0,1,0,0,1,0,0
941,44.92,3135.45,12064.45,3092.13,,0,1,2,0,1,...,0,1,0,0,0,0,1,0,0,0
38,28.0,9956.44,30937.8,3866.72,,1,2,5,3,0,...,0,0,0,0,1,1,0,0,0,0


In [13]:
# Preview the held-out test split produced by train_test_split.
test_df.head()

Unnamed: 0,age,income,avbal,avtrans,decision,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
491,57.09,12961.14,29922.37,1981.44,,1,0,3,0,1,...,0,1,0,0,0,0,1,0,0,0
471,58.66,5006.93,10404.44,1448.95,,1,3,4,3,0,...,0,0,0,0,1,1,0,0,0,0
688,40.17,1102.49,15841.46,1209.34,A,0,1,3,2,0,...,0,0,0,1,0,1,0,0,0,0
200,35.82,11144.99,33709.7,7672.02,A,0,1,3,1,1,...,0,0,1,0,0,0,1,0,0,0
603,36.61,1534.41,12415.32,804.11,,0,0,4,2,1,...,0,0,0,1,0,0,1,0,0,0


### Save the data into CSVs for modelling

In [14]:
# Persist both splits; the DataFrame index is not written to the files.
splits = (
    ("working_data/trial_promo_training.csv", train_df),
    ("working_data/trial_promo_testing.csv", test_df),
)
for csv_path, frame in splits:
    frame.to_csv(csv_path, index=False)

### Part 2 - Insert synthetic data using SMOTE

In [15]:
# Separate the target ("decision") from the predictor columns of the training split.
y_train = train_df["decision"]
X_train = train_df.drop("decision", axis=1)
# Keep the predictor column names: they are reused later to rebuild the CSV
# header for the resampled data.
X_train_columns_after_one_hot = X_train.columns.values
X_train.head()

Unnamed: 0,age,income,avbal,avtrans,sexNum,mstatusNum,occupationNum,educationNum,childrenNum,sex_F,...,occupation_retired,education_postgrad,education_professional,education_secondary,education_tertiary,children_0,children_1,children_2,children_3,children_4
412,40.0,10653.49,30033.14,4103.15,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
887,65.52,4009.23,20013.48,1811.16,0,1,8,0,2,1,...,1,1,0,0,0,0,0,1,0,0
478,42.64,6406.75,39372.53,1783.25,1,0,4,3,2,0,...,0,0,0,0,1,0,0,1,0,0
941,44.92,3135.45,12064.45,3092.13,0,1,2,0,1,1,...,0,1,0,0,0,0,1,0,0,0
38,28.0,9956.44,30937.8,3866.72,1,2,5,3,0,0,...,0,0,0,0,1,1,0,0,0,0


### Run the SMOTE algorithm to balance the output classes as mentioned [here](http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.SMOTE.html)

In [16]:
# Oversample the training split with SMOTE, seeded with the notebook-wide
# RANDOM_STATE instead of a second hard-coded literal.
sm = SMOTE(random_state=RANDOM_STATE)
# `fit_sample` was deprecated and later removed from imbalanced-learn in favour
# of `fit_resample`; only fall back to the old name on releases that predate it.
resample = getattr(sm, "fit_resample", None) or sm.fit_sample
X_res, y_res = resample(X_train, y_train)
# Newer releases return pandas objects when given pandas inputs. The following
# cells index the result positionally, so normalise both to NumPy arrays.
X_res, y_res = np.asarray(X_res), np.asarray(y_res)

In [17]:
# Tally the rows per target class after resampling and report it.
class_counts = Counter(y_res)
print('Resampled dataset shape {}'.format(class_counts))

Resampled dataset shape Counter({'None': 704, 'A': 704, 'B': 704})


### Note: SMOTE in Python cannot yet handle categorical data - see [this discussion](https://stackoverflow.com/questions/47655813/oversampling-smote-for-binary-and-categorical-data-in-python)

In [18]:
# Inspect the resampled feature matrix returned by SMOTE.
X_res

array([[4.00000000e+01, 1.06534900e+04, 3.00331400e+04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.55200000e+01, 4.00923000e+03, 2.00134800e+04, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.26400000e+01, 6.40675000e+03, 3.93725300e+04, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.42149006e+01, 5.72327125e+03, 2.38781750e+04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.42384568e+01, 9.28981946e+03, 2.65852588e+04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.68159064e+01, 5.27767825e+03, 2.95854168e+04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [19]:
# Positions (0-based, within the feature matrix) of columns whose values should
# be rounded before being written. Left empty, every value is written as-is.
columns_to_round = []
round_positions = set(columns_to_round)

# Work on plain NumPy arrays so positional iteration behaves the same whether
# the resampler returned arrays or pandas objects.
feature_rows = np.asarray(X_res)
target_values = np.asarray(y_res)

# Header: the predictor column names followed by the target column.
csv_lines = [",".join(np.append(X_train_columns_after_one_hot, "decision")) + "\n"]
for row, target in zip(feature_rows, target_values):
    cells = [
        str(round(value)) if position in round_positions else str(value)
        for position, value in enumerate(row)
    ]
    cells.append(str(target))
    csv_lines.append(",".join(cells) + "\n")

# Join once instead of growing one string with `+=` per cell (quadratic).
sampled_data = "".join(csv_lines)

with open("working_data/trial_promo_training_smote.csv", "w") as fw:
    # `write`, not `writelines`: the payload is a single string, and writelines
    # would iterate it one character at a time.
    fw.write(sampled_data)
    

In [20]:
# Save the test split under the SMOTE file naming as well; this is the same
# `test_df` written earlier, with no resampling applied to it.
test_df.to_csv("working_data/trial_promo_testing_smote.csv", index = False)