## Preprocessing - Encode, Normalize features ##

In [3]:
from sklearn import preprocessing
from tqdm import tqdm 
import pandas as pd

def generate_dummies(df, columns, prefixes=""):
    """Append one-hot (dummy) columns for each of `columns` to `df`.

    Args:
        df: DataFrame holding the source columns.
        columns: Names of the columns to expand with `pd.get_dummies`.
        prefixes: One prefix per entry of `columns`, matched by position.
            Extra prefixes beyond `len(columns)` are ignored.

    Returns:
        `df` itself when `columns` is empty; otherwise a new DataFrame made
        of the original columns followed by the dummy columns, in the order
        of `columns`. The source columns are kept.

    Raises:
        ValueError: If fewer prefixes than columns are supplied.
    """
    # Validate up front: indexing a short prefix list would otherwise fail
    # halfway through with an IndexError, and `assert` is stripped under -O.
    if len(prefixes) < len(columns):
        raise ValueError("Prefixes must be mentioned for all dummies columns")

    # Every column is guaranteed a prefix here, so no un-prefixed fallback
    # is needed.
    dummy_frames = [
        pd.get_dummies(df[col], prefix=prefix)
        for col, prefix in zip(tqdm(columns), prefixes)
    ]
    if not dummy_frames:
        return df

    # Concatenate once instead of rebuilding the frame on every iteration.
    return pd.concat([df] + dummy_frames, axis=1)

def label_encode(values):
    """Encode categorical string values with a freshly fitted LabelEncoder.

    Any element that is not a string (e.g. NaN/None) is replaced by ''
    before fitting, so the encoder only ever sees strings.

    Args:
        values: Iterable of category values.

    Returns:
        The encoded labels produced by `LabelEncoder.fit_transform`, one per
        input element, in input order.
    """
    # isinstance (not `type(...) is str`) so str subclasses such as
    # numpy.str_ are kept instead of being blanked out.
    cleaned = [element if isinstance(element, str) else '' for element in values]
    return preprocessing.LabelEncoder().fit_transform(cleaned)

def label_decode(values, value_to_decode):
    """Map encoded labels back to their original category strings.

    A new LabelEncoder is fitted on `values` (with non-string elements
    replaced by '', mirroring `label_encode`), so the codes in
    `value_to_decode` are interpreted relative to exactly these values.

    Args:
        values: Iterable of the original category values.
        value_to_decode: Encoded label(s) to translate back.

    Returns:
        The result of `LabelEncoder.inverse_transform(value_to_decode)`.
    """
    le = preprocessing.LabelEncoder()
    # isinstance (not `type(...) is str`) so str subclasses such as
    # numpy.str_ are kept instead of being blanked out.
    cleaned = [element if isinstance(element, str) else '' for element in values]
    le.fit(cleaned)
    return le.inverse_transform(value_to_decode)

def normalize(values):
    """Standardize `values` to zero mean and unit variance.

    Args:
        values: 1-D data (e.g. a single DataFrame column) or 2-D data of
            shape (n_samples, n_features).

    Returns:
        The scaled data from `StandardScaler.fit_transform`. 1-D input is
        returned as a 1-D array so it can be assigned straight back to a
        DataFrame column; 2-D input is returned as a 2-D array.
    """
    # Local import: numpy is not among the imports visible in this file.
    import numpy as np

    scaler = preprocessing.StandardScaler()
    data = np.asarray(values)
    if data.ndim == 1:
        # StandardScaler requires 2-D input: treat the 1-D data as a single
        # feature column, then flatten the result back to 1-D.
        return scaler.fit_transform(data.reshape(-1, 1)).ravel()
    # Scale the argument itself (the original read undefined `df[col]`).
    return scaler.fit_transform(values)
    
    
def preprocess(df, columns, dummies_cols = "", dummies_cols_prefix = ""):
    """Label-encode string columns, normalize numeric columns, add dummies.

    The listed columns of `df` are overwritten in place, so the caller's
    DataFrame is modified.

    Args:
        df: DataFrame to process.
        columns: Names of the columns to encode / normalize.
        dummies_cols: Names of columns to expand into dummy columns via
            `generate_dummies`; skipped when empty.
        dummies_cols_prefix: Prefixes passed to `generate_dummies`, one per
            entry of `dummies_cols`.

    Returns:
        The processed DataFrame (the result of `generate_dummies` when
        `dummies_cols` is non-empty, otherwise `df`).
    """
    for col in tqdm(columns):

        # String columns: blank out missing values, then label-encode.
        if df[col].dtype == object:
            df[col] = df[col].fillna('')
            df[col] = label_encode(df[col])

        # Numeric columns: standardize. The pandas dtype helpers are used
        # because comparing against the builtin `int`/`float` only matches
        # the platform default widths and misses e.g. int32/float32 columns.
        # NOTE(review): this is a separate `if`, so a column that was just
        # label-encoded above is checked again with its new dtype and is
        # normalized too if that dtype is integer - confirm this is intended.
        col_dtype = df[col].dtype
        if (pd.api.types.is_integer_dtype(col_dtype)
                or pd.api.types.is_float_dtype(col_dtype)):
            df[col] = normalize(df[col])

    if len(dummies_cols) > 0:
        df = generate_dummies(df, dummies_cols, dummies_cols_prefix)

    return df

## Train-Test Split ##

In [2]:
from sklearn import model_selection
def traintestsplit(df, features_cols, target_col, test_perc = 0.2):
    """Split `df` into train and test feature/target subsets.

    Args:
        df: Source DataFrame.
        features_cols: Column name(s) selecting the features.
        target_col: Column name(s) selecting the target.
        test_perc: Value forwarded as `test_size` to `train_test_split`.

    Returns:
        Whatever `model_selection.train_test_split` returns for
        `(df[features_cols], df[target_col])`, using a fixed
        `random_state=42` so the split is reproducible.
    """
    # Honour the caller's `test_perc`; it was previously ignored in favour
    # of a hard-coded test_size of 0.33.
    return model_selection.train_test_split(
        df[features_cols],
        df[target_col],
        test_size=test_perc,
        random_state=42,
    )

## Dummy Features ##

In [None]:
dummies_cols = ['COLNAME1','COLNAME2']
dummies_cols_prefix = dummies_cols

# Names of the dummy columns generated for each source column, in the
# '<prefix>_<value>' form used by pd.get_dummies.
dummies_cols_prefixed = []

for col, prefix in zip(dummies_cols, dummies_cols_prefix):
    # Missing values are dropped because pd.get_dummies emits no column for
    # NaN by default; str() keeps non-string values from raising a TypeError
    # during concatenation.
    for value in candidates_procsd_df[col].dropna().unique():
        dummies_cols_prefixed.append(prefix + '_' + str(value))

**Use `dummies_cols_prefixed` when passing all the dummy columns to the model.**

*For example:*

In [None]:
# Run the full preprocessing pass: the feature columns are encoded or
# normalized, and the dummy columns are appended with their prefixes.
procsd_df = preprocess(
    df,
    columns=features_cols,
    dummies_cols=dummies_cols,
    dummies_cols_prefix=dummies_cols_prefix,
)

In [None]:
def split_values_and_extract_column_names(df, source_col, delimiter = ";", prefix = None):
    """Build column names from the distinct delimited tokens of a column.

    Each unique value of `df[source_col]` is stripped of surrounding
    whitespace and split on `delimiter`; every distinct token becomes one
    column name of the form `prefix + TOKEN` (token upper-cased).

    Args:
        df: DataFrame holding the multi-valued column.
        source_col: Name of the column whose values are split.
        delimiter: Separator between tokens inside a value.
        prefix: String prepended to every token; defaults to `source_col`.

    Returns:
        List of column names, sorted by token so the order is deterministic.
    """
    # A parameter cannot be used as another parameter's default (defaults
    # are evaluated at definition time), so resolve it here.
    if prefix is None:
        prefix = source_col

    tokens = set()
    for value in df[source_col].unique():
        # Skip missing / non-string entries (e.g. NaN), which cannot be split.
        if not isinstance(value, str):
            continue
        value = value.strip()
        if value:
            tokens.update(value.split(delimiter))

    return [prefix + token.upper() for token in sorted(tokens)]
    
def add_empty_cols_to_df(df, column_names):
    """Return `df` with the given extra columns appended, all left empty.

    The new columns come from a row-less DataFrame concatenated column-wise
    after the existing columns of `df`.
    """
    placeholder = pd.DataFrame(columns=column_names)
    frames = [df, placeholder]
    return pd.concat(frames, axis=1)