# Drug Classification ETL

This notebook contains the Extract, Transform (including cleaning), and Load (ETL) workflow for the Drug Classification project.

In [None]:
# imports
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

# Put a dummy value in the WANDB_API_KEY environment variable (presumably the
# Weights & Biases key - confirm); the original note says this silences a warning.
os.environ["WANDB_API_KEY"] = "0" ## to silence warning

## Extract

Reads in data from csv, validates import.

In [None]:
# Load the drug200 CSV from the Kaggle input directory into a DataFrame.
fullDf = pd.read_csv('/kaggle/input/drug-classification/drug200.csv')

## Transform / Cleaning

Corrects data types, encodes categorical fields, splits the data into training and test sets, and resamples the training set.

In [None]:
# Helper methods...

def encode_cat(series):
    """Label-encode a categorical series.

    Args:
        series (Pandas.Series): The categorical values to encode.

    Returns:
        A tuple of (encoded values, mapping dictionary). The encoded values
        are the result of ``LabelEncoder.fit_transform`` on the series; the
        mapping dictionary is built from the fitted encoder by
        ``get_integer_mapping``.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(series)
    mapping = get_integer_mapping(encoder)

    return codes, mapping


def get_integer_mapping(le):
    """Build a label -> integer lookup from a fitted SKlearn LabelEncoder.

    Args:
        le (LabelEncoder): A fitted label encoder.

    Returns:
        A dict with one entry per element of ``le.classes_``; each value is
        the first element of ``le.transform([label])`` for that label.
    """
    return {label: le.transform([label])[0] for label in le.classes_}


def get_key(mapping_dict, val):
    """Reverse lookup: find the key that maps to a given value.

    Args:
        mapping_dict ({<category name>:<numeric value>}): The dictionary
            pairing each category name with its numeric representation.

        val (int): The value to search for.

    Returns:
        The first key (in dictionary order) whose value equals ``val``, or
        None when no entry matches.
    """
    matches = (name for name, number in mapping_dict.items() if val == number)
    return next(matches, None)
        

def get_class_weights(series, class_map=None):
    """Calculates a 'balanced' weight for each class present in a series.

    Args:
        series (Pandas.Series): The series containing the class values to
            compute weights for.

        class_map (mapping dictionary)(optional): A dict of
            {<category name>: <numeric value>}. When given, each class value
            found in the series is translated back to its category name with
            get_key before being used as a key of the result.

    Returns:
        {<category>: <weight>}: one entry per distinct class in the series.
        Keys are the class values themselves (or their category names when
        class_map is given), not their position in the list of classes.
    """
    # Distinct classes as a plain array, sorted so the result order is stable.
    classes = series.drop_duplicates().sort_values().to_numpy()

    # `classes` and `y` are keyword-only in current scikit-learn; the returned
    # weights are in the same order as `classes`.
    weights = compute_class_weight('balanced', classes=classes, y=series)

    return_dict = {}

    # Pair each weight with the class it belongs to rather than with its
    # position, so a missing or out-of-order class cannot shift the mapping.
    for cls, weight in zip(classes.tolist(), weights):
        key = cls if class_map is None else get_key(class_map, cls)
        return_dict[key] = weight

    return return_dict


def undersample_set(x_train_features, x_train_labels):
    """Undersamples dataset using NearMiss

    Args:
        x_train_features (numpy.array): Array containing the variables.

        x_train_labels (numpy.array): Array containing the labels for the set.

    Returns:
        Tuple containing the resampled variables and labels.
    """

    near = NearMiss(sampling_strategy="not minority")

    # Resample the data passed in, not the notebook-level x_train / y_train
    # globals the previous version read by mistake.
    return near.fit_resample(x_train_features, x_train_labels)


def oversample_set(x_train_features, x_train_labels):
    """Oversample a dataset with SMOTE.

    Args:
        x_train_features (numpy.array): Array containing the variables.

        x_train_labels (numpy.array): Array containing the labels for the set.

    Returns:
        Whatever ``SMOTE.fit_resample`` returns for the given inputs: the
        resampled variables and labels.
    """
    # The sampler is configured with the 'minority' sampling strategy.
    sampler = SMOTE(sampling_strategy='minority')
    resampled = sampler.fit_resample(x_train_features, x_train_labels)

    return resampled


def resample_set(x_train_features, x_train_labels):
    """Oversamples and Undersamples dataset using SMOTETomek

    Args:
        x_train_features (numpy.array): Array containing the variables.

        x_train_labels (numpy.array): Array containing the labels for the set.

    Returns:
        Tuple containing the resampled variables and labels.
    """

    smotemek = SMOTETomek(sampling_strategy='auto')

    # Resample the data passed in, not the notebook-level x_train / y_train
    # globals the previous version read by mistake.
    return smotemek.fit_resample(x_train_features, x_train_labels)

In [None]:
# Cast the categorical columns to the pandas 'category' dtype...
cat_vars = ['Sex', 'BP', 'Cholesterol', 'Drug']

for column in cat_vars:
    fullDf[column] = fullDf[column].astype('category')

# Label-encode each categorical column, keeping its label -> integer mapping...
enc_sex, sex_map = encode_cat(fullDf['Sex'])
enc_cho, cho_map = encode_cat(fullDf['Cholesterol'])
enc_BP, bp_map = encode_cat(fullDf['BP'])
enc_drug, drug_map = encode_cat(fullDf['Drug'])

# Attach the encoded columns to the frame. enc_sex is deliberately left out:
# the model was overfitting slightly when sex was included in the set.
fullDf['enc_cho'] = enc_cho
fullDf['enc_BP'] = enc_BP
fullDf['enc_drug'] = enc_drug

# Remove the original, unencoded columns in a single call...
fullDf.drop(columns=['Sex', 'Cholesterol', 'BP', 'Drug'], inplace=True)

# Separate the features (everything except enc_drug) from the label column,
# then hold out 33% of the rows for testing with a fixed seed...
features = fullDf.drop(columns='enc_drug')
labels = fullDf['enc_drug']
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

# Resample the training split only...
x_train_resampled, y_train_resampled = resample_set(x_train, y_train)

## Load

Writes training and testing facts to file.

In [None]:
# Every (frame, destination) pair below is written with to_csv, in this order.
outputs = [
    # Encoded fullDf
    (fullDf, '/kaggle/working/Fact_Encoded_fullDf.csv'),
    # Imbalanced train/test features and labels
    (x_train, '/kaggle/working/Fact_imb_train_features.csv'),
    (y_train, '/kaggle/working/Fact_imb_train_labels.csv'),
    (x_test, '/kaggle/working/Fact_imb_test_features.csv'),
    (y_test, '/kaggle/working/Fact_imb_test_labels.csv'),
    # Resampled train features and labels
    (x_train_resampled, '/kaggle/working/Fact_resampled_train_features.csv'),
    (y_train_resampled, '/kaggle/working/Fact_resampled_train_labels.csv'),
]

for frame, destination in outputs:
    frame.to_csv(destination)