In [1]:
import pandas as pd
import numpy as np

In [1]:
def train_data_clean(file_csv, impute = None):
    """Load a training CSV and fill its missing values separately per class.

    Written for the APS project CSV layout: the file must contain an
    'index' column (dropped) and a 'class' column holding 'neg' / 'pos'
    (mapped to 0 / 1). Missing cells are spelled 'na' in the file; they are
    first turned into the numeric sentinel -1 and then imputed, so a genuine
    -1 in the data is treated as missing as well.

    Parameters
    ----------
    file_csv : str, path or file-like
        Anything accepted by ``pd.read_csv``.
    impute : {'median', 'Median', 'mode', 'Mode', 'mean', 'Mean'} or other
        Statistic used to fill missing cells. The statistic is computed per
        column over the known values of the positive rows and of the
        negative rows independently. Any other value (including ``None``)
        selects zero filling.

    Returns
    -------
    pandas.DataFrame
        Zero filling: the whole frame in file order with the sentinel
        replaced by 0. Otherwise: the imputed positive rows followed by the
        imputed negative rows with a fresh 0..n-1 index; rows whose label is
        neither 'neg' nor 'pos' are not part of that result.

    Notes
    -----
    For the statistic-based modes the number of remaining NaN cells is
    printed for the positive and then the negative group. NaN remains when
    a column has no known value inside a group.
    """
    sentinel = -1  # numeric stand-in for the 'na' marker

    df = pd.read_csv(file_csv)
    df = df.replace('na', str(sentinel))
    df = df.drop('index', axis = 1)
    # Label encoding (not one-hot): neg -> 0, pos -> 1, anything else -> NaN.
    df['class'] = df['class'].map({'neg': 0, 'pos': 1})

    # Columns that contained 'na' were parsed as text; cast them to float.
    # Checking "not numeric" instead of "== object" also covers string dtypes.
    text_cols = [col for col in df.columns
                 if not pd.api.types.is_numeric_dtype(df[col].dtype)]
    if text_cols:
        df = df.astype({col: 'float64' for col in text_cols})

    if impute in ('median', 'Median'):
        how = 'median'
    elif impute in ('mode', 'Mode'):
        how = 'mode'
    elif impute in ('mean', 'Mean'):
        how = 'mean'
    else:
        return df.replace(sentinel, 0)

    def _fill_group(group):
        # Impute one class group in place from its own known values.
        known = group.where(group != sentinel)
        if how == 'median':
            stats = known.median(axis = 0)
        elif how == 'mean':
            stats = known.mean(axis = 0)
        else:
            modes = known.mode(axis = 0)
            # A group without rows has no mode row; fall back to NaN
            # instead of failing on .iloc[0].
            if len(modes):
                stats = modes.iloc[0]
            else:
                stats = pd.Series(np.nan, index = group.columns)
        # Look statistics up by column label so the label column is skipped
        # by name rather than by assuming it sits at position 0.
        for col in group.columns.drop('class'):
            gaps = group[col] == sentinel
            if gaps.any():
                group.loc[gaps, col] = stats[col]
        print('Nan: {}'.format(np.sum(group.isnull().values)))
        return group

    df_pos = _fill_group(df[df['class'] == 1].reset_index(drop = True))
    df_neg = _fill_group(df[df['class'] == 0].reset_index(drop = True))
    # DataFrame.append no longer exists in pandas 2.x; concat is equivalent.
    return pd.concat([df_pos, df_neg], ignore_index = True)

In [None]:
def test_data_clean(test_file_csv, train_file_csv, impute = None):
    # this function only works for APS project
    # impute has 4 options: median, mode, mean and zero
    # test dataset impute is based on train dataset
    symbol = '-1'
    df_train = pd.read_csv(train_file_csv) # load csv
    df_test  = pd.read_csv(test_file_csv)
    
    df_train = df_train.replace('na', symbol) # replace 'na' to '-1'
    df_test  = df_test.replace('na', symbol)
    
    df_train = df_train.drop('index', axis = 1) # drop useless columns
    df_test  = df_test.drop('index', axis = 1)
    
    # one hot encoding
    df_train['class']  = df_train['class'].map({'neg': 0, 'pos': 1})
    df_test['class']   = df_test['class'].map({'neg': 0, 'pos': 1})
    
    # change all object to numerical
    cols = [col for col in df_train.columns.values if df_train[col].dtypes == 'object']
    df_train[cols] = df_train[cols].astype('float64')
    df_test[cols]  = df_test[cols].astype('float64')

    # imputation value
    if impute == 'median' or impute == 'Median':
        var = df_train[df_train != int(symbol)].median(axis = 0).values
    elif impute == 'mode' or impute == 'Mode':
        var = df_train[df_train != int(symbol)].mode(axis = 0).iloc[0].values
    elif impute == 'mean' or impute == 'Mean':
        var = df_train[df_train != int(symbol)].mean(axis = 0).values
    else:
        return df_test.replace(int(symbol), 0)
    # impute
    i = 1
    for col in df_test.columns[1:]:
        df_test.loc[df_test[col] == int(symbol), col] = var[i]
        i += 1
    print('Nan: {}'.format(np.sum(df_test.isnull().values)))
    return df_test

In [None]:
def test_data_clean_v2(test_file_csv, cleaned_train_dataframe, impute = None):
    # this function only works for APS project
    # impute has 4 options: median, mode, mean and zero
    # test dataset impute is based on train dataset
    symbol = '-1'
    df_test  = pd.read_csv(test_file_csv) # load csv
    
    df_test  = df_test.replace('na', symbol) # replace 'na' to '-1'
    
    df_test  = df_test.drop('index', axis = 1) # drop useless columns
    
    # one hot encoding
    df_test['class']   = df_test['class'].map({'neg': 0, 'pos': 1})
    
    # change all object to numerical
    cols = [col for col in df_test.columns.values if df_test[col].dtypes == 'object']
    df_test[cols]  = df_test[cols].astype('float64')

    # imputation value
    if impute == 'median' or impute == 'Median':
        var = cleaned_train_dataframe.median(axis = 0).values
    elif impute == 'mode' or impute == 'Mode':
        var = cleaned_train_dataframe.mode(axis = 0).iloc[0].values
    elif impute == 'mean' or impute == 'Mean':
        var = cleaned_train_dataframe.mean(axis = 0).values
    else:
        return df_test.replace(int(symbol), 0)
    # impute
    i = 1
    for col in df_test.columns[1:]:
        df_test.loc[df_test[col] == int(symbol), col] = var[i]
        i += 1
    print('Nan: {}'.format(np.sum(df_test.isnull().values)))
    return df_test