In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split as tts
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.datasets import fetch_california_housing
import random
random.seed(1)
from IPython.display import clear_output
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import VarianceThreshold
from scipy import stats
from sklearn.neighbors import LocalOutlierFactor


<h1>Load ML/Metric module</h1>

In [2]:
def lr_get_mse(df, target):
    try:
        xtrain, xtest, ytrain, ytest = tts(df.loc[:, df.columns != target], df.loc[:, target], test_size=0.3, random_state=69)
        regr = LinearRegression()
        regr.fit(xtrain, ytrain)
        ypred = regr.predict(xtest)
        error = mse(ytest, ypred, squared=True) #actually RMSE here, not MSE
        return df, error
    except:
        return df, float('inf')

<h1>Load PreProc modules</h1>

In [3]:
#imputation
def apply_Simple_Imputation(df, config={}):

    strategy = config['strategy']
    missing_values = config['missing_values']

    imputer = SimpleImputer(strategy=strategy, missing_values=missing_values)
    imputed_arr = imputer.fit_transform(df)
    imputed_df = pd.DataFrame(imputed_arr, columns=df.columns)

    return imputed_df

def apply_most_frequent_value_imputer(df, target, config={}, eval_method=lr_get_mse):
    config['strategy'] = 'most_frequent'

    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    config['missing_values'] = np.nan
    imputed_numeric_columns = apply_Simple_Imputation(numeric_columns, config)
    if len(categorical_columns.columns) > 0:
        config['missing_values'] = "NaN"
        imputed_categorical_columns = apply_Simple_Imputation(categorical_columns, config)
    else:
        imputed_categorical_columns = categorical_columns

    processed_dataset = pd.concat([imputed_numeric_columns, imputed_categorical_columns], axis=1)

    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

def apply_mean_imputer(df, target, config={}, eval_method=lr_get_mse):
    config['strategy'] = 'mean'

    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    config['missing_values'] = np.nan
    imputed_numeric_columns = apply_Simple_Imputation(numeric_columns, config)

    processed_dataset = pd.concat([imputed_numeric_columns, categorical_columns], axis=1)

    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

def apply_median_imputer(df, target, config={}, eval_method=lr_get_mse):
    config['strategy'] = 'median'

    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    config['missing_values'] = np.nan
    imputed_numeric_columns = apply_Simple_Imputation(numeric_columns, config)

    processed_dataset = pd.concat([imputed_numeric_columns, categorical_columns], axis=1)

    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')
    
#normalization
def apply_Z_Score_Normalization(df, target, config={}, eval_method=lr_get_mse):

    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]
    normalizer = StandardScaler().fit(nontarget_columns)
    normalized_arr = normalizer.transform(nontarget_columns)
    normalized_df = pd.DataFrame(normalized_arr, columns=nontarget_columns.columns)

    processed_dataset = pd.concat([normalized_df, categorical_columns, target_column], axis=1)

    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')
    
def apply_Min_Max_Normalization(df, target, config={}, eval_method=lr_get_mse):

    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]

    normalizer = MinMaxScaler().fit(nontarget_columns)
    normalized_arr = normalizer.transform(nontarget_columns)
    normalized_df = pd.DataFrame(normalized_arr, columns=nontarget_columns.columns)

    processed_dataset = pd.concat([normalized_df, categorical_columns, target_column], axis=1)

    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

def apply_Quantile_Normalization(df, target, config={ "n_quantiles": 10, "random_state": 0 }, eval_method=lr_get_mse):

    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]

    n_quantiles = config["n_quantiles"]
    random_state = config["random_state"]

    normalizer = QuantileTransformer(n_quantiles=n_quantiles, random_state=random_state)
    normalized_arr = normalizer.fit_transform(nontarget_columns)
    normalized_df = pd.DataFrame(normalized_arr, columns=nontarget_columns.columns)

    processed_dataset = pd.concat([normalized_df, categorical_columns, target_column], axis=1)

    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')
    
def apply_Missing_Ratio_Feature_Selection(df, target, config={ "threshold": 0.2 }, eval_method=lr_get_mse):

    target_column = df.loc[:, target]
    nontarget_columns = df.loc[:, df.columns != target]

    missing_series = nontarget_columns.isnull().sum() / nontarget_columns.shape[0]

    missing_stats = pd.DataFrame(missing_series).rename(
        columns={'index': 'feature', 0: 'missing_fraction'})

    # Sort with highest number of missing values on top
    missing_stats = missing_stats.sort_values('missing_fraction', ascending=False)

    # Find the columns with a missing percentage above the threshold
    record_missing = pd.DataFrame(missing_series[missing_series >
                                  config['threshold']]).reset_index().\
        rename(columns={'index': 'feature', 0: 'missing_fraction'})

    to_drop = list(record_missing['feature'])
    to_keep = set(nontarget_columns.columns) - set(to_drop)

    processed_dataset = pd.concat([nontarget_columns[list(to_keep)], target_column], axis=1)
    
    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')
    
#feature selection
def apply_K_Best_Feature_Selection(df, target, config={ "k": 5 }, eval_method=lr_get_mse):
    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]

    # Extract +ve columns
    lsv = list(nontarget_columns.lt(0).sum().values)
    lis = list(nontarget_columns.lt(0).sum().index)
    to_remove = []
    for i in range(0, len(lsv)):
        if lsv[i] > 0:
            to_remove.append(lis[i])
    lis = list(filter(lambda x : x not in to_remove, lis))
    
    selection = nontarget_columns
    if len(lis) > 0:
        filtered_df = nontarget_columns[lis]
        try:
            selector = SelectKBest(chi2, k=config["k"])
            selector.fit(filtered_df, target_column)
                                                    
            cols = selector.get_support(indices=True)
        except:
            return df, float('inf')
        selection = filtered_df.iloc[:,cols]
    
    processed_dataset = pd.concat([selection, categorical_columns, target_column], axis = 1)
    
    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

def apply_Variance_Based_Feature_Selection(df, target, config={ "threshold": 0 }, eval_method=lr_get_mse):
    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]
    
    selector = VarianceThreshold(threshold=config["threshold"])
    try:
        selector.fit(nontarget_columns, target_column)
        cols = selector.get_support(indices=True)
        selection = nontarget_columns.iloc[:,cols]
    except Exception as e:
        selection = nontarget_columns
    
    processed_dataset = pd.concat([selection, categorical_columns, target_column], axis = 1)
    
    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

#outlier detection
def apply_MAD_Score_Based_Outlier_Detection(df, target, config={ "threshold": 3.0, "ratio": 0.3 }, eval_method=lr_get_mse):
    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]
    
    if df.isnull().values.any() > 0:
        return df, float('inf')
    
    median = nontarget_columns.apply(np.median, axis=0)

    # median_absolute_deviation = 1.4296 * \
    #     np.abs(nontarget_columns - median).apply(np.median, axis=0)
    median_absolute_deviation = stats.median_abs_deviation(nontarget_columns, scale=1)

    modified_z_scores = (nontarget_columns - median) / median_absolute_deviation

    outliers = nontarget_columns[np.abs(modified_z_scores) > config["threshold"]]

    to_drop = outliers[(outliers.count(axis=1) /
                        outliers.shape[1]) > config["ratio"]].index

    to_keep = set(nontarget_columns.index) - set(to_drop)
    
    if (config["ratio"] == -1):
        filtered_df = nontarget_columns[~(np.abs(modified_z_scores) > config["threshold"]).any(axis=1)]
    else:
        filtered_df = nontarget_columns.loc[list(to_keep)]
        
    processed_dataset = pd.concat([filtered_df, categorical_columns, target_column], axis = 1)
    processed_dataset.dropna(inplace=True)
    
    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

def apply_Inter_Quantile_Range_Outlier_Detection(df, target, config={ "ratio": 0.3 }, eval_method=lr_get_mse):
    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]
    
    if nontarget_columns.isnull().values.any() > 0:
        return df, float('inf')
    
    Q1 = nontarget_columns.quantile(0.25)
    Q3 = nontarget_columns.quantile(0.75)
    
    inter_quatile_range = Q3 - Q1

    outliers = nontarget_columns[((nontarget_columns < (Q1 - 1.5 * inter_quatile_range)) |\
                                  (nontarget_columns > (Q3 + 1.5 * inter_quatile_range)))]

    to_drop = outliers[(outliers.sum(axis=1) /
                        outliers.shape[1]) > config["ratio"]].index


    to_keep = set(nontarget_columns.index) - set(to_drop)
    
    if (config["ratio"] == -1):
        filtered_df = nontarget_columns[~(filter_criteria).any(axis=1)]
    else:
        filtered_df = nontarget_columns.loc[list(to_keep)]
        
    processed_dataset = pd.concat([filtered_df, categorical_columns, target_column], axis = 1)
    processed_dataset.dropna(inplace=True)
    
    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')
    
def apply_Local_Factor_Outlier_Detection(df, target, config={ "n_neighbors": 4, "contamination": 0.1, "threshold": 30 }, eval_method=lr_get_mse):
    numeric_columns, categorical_columns = get_numeric_categorical_columns(df)
    target_column = numeric_columns.loc[:, target]
    nontarget_columns = numeric_columns.loc[:, numeric_columns.columns != target]
    
    if nontarget_columns.isnull().values.any() > 0:
        return df, float('inf')
    
    clf = LocalOutlierFactor(n_neighbors=4, contamination=0.1)
    clf.fit_predict(nontarget_columns)
    
    LOF_scores = clf.negative_outlier_factor_
    
    k = config["threshold"]
    top_k_idx = np.argsort(LOF_scores)[-k:]
    top_k_values = [LOF_scores[i] for i in top_k_idx]

    filtered_df = nontarget_columns[LOF_scores < top_k_values[0]]
        
    processed_dataset = pd.concat([filtered_df, categorical_columns, target_column], axis = 1)
    processed_dataset.dropna(inplace=True)
    
    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

def apply_Exact_Duplicate_Detection(df, target, config={}, eval_method=lr_get_mse):
    processed_dataset = df.drop_duplicates()
    
    try:
        dataframe, mse = eval_method(processed_dataset, target)
        return processed_dataset, mse
    except:
        return processed_dataset, float('inf')

configs = {
    'KNN': { "n_neighbors": 2, "weights": "uniform" },
    "MFV": {},
    "MEA": {},
    "MED": {},
    "ZSC": {},
    "MMN": {},
    "QDS": { "n_quantiles": 10, "random_state": 0 },
}

<h1>Load DataFrame</h1>

In [4]:
df, target = fetch_california_housing(return_X_y = True, as_frame = True)
df['target'] = target

In [5]:
for col in df.columns:
    df.loc[df.sample(frac=0.2).index, col] = np.nan
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,3.867251,28.650012,5.43833,1.096746,1423.570736,3.103954,35.633244,-119.573831,2.068236
std,1.905708,12.583574,2.606967,0.486248,1118.596895,11.603814,2.137922,2.004813,1.15207
min,0.4999,1.0,0.888889,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.559725,18.0,4.442904,1.005913,786.75,2.428016,33.93,-121.8,1.19675
50%,3.5234,29.0,5.228085,1.048827,1165.0,2.816651,34.26,-118.5,1.805
75%,4.739025,37.0,6.051724,1.099695,1724.0,3.281287,37.72,-118.01,2.649
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


<h1>Define Util functions</h1>

In [6]:
def get_numeric_categorical_columns(df):
    categorical_columns = []
    numeric_columns = []
    for col in df.columns:
        if df[col].map(type).eq(str).any():
            categorical_columns.append(col)
        else:
            numeric_columns.append(col)
    return pd.DataFrame(df[numeric_columns]), pd.DataFrame(df[categorical_columns])
numeric_columns, categorical_columns = get_numeric_categorical_columns(df)

def epsilon(choices, q_vals, e):
    num = random.randrange(0,100)/100
    if(e >= num):
        max_q = max(q_vals)
        max_indices = [i for i, val, in enumerate(q_vals) if val == max_q]
        if len(max_indices) > 1:
            choice = random.randrange(0,len(max_indices))
            index = max_indices[choice]
        else:
            index = max_indices[0]
        return choices[index]
    else:
        index = random.randrange(0, len(choices))
        return choices[index]

<h1>Define PreProcMachine</h1>

In [7]:
def preprocmachine (df, target, goal = 'LinReg', gamma = 0.8):

    numExpl = 300
    
    start = [
        lr_get_mse
    ]
    imputation = [
        #apply_KNN_Imputation,
        apply_most_frequent_value_imputer, 
        apply_mean_imputer, 
        apply_median_imputer, 
    ]
    outlier_detection = [
        apply_MAD_Score_Based_Outlier_Detection,
        #apply_Inter_Quantile_Range_Outlier_Detection,
        apply_Local_Factor_Outlier_Detection
    ]
    normalization = [
        apply_Z_Score_Normalization, 
        apply_Min_Max_Normalization, 
        apply_Quantile_Normalization, 
    ]
    feature_selection = [
        apply_Missing_Ratio_Feature_Selection,
        apply_K_Best_Feature_Selection,
        apply_Variance_Based_Feature_Selection,
    ]
    goal = [
        lr_get_mse
    ]
    
    statefuncs = start + imputation + outlier_detection + normalization + feature_selection + goal
    statenames = []
    for i in statefuncs:
        statenames.append(i.__name__)
        
    #generate r table
    threshold1 = len(start) - 1
    threshold2 = threshold1 + len(imputation)
    threshold3 = threshold2 + len(outlier_detection)
    threshold4 = threshold3 + len(normalization)
    threshold5 = threshold4 + len(feature_selection)
    threshold6 = threshold5 + len(goal)

    r_table = []
    for i in range(len(statefuncs)):
        temp = []
        for j in range(len(statefuncs)):
            if i == len(statefuncs) - 1:
                temp.append(0)
            elif j == len(statefuncs) - 1:
                temp.append(1)
            elif (i > threshold5 and j <= threshold6) or (i > threshold4 and j <= threshold5) or (i > threshold3 and j <= threshold4) or (i > threshold2 and j <= threshold3) or (i > threshold1 and j <= threshold2) or i == j:
                temp.append(-1)
            else:
                temp.append(0)
        r_table.append(temp)
        
#     initialize Q matrix
    
    q_table = []
    for i in range(len(statefuncs)):
        temp = []
        for j in range(len(statefuncs)):
            temp.append(0)
        q_table.append(temp)

#     establish possible entry points
    entry_range = 0
    goal_state = len(statefuncs) - 1

    safecopy = pd.DataFrame.copy(df)
    
    for i in range(numExpl):
        
        clear_output(wait=False)
        print("Starting exploration", i + 1 , "/", numExpl)
        state = 0
        route = [] # will store state history for this exploration
        total_q = 0
        route.append(state)
        currExploreDF = pd.DataFrame.copy(df)
        currExploreDF, currExploreError = statefuncs[state](currExploreDF, target)
        
        while state != goal_state:
            possible_states = [i for i, val in enumerate(r_table[state]) if val >= 0]
            possible_qs = [q_table[state][i] for i in possible_states]
            prob = (i/numExpl) * 0.7
            next_state = epsilon(possible_states, possible_qs, prob)
            print("Moved from", state, "-->", next_state)
            
            #apply new state's proc to current df
            currExploreDF, newExploreError = statefuncs[next_state](currExploreDF, target)

            if currExploreError == float('inf') and newExploreError == float('inf'):
                newReward = -1
            elif currExploreError == float('inf'):
                newReward = 1
            elif newExploreError == float('inf'):
                newReward = -2
            else:
                newReward = ((currExploreError - newExploreError) / currExploreError) * 100
            reward = newReward
            currExploreError = newExploreError
            
            #find max Q value out of all possible actions from new state
            possible_states = [i for i, val in enumerate(r_table[next_state]) if val >= 0]
            possible_qs = [q_table[next_state][i] for i in possible_states]
            max_q = max(possible_qs)
            
            #calculate new Q value for this state
            q_table[state][next_state] = reward + gamma * max_q + r_table[state][next_state]
            
            total_q += q_table[state][next_state]
            
            #move to next state
            state = next_state
            
            #record path
            route.append(state)
    
    print("Current Q table")
    for i in range(0, len(q_table)):
        print(q_table[i])
        
    # find best route
    best_route = []
    q_vals = []
    state = 0
    q = 0
    route.append(state)
    while state != len(statefuncs) - 1:
        possible_states = [i for i, val in enumerate(r_table[state]) if val >= 0]
        possible_qs = [q_table[state][i] for i in possible_states]
        #choose highest q and get corresponding state
        max_q = max(possible_qs)
        q_vals.append(max_q)
        q += max_q
        index = possible_qs.index(max_q)
        next_state = possible_states[index]
        best_route.append(next_state)
        state = next_state
    
    #print route in names
    route_names = []
    for i in best_route:
        route_names.append(statenames[i])
    print(best_route)
    print(route_names)
    print(q_vals)
        
    # run best route
    df1, initialerror = lr_get_mse(safecopy, 'target')
    df2 = pd.DataFrame.copy(safecopy)
    finalerror = 0
    for i in range(0, len(best_route) - 1):
        df2, finalerror = statefuncs[best_route[i]](df2, target)
    print("MSE:", initialerror, "-->", finalerror)
    print("Diff:", initialerror - finalerror)
    return df2

<h1>Run PreProcMachine</h1>

In [8]:
procd_dataframe = preprocmachine(df, 'target')

Starting exploration 300 / 300
Moved from 0 --> 11
Moved from 11 --> 12
Current Q table
[0, 2.7158584896984728, 4.63578532473864, 11.946395015691262, -0.19999999999999996, -1.0, -1.0, -1.0, -0.19999999999999996, -1.0, 0.6000000000000001, -0.19999999999999996, 0.0]
[0, 0, 0, 0, -1.5733499892467788, 2.144823112123091, 1.2800000000000002, 0.0, 0, 0, -1.2, 0, 1.0]
[0, 0, 0, 0, 4.5447316559233, 0, 1.2800000000000002, 0.0, -4.804920762096151, 0, 0, 0.0, 0]
[0, 0, 0, 0, 12.882993769614076, 2.128399088408129, 0.0, 0.8, -2.1227010704456646, 4.5899870635425064e-14, -0.3999999999999999, 0.8, 1.0]
[0, 0, 0, 0, 0, 0, -2.0, -1.0, -2.0, -1.7562537488964448e-13, -0.3999999999999999, 0.0, 0.0]
[0, 0, 0, 0, 0, 0, -2.0, -0.3599999999999999, -1.0, -0.19999999999999996, -1.0, -0.19999999999999996, 1.0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, -1.0, -1.0, -1.0, 0.0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, -1.0, -1.0, -0.19999999999999996, 1.0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, -0.19999999999999996, -1.0, -0.19999999999999996, 0.0]
[0, 0,

In [9]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,,1.02381,322.0,2.555556,37.88,-122.23,
1,,21.0,6.238137,0.97188,,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,,,2.80226,37.85,,3.521
3,5.6431,,5.817352,1.073059,558.0,2.547945,,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,,37.85,-122.25,3.422


In [10]:
procd_dataframe.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
1,3.5234,21.0,6.238137,0.97188,1165.0,2.109842,37.86,-122.22,3.585
3,5.6431,29.0,5.817352,1.073059,558.0,2.547945,34.26,-122.25,3.413
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,34.26,-122.26,2.267
12,3.075,29.0,5.228085,1.012821,1165.0,2.346154,37.85,-122.26,2.135
14,3.5234,29.0,4.262903,1.048827,1165.0,1.954839,37.85,-122.26,1.592
