In [3]:
import os
import numpy as np
import pandas as pd
import feather
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from pathlib import Path

In [4]:
#Reset Working Directory
# os.chdir("c:\\Users\\Andrew\\Documents\\Uni Trier\\Semester 3\\Case Study\\ML Algo")

In [5]:
#Set top level directory, path for reading feather and output folder for the results
#NOTE: later cells call os.chdir, so re-running this cell after them would capture a different top_wd; run it on a fresh kernel
top_wd = os.getcwd()
#Paths are assembled with os.path.join / pathlib rather than hard-coded "\\" so they also resolve on non-Windows systems
feather_dir = os.path.join(top_wd, "Feather")
results_dir = os.path.join(top_wd, "Results")
os.makedirs(results_dir, exist_ok=True)
#The holdout data is read from a folder next to (not inside) the working directory
holdout_dir = str(Path(top_wd).parent / "AMELIA" / "AMELIA_P_level_v0.2.3 (Person-Level)")

In [6]:
#Numeric recoding of the Sex variable
def to_numeric(dataframe):
    """Return a copy of ``dataframe`` with the ``Sex`` labels recoded as integers.

    "Male" becomes 0 and "Female" becomes 1; any other value in the column and
    all other columns are left as they are. The input frame is not modified.
    """
    sex_codes = {"Male": 0, "Female": 1}
    return dataframe.replace(to_replace={"Sex": sex_codes})

In [None]:
#Function to one-hot encode all categorical variables
def cata_encode(dataframe):
    """One-hot encode the categorical columns and drop one reference level per variable.

    get_dummies keeps a column for every level, so the last level of each
    variable is dropped explicitly to avoid multicollinearity:
    Work_Status 4 (other inactive person), Highest_ISCED 5 (ISCED5 or ISCED6),
    Martial_Status 5 (divorced), Regional_ID 4, Province 11 and District 40.
    City.Community is dropped entirely: there are too many cities and they are
    not considered relevant for the analysis.

    Returns a new DataFrame; raises KeyError if any of the dropped columns is
    absent after encoding.
    """
    categorical = ["Work_Status", "Highest_ISCED", "Martial_Status", "Regional_ID", "Province", "District"]

    #Readable names for the dummy columns that are kept (Province/District keep their generated names)
    readable_names = {
        "Work_Status_1.0": "At Work",
        "Work_Status_2.0": "Unemployed",
        "Work_Status_3.0": "Retired",
        "Highest_ISCED_1.0": "ISCED 1",
        "Highest_ISCED_2.0": "ISCED 2",
        "Highest_ISCED_3.0": "ISCED 3",
        "Highest_ISCED_4.0": "ISCED 4",
        "Martial_Status_1.0": "Never Married",
        "Martial_Status_2.0": "Married",
        "Martial_Status_3.0": "Separated",
        "Martial_Status_4.0": "Widowed",
        "Regional_ID_1": "Region_1",
        "Regional_ID_2": "Region_2",
        "Regional_ID_3": "Region_3",
    }

    #Reference levels plus the city identifier
    dropped = [
        "Work_Status_4.0",
        "Highest_ISCED_5.0",
        "Martial_Status_5.0",
        "City.Community",
        "Regional_ID_4",
        "Province_11",
        "District_40",
    ]

    encoded = pd.get_dummies(dataframe, columns=categorical)
    encoded = encoded.rename(columns=readable_names)
    return encoded.drop(columns=dropped)

In [7]:
#Store Holdout set for testing
os.chdir(holdout_dir)
Holdout = feather.read_dataframe("Holdout.feather")
Holdout_y = Holdout["Person_Income"]
Holdout_x = Holdout.drop(columns = ["index", "Person_Income", "Personal_ID"])

#Keep the raw (unencoded, unscaled) features so predictions can be compared against them later
Holdout_unscaled = Holdout_x
Holdout_x = to_numeric(Holdout_x)
Holdout_x = cata_encode(Holdout_x)

#The scaler returns a bare ndarray, so the column names have to be restored afterwards.
#They must come from the encoded frame: one-hot encoding changes the column set, so the
#raw frame's names no longer match the array's shape.
#NOTE(review): the scaler is fitted on the holdout itself rather than on the training data - confirm this is intended
sc = StandardScaler()
Holdout_encoded_columns = Holdout_x.columns
Holdout_x = pd.DataFrame(sc.fit_transform(Holdout_x), columns=Holdout_encoded_columns)

In [11]:
S_Methods = ["SRS", "Stratified", "Cluster"]
#File-name tag of each sample type mapped to the label used in the dictionary keys
Sample_Types = {"base": "Base", "importance": "Importance", "synthetic": "Synthetic"}

#Create a dictionary for accessing all dataframes, keyed "<sampling method>_<sample type>_<run>"
df_dict = {}
#All sample files live in the same folder, so the directory only needs to be set once
os.chdir(feather_dir)
for j in S_Methods:
    for i in range(10):
        for file_tag, label in Sample_Types.items():
            #NOTE(review): for run 10 this prefix is "010", not "10" - confirm it matches the file names on disk
            sample = feather.read_dataframe(f"0{i+1}_{j}_{file_tag}_sample.feather")

            #Apply numeric conversion (Male:0, Female:1) and one-hot encoding to this sample itself
            df_dict[f"{j}_{label}_{i+1}"] = cata_encode(to_numeric(sample))

In [12]:
#Create a function for handling the train test split
def t_t_split(dataframe):
    """Split a sample into scaled train/test features and their targets.

    The first column of ``dataframe`` is used as the target (presumably the
    income column - verify against the sample files); the remaining columns,
    minus the ``Personal_ID`` and ``index`` identifiers, are the features.

    Returns:
        x_train, x_test: standardised feature DataFrames (75% / 25% split).
        y_train, y_test: the matching target Series.
        x_test_unscaled: the test features before scaling, for later comparison.
    """
    #Split x and y vars (also drop personal ID identifier and index)
    x_var = dataframe.iloc[:,1:].drop(columns = ["Personal_ID", "index"])
    y_var = dataframe.iloc[:,0]

    #Create Train/Test split for x and y
    x_train, x_test, y_train, y_test = train_test_split(x_var, y_var, test_size = 0.25, random_state = 420)

    #Keep a copy of unscaled x_test for later comparison
    x_test_unscaled = x_test

    #Fit the scaler on the training data only and reuse its mean/variance for the test data.
    #Re-fitting on the test split would put the two sets on different scales and leak test statistics.
    sc = StandardScaler()
    x_train = pd.DataFrame(sc.fit_transform(x_train), columns=x_var.columns)
    x_test = pd.DataFrame(sc.transform(x_test), columns=x_var.columns)

    return x_train, x_test, y_train, y_test, x_test_unscaled

In [106]:
#Loop through Sampling methods and run our NN on each one, storing the score and results
def results(df_dict):
    """Train and evaluate one MLP regressor per sampling method, sample type and run.

    Args:
        df_dict: dict of encoded sample DataFrames keyed
            "<sampling method>_<sample type>_<run>" (run = 1..10).

    Reads the notebook-level ``Holdout_x``, ``Holdout_y`` and ``Holdout_unscaled``
    for the holdout evaluation.

    Returns:
        scores: DataFrame with one row per run and one
            "<sampling method>_<sample type>" column of test-split scores.
        predicts_dict: per-key DataFrame of test targets, clipped predictions and
            the unscaled test features.
        scores_holdout: like ``scores`` but evaluated on the holdout set.
        holdout_p_dict: like ``predicts_dict`` but for the holdout set.
    """
    S_Methods = ["SRS", "Stratified", "Cluster"]
    s_names = ["Base", "Importance", "Synthetic"]
    n_runs = 10

    #Scores are collected as plain lists per column and turned into DataFrames once at the end;
    #this avoids growing a DataFrame cell by cell (DataFrame.append no longer exists in pandas 2.x)
    score_columns = {}
    holdout_score_columns = {}
    predicts_dict = {}
    holdout_p_dict = {}

    for S_Method in S_Methods:
        for Method in s_names:
            column = f"{S_Method}_{Method}"
            score_columns[column] = []
            holdout_score_columns[column] = []

            for i in range(n_runs):
                key = f"{column}_{i+1}"
                x_train, x_test, y_train, y_test, x_test_unscaled = t_t_split(df_dict[key])

                #Run Multi-layer Perceptron regressor
                regr = MLPRegressor(random_state=420, max_iter=500, activation = "relu", solver='lbfgs').fit(x_train, y_train)

                #TODO: write a scoring and predictions function for holdout set
                #Scores are computed on the raw model output, before negatives are clipped below
                score_columns[column].append(regr.score(x_test, y_test))
                holdout_score_columns[column].append(regr.score(Holdout_x, Holdout_y))

                #Generate Predictions
                predictions = regr.predict(x_test)
                holdout_p = regr.predict(Holdout_x)

                #Replace negative outputs with 0
                predictions = np.where(predictions < 0, 0, predictions)
                holdout_p = np.where(holdout_p < 0, 0, holdout_p)

                #Convert Predictions from ndarray to DataFrame for concat
                predictions = pd.DataFrame(data=predictions, columns=["Predictions"])
                holdout_p = pd.DataFrame(data=holdout_p, columns=["Predictions"])

                #Store y, predictions and the unscaled features side by side (for comparison);
                #indexes are reset so the three pieces align row by row
                predicts_dict[key] = pd.concat([y_test.reset_index(drop=True), predictions, x_test_unscaled.reset_index(drop=True)], axis = 1)
                holdout_p_dict[key] = pd.concat([Holdout_y.reset_index(drop=True), holdout_p, Holdout_unscaled.reset_index(drop=True)], axis = 1)

    scores = pd.DataFrame(score_columns)
    scores_holdout = pd.DataFrame(holdout_score_columns)

    return scores, predicts_dict, scores_holdout, holdout_p_dict

In [None]:
#Train and evaluate every model; unpacks the test-split scores, test-split predictions,
#holdout scores and holdout predictions returned by results()
#Runtime for SRS: 49 minutes, 25 seconds
scores, predicts_dict, scores_holdout, holdout_p_dict = results(df_dict)

In [113]:
#This has already been referenced previously but here just in case its missed
S_Methods = ["SRS", "Stratified", "Cluster"]
os.chdir(results_dir)

#Number the runs 1..n. The index is assigned rather than incremented so that
#re-running this cell does not shift it again.
scores.index = pd.RangeIndex(1, len(scores) + 1)
scores_holdout.index = pd.RangeIndex(1, len(scores_holdout) + 1)
feather.write_dataframe(scores,"scores.feather")
feather.write_dataframe(scores_holdout,"scores_holdout.feather")

#One file per sampling method, sample type and run. The sampling method is part of the
#file name; without it every pass of the outer loop would overwrite the previous one.
#NOTE(review): for run 10 the "0{i+1}" prefix is "010" - confirm downstream readers expect that
for j in S_Methods:
    for i in range(10):
        for Method in ["Base", "Importance", "Synthetic"]:
            key = f"{j}_{Method}_{i+1}"
            feather.write_dataframe(predicts_dict[key],f"0{i+1}_{j}_{Method}_MLPR_results.feather")
            feather.write_dataframe(holdout_p_dict[key],f"0{i+1}_{j}_{Method}_Holdout_results.feather")