In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [18]:
def one_hot_encoding(dataframe, features=None):
    """Replace the given columns with one-hot indicator columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to encode. It is not modified; a new frame is returned.
    features : list of str, optional
        Names of the columns to encode. ``None`` or an empty list returns a
        copy of ``dataframe`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns in their original order, followed by one
        block of ``<feature>_<value>`` indicator columns per encoded feature
        (in the order given by ``features``).

    Raises
    ------
    KeyError
        If a name in ``features`` is not a column of ``dataframe`` (for
        example when the frame has already been encoded).
    """
    features = list(features) if features is not None else []

    # Build every indicator block first and concatenate once; concatenating
    # inside the loop copies the growing frame on every iteration.
    indicator_blocks = [
        pd.get_dummies(dataframe[feature], prefix=feature) for feature in features
    ]
    remaining = dataframe.drop(columns=features)

    return pd.concat([remaining] + indicator_blocks, axis=1)


def one_hot_decoding(original_dataframe_features=None, encoded_dataframe=None):
    """Collapse one-hot indicator columns back into one numeric column per feature.

    Every column of ``encoded_dataframe`` must be named ``<feature>_<value>``
    where ``<value>`` parses as a float (the naming produced by
    ``pd.get_dummies(..., prefix=feature)`` on numeric categories).

    Parameters
    ----------
    original_dataframe_features : iterable of str
        Names of the features to reconstruct.
    encoded_dataframe : pandas.DataFrame
        Frame holding only indicator columns.

    Returns
    -------
    pandas.DataFrame
        One float column per requested feature, on a fresh ``RangeIndex``.
        A row with no active indicator decodes to 0.0; a feature with no
        matching indicator columns decodes to NaN.

    Raises
    ------
    ValueError
        If a column suffix cannot be converted to float.
    """
    encoded_columns = list(encoded_dataframe)

    # Split each name on its LAST underscore: feature names such as
    # 'mood_01' contain underscores themselves.
    prefixes = [name.rsplit('_', 1)[0] for name in encoded_columns]
    category_values = np.array(
        [name.rsplit('_', 1)[-1] for name in encoded_columns], dtype='float'
    )

    # Each indicator is scaled by the category value it stands for.
    weighted = encoded_dataframe.values.astype(float) * category_values
    n_rows = weighted.shape[0]

    decoded = {}
    for feature in original_dataframe_features:
        # Exact prefix match: a substring test would let 'm' also claim the
        # columns of 'mx', or 'Pool1' those of 'Pool10'.
        positions = [i for i, prefix in enumerate(prefixes) if prefix == feature]
        if positions:
            # Exactly one indicator is active per row, so the sum is that
            # category's value. Taking the max instead would turn every
            # negative category into 0.
            decoded[feature] = weighted[:, positions].sum(axis=1)
        else:
            decoded[feature] = np.full(n_rows, np.nan)

    return pd.DataFrame(decoded, index=pd.RangeIndex(n_rows))

In [2]:
def fit_lasso_regression(dataframe, target=None, steps=100 ,l=0.02, verbose=True):
    """Fit L1-regularised linear weights by cyclic coordinate descent.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame holding the feature columns and the target column.
        NaN values in the features are treated as 0.
    target : str
        Name of the target column.
    steps : int
        Number of full sweeps over the coordinates.
    l : float
        L1 penalty; each penalised coordinate is soft-thresholded at ``l / 2``.
    verbose : bool
        When True (default), print the sweep number and the current RMSE
        before every sweep.

    Returns
    -------
    numpy.ndarray
        Weight vector of shape ``(n_features,)``.

    Raises
    ------
    ValueError
        If ``target`` does not select exactly one column.
    """
    if verbose:
        print('within lasso')

    #split the X and Y from the dataframe
    X = np.nan_to_num(dataframe.iloc[:, dataframe.columns != target].values.astype(float))
    Y = dataframe.iloc[:, dataframe.columns == target].values.astype(float)
    if Y.shape[1] != 1:
        raise ValueError('target must select exactly one column, got %d' % Y.shape[1])
    # Keep the target 1-D: subtracting an (n,) prediction from an (n, 1)
    # column broadcasts to an (n, n) matrix and corrupts every update.
    Y = Y.ravel()

    n_samples, n_features = X.shape
    # Squared column norms do not change between sweeps.
    column_norms = np.sum(X ** 2, axis=0)

    w = np.zeros(n_features)
    for k in range(steps):
        if verbose:
            print('step: ',k)
            print('RMSE: ',np.sqrt(np.mean((Y - np.dot(X, w)) ** 2)))
        for i in range(n_features):
            residual = Y - np.dot(X, w)
            if i == 0:
                # Column 0 gets an unpenalised mean-residual update, i.e. it is
                # handled like an intercept.
                # NOTE(review): this is only exact if column 0 is all ones - confirm.
                w[i] = w[i] + np.sum(residual) / n_samples
                continue

            norm = column_norms[i]
            if norm == 0:
                # An all-zero column carries no information; avoid 0/0.
                w[i] = 0
                continue

            correlation = np.dot(X[:, i], residual)
            val_1 = (-correlation + l / 2) / norm
            val_2 = (-correlation - l / 2) / norm
            # Soft-thresholding written relative to the current weight.
            if val_1 < w[i]:
                w[i] = w[i] - val_1
            elif w[i] < val_2:
                w[i] = w[i] - val_2
            else:
                w[i] = 0
    return w

In [3]:
def fit_ridge_regression(dataframe, target=None, steps=100 ,l=0.02):
    """Fit ridge-regression weights in closed form.

    Solves ``(X^T X + l * I) w = X^T Y`` for ``w``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame holding the feature columns and the target column.
        NaN values in the features are treated as 0.
    target : str
        Name of the target column.
    steps : int
        Unused; kept so the signature matches ``fit_lasso_regression``.
    l : float
        L2 penalty added to the diagonal of ``X^T X``.

    Returns
    -------
    numpy.ndarray
        Weight column vector of shape ``(n_features, 1)``.

    Raises
    ------
    ValueError
        If ``target`` does not select exactly one column.
    numpy.linalg.LinAlgError
        If the regularised matrix is singular (possible when ``l`` is 0).
    """
    #split the X and Y from the dataframe
    X = np.nan_to_num(dataframe.iloc[:, dataframe.columns != target].values.astype(float))
    Y = dataframe.iloc[:, dataframe.columns == target].values.astype(float)
    if Y.shape[1] != 1:
        raise ValueError('target must select exactly one column, got %d' % Y.shape[1])

    n_features = X.shape[1]
    Sigma = np.dot(X.T, X) + l * np.identity(n_features)

    # Solve the linear system directly instead of forming the explicit
    # inverse: cheaper and numerically more stable.
    try:
        what = np.linalg.solve(Sigma, np.dot(X.T, Y))
    except np.linalg.LinAlgError as exc:
        raise np.linalg.LinAlgError(
            'Matrix cannot be inverted: X^T X + l*I is singular; use a larger l'
        ) from exc

    return what

In [4]:
#compute RMSE error
def compute_error(dataframe, target=None, what=None):
    """Return the root-mean-squared error of a linear model on ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame holding the feature columns and the target column.
        NaN values in the features are treated as 0.
    target : str
        Name of the target column.
    what : array-like
        Model weights, one per feature column, in column order. Any shape
        with ``n_features`` elements is accepted.

    Returns
    -------
    float
        ``sqrt(mean((Y - X @ what) ** 2))``.

    Raises
    ------
    ValueError
        If ``what`` does not hold exactly one weight per feature column.
    """
    # Cast to float so a frame mixing bool/int indicator columns with float
    # columns does not become an object array that nan_to_num leaves untouched.
    X = np.nan_to_num(dataframe.iloc[:, dataframe.columns != target].values.astype(float))
    Y = dataframe.iloc[:, dataframe.columns == target].values

    # Column vector so that X @ what lines up with the (n, 1) target.
    what = np.array(what).reshape((X.shape[1], 1))

    return np.sqrt(np.mean((Y - np.dot(X, what))**2))

In [5]:
def fill_missing_using_regression(dataframe=None, target=None, weights=None):
    """Impute the missing values of ``target`` with a linear model, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame holding the feature columns and the target column.
        It is modified in place. NaN values in the features are treated as 0
        when computing the prediction.
    target : str
        Name of the column whose NaN entries are filled.
    weights : array-like
        Model weights, one per feature column (every column except
        ``target``), in column order. Any shape with that many elements is
        accepted, including a plain list.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, for call chaining.

    Raises
    ------
    ValueError
        If ``weights`` does not hold exactly one weight per feature column.
    """
    feature_mask = dataframe.columns != target
    # asarray accepts lists as well as arrays; the reshape doubles as a
    # check that there is one weight per feature column.
    weights = np.asarray([] if weights is None else weights, dtype=float)
    weights = weights.reshape((int(feature_mask.sum()), 1))

    missing = dataframe[target].isna().to_numpy()
    if not missing.any():
        return dataframe

    # Predict all missing rows in one matrix product instead of row by row.
    X = np.nan_to_num(dataframe.loc[missing, feature_mask].to_numpy(dtype=float))
    dataframe.loc[missing, target] = np.dot(X, weights).ravel()

    return dataframe

In [13]:
# Load the dataset from CSV (path is relative to the working directory);
# the file is decoded as ISO-8859-1.
df = pd.read_csv('ml3_numeric_removed_na_rows.csv', encoding = 'ISO-8859-1')

In [7]:
# Print the (rows, columns) shape, then display the first 15 rows.
print(df.shape)
df.head(15)

(2391, 182)


Unnamed: 0,Participant_ID,RowNumber,session_id,age,backcount1,backcount10,backcount2,backcount3,backcount4,backcount5,backcount6,backcount7,backcount8,backcount9,big5_01,big5_02,big5_03,big5_04,big5_05,big5_06,big5_07,big5_08,big5_09,big5_10,elm_01,elm_02,elm_03,elm_04,elm_05,gender,intrinsic_01,intrinsic_02,intrinsic_03,intrinsic_04,intrinsic_05,intrinsic_06,intrinsic_07,intrinsic_08,intrinsic_09,intrinsic_10,intrinsic_11,intrinsic_12,intrinsic_13,intrinsic_14,intrinsic_15,kposition,kratio,lposition,lratio,mcdv1,mcdv2,mcfiller1,mcfiller2,mcfiller3,mood_01,mood_02,nfc_01,nfc_02,nfc_03,nfc_04,nfc_05,nfc_06,nposition,nratio,pate_01,pate_02,pate_03,pate_04,pate_05,rposition,rratio,sarcasm,selfesteem_01,stress_01,stress_02,stress_03,stress_04,tempest2,tempest3,tempfollowup1,tempfollowup2,tempfollowup3,vposition,vratio,year,Temperatureinlab,ClipboardWeight,IIResponse,SRConfidenceResponse,NumberofDays,Pool2a,Pool2b,Pool2c,Pool2d,Pool3,Pool4,Pool5a,Pool6,Pool7b,Pool7c,Pool7d,Pool8,Pool9,Pool10,Pool11,Pool12,Pool13,Pool14,Pool15,Pool16a,Pool16b,Pool17,Pool18,Pool19a,Pool19b,Persistence,anagrams_order,attention_order,availinstruct_order,availk_order,availl_order,availn_order,availr_order,availv_order,bigfive_order,debrief_order,demographics_order,elmques_order,filler1_order,filler2_order,galinskyvignette_order,inlab_order,intrinsic_order,mcfiller_order,moninvignette_order,mood_order,nfc_order,participantid_order,participation_order,selfesteem_order,startpage_order,stress_order,stroop_order,stroopinstructions_order,stroopinstructionstest_order,stroopprac_order,tempestimate_order,tempfollowup_order,welcome_order,MonthComputer,DayComputer,YearComputer,DaysSinceMonthComputer,DaysSinceAugComputer,DaysSinceMonthLab,DaysSinceAugLab,DaysSinceMonthStart,DaysSinceAugStart,DaysInComp,DaysInLab,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,Intrinsic,Mood,NFC,ReportedAttention,ReportedEffort,SelfEsteem,Stress,K1st,L1st,N1st,R1st,V1st,AvailFirst,ArgumentQuality,NFCcent
er,ELMCond,CBReject
0,12.0,170,7385046,19.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,5.0,3.0,6.0,5.0,5.0,3.0,5.0,1.0,5.0,5.0,7.0,7.0,7.0,6.0,6.0,1.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,14.0,2.0,18.0,0.0,3.0,1.0,0.0,2.0,3.0,3.0,3.0,4.0,4.0,3.0,2.0,3.0,1.0,6.0,4.0,4.0,1.0,1.0,1.0,2.0,15.0,4.0,4.0,3.0,2.0,3.0,4.0,6.0,2.0,7.0,7.0,6.0,1.0,3.0,2.0,74.0,20.0,6.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,113.000002,17.0,28.0,7,12,11,8,9,10,33.0,36.0,35.0,26,22,23,24,16.0,30.0,14,15,32.0,29.0,37.0,34.0,31.0,0,27.0,6.0,3,5.0,4,20,19,1,8,29,14,0,29,0.0,29.0,0,25,0.043478,0.043478,4.0,6.5,5.0,5.0,4.0,2.666667,5.0,2.5,4.0,4.0,4.0,3.5,0.0,0.0,1.0,0.0,1.0,2.0,6.6,-0.68254,1,0.0
1,13.0,173,7385155,18.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,3.0,5.0,7.0,7.0,2.0,7.0,2.0,1.0,1.0,7.0,9.0,9.0,9.0,9.0,9.0,1.0,1.0,1.0,1.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,2.0,4.0,1.0,4.0,2.0,4.0,2.0,8654.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,5.0,1.0,3.0,2.0,1.0,1.0,5.0,5.0,1.0,1.0,2.0,1.0,8.0,3.0,1.0,4.0,2.0,2.0,5.0,6.0,4.0,7.0,7.0,1.0,1.0,8.0,1.0,74.0,10.0,7.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,153.999999,2.0,30.0,13,16,18,17,14,15,32.0,36.0,35.0,5,20,21,22,23.0,28.0,25,26,33.0,27.0,37.0,31.0,29.0,0,34.0,9.0,6,8.0,7,12,11,1,8,29,14,0,29,0.0,29.0,0,25,0.043478,0.043478,1.5,7.0,2.0,2.5,7.0,2.533333,7.0,2.333333,5.0,5.0,1.0,4.25,0.0,0.0,1.0,1.0,1.0,3.0,9.0,-0.849206,1,0.0
2,14.0,179,7391990,18.0,357.0,130.0,354.0,351.0,248.0,245.0,242.0,139.0,136.0,133.0,7.0,6.0,7.0,5.0,7.0,3.0,7.0,1.0,7.0,2.0,9.0,9.0,9.0,9.0,7.0,1.0,2.0,2.0,1.0,4.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,4.0,2.0,30.0,2.0,30.0,3.0,3.0,2.0,1.0,1.0,2.0,1.0,5.0,5.0,1.0,5.0,5.0,5.0,2.0,30.0,4.0,5.0,1.0,1.0,1.0,2.0,30.0,1.0,7.0,1.0,5.0,5.0,3.0,7.0,3.0,7.0,7.0,6.0,2.0,30.0,1.0,72.0,20.0,7.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,239.999993,26.0,29.0,20,24,25,22,23,21,27.0,36.0,35.0,19,7,8,9,16.0,28.0,14,15,33.0,30.0,37.0,32.0,34.0,0,31.0,5.0,2,4.0,3,12,11,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,6.5,7.0,6.0,4.5,3.0,3.133333,6.5,3.666667,4.0,5.0,7.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,8.6,0.484127,1,0.0
3,15.0,180,7392153,18.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,7.0,4.0,5.0,5.0,5.0,1.0,7.0,3.0,5.0,2.0,5.0,6.0,5.0,5.0,5.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,7.0,0.0,3.0,1.0,0.0,1.0,3.0,2.0,4.0,4.0,4.0,1.0,1.0,1.0,2.0,4.0,4.0,3.0,1.0,1.0,2.0,1.0,3.0,3.0,2.0,5.0,2.0,2.0,4.0,4.0,2.0,6.0,6.0,5.0,2.0,5.0,1.0,73.0,10.0,7.0,4.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,79.000004,6.0,28.0,14,19,17,16,15,18,34.0,36.0,35.0,25,21,22,23,10.0,33.0,8,9,30.0,27.0,37.0,29.0,32.0,0,31.0,5.0,2,4.0,3,13,12,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,5.5,5.0,7.0,5.5,4.0,2.2,5.5,2.5,4.0,3.0,2.0,4.25,0.0,0.0,0.0,1.0,0.0,1.0,5.2,-0.68254,1,0.0
4,16.0,185,7392233,18.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,5.0,5.0,7.0,3.0,6.0,5.0,7.0,2.0,5.0,3.0,5.0,3.0,6.0,3.0,4.0,1.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,4.0,2.0,3.0,2.0,1.0,2.0,2.0,3.0,1.0,3.0,2.0,35.0,0.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,20.0,5.0,5.0,1.0,1.0,2.0,2.0,40.0,6.0,5.0,3.0,4.0,4.0,2.0,6.0,2.0,7.0,7.0,7.0,2.0,30.0,1.0,73.0,20.0,7.0,4.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,240.000001,6.0,30.0,21,23,25,24,26,22,34.0,36.0,35.0,19,9,10,11,20.0,29.0,16,17,33.0,31.0,37.0,27.0,32.0,0,28.0,5.0,2,4.0,3,14,13,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,5.5,6.5,4.0,5.0,3.0,2.666667,6.0,3.5,5.0,5.0,5.0,2.25,1.0,0.0,0.0,0.0,0.0,1.0,4.2,0.31746,1,0.0
5,17.0,195,7392698,24.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,6.0,5.0,3.0,2.0,6.0,5.0,2.0,3.0,6.0,5.0,7.0,8.0,5.0,4.0,4.0,2.0,2.0,3.0,2.0,4.0,2.0,4.0,3.0,4.0,3.0,3.0,2.0,2.0,4.0,2.0,4.0,2.0,10.0,1.0,10.0,-2.0,1.0,2.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,2.0,3.0,2.0,1.0,10.0,2.0,4.0,1.0,1.0,3.0,1.0,10.0,2.0,5.0,4.0,3.0,4.0,3.0,6.0,4.0,6.0,5.0,4.0,2.0,10.0,2.0,73.0,10.0,2.0,4.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,151.999995,18.0,30.0,11,15,14,13,16,12,28.0,36.0,35.0,3,20,21,22,4.0,27.0,9,10,32.0,33.0,37.0,29.0,31.0,0,34.0,26.0,23,25.0,24,7,6,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,4.5,4.0,4.5,2.5,2.0,2.666667,5.0,3.0,2.0,4.0,5.0,3.0,0.0,1.0,1.0,1.0,0.0,3.0,5.6,-0.18254,1,0.0
6,19.0,197,7392875,18.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,3.0,4.0,5.0,6.0,3.0,5.0,2.0,1.0,4.0,5.0,6.0,7.0,5.0,6.0,7.0,1.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,4.0,2.0,1.0,1.0,4.0,4.0,2.0,1.0,7.0,1.0,13.0,1.0,2.0,2.0,0.0,2.0,6.0,6.0,4.0,1.0,2.0,5.0,4.0,2.0,2.0,14.0,4.0,4.0,1.0,1.0,2.0,2.0,20.0,3.0,2.0,5.0,3.0,2.0,4.0,5.0,3.0,6.0,7.0,5.0,2.0,20.0,1.0,73.0,10.0,6.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,240.000002,9.0,27.0,14,18,19,15,17,16,28.0,36.0,35.0,3,24,25,26,10.0,32.0,12,13,30.0,33.0,37.0,34.0,31.0,0,29.0,8.0,5,7.0,6,22,21,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,3.0,6.0,3.0,3.0,5.0,2.866667,2.0,4.333333,4.0,4.0,2.0,4.0,1.0,1.0,0.0,0.0,0.0,2.0,6.2,1.150794,-1,0.0
7,20.0,201,7397218,19.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,3.0,5.0,4.0,5.0,5.0,5.0,5.0,2.0,6.0,4.0,7.0,7.0,8.0,7.0,8.0,2.0,2.0,1.0,1.0,3.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,2.0,4.0,1.0,5.0,2.0,8.0,-1.0,-1.0,1.0,0.0,1.0,4.0,4.0,2.0,4.0,3.0,2.0,3.0,2.0,2.0,10.0,4.0,4.0,1.0,1.0,2.0,1.0,10.0,5.0,5.0,3.0,4.0,3.0,3.0,6.0,3.0,7.0,7.0,6.0,2.0,5.0,1.0,74.0,20.0,6.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,240.000002,17.0,27.0,11,13,14,12,16,15,31.0,36.0,35.0,3,20,21,22,18.0,32.0,5,6,30.0,34.0,37.0,29.0,33.0,0,28.0,10.0,7,9.0,8,25,24,1,9,4,14,31,35,31.0,35.0,0,25,0.108696,0.108696,4.5,5.0,3.0,4.0,3.5,2.266667,4.0,2.666667,4.0,4.0,5.0,2.75,1.0,0.0,0.0,1.0,0.0,2.0,7.4,-0.515873,1,0.0
8,21.0,205,7397377,20.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,6.0,5.0,5.0,2.0,5.0,3.0,6.0,3.0,4.0,2.0,7.0,8.0,7.0,6.0,6.0,1.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0,2.0,3.0,1.0,3.0,1.0,2.0,0.0,-1.0,2.0,0.0,2.0,2.0,2.0,3.0,2.0,4.0,3.0,3.0,4.0,1.0,4.0,3.0,4.0,1.0,4.0,1.0,1.0,4.0,3.0,4.0,2.0,4.0,4.0,2.0,6.0,3.0,6.0,6.0,6.0,1.0,2.0,2.0,75.0,10.0,6.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,221.000003,19.0,27.0,2,6,3,4,7,5,31.0,36.0,35.0,25,9,10,11,20.0,34.0,13,14,28.0,29.0,37.0,30.0,33.0,0,32.0,18.0,15,17.0,16,23,22,1,9,4,14,31,35,31.0,35.0,0,25,0.108696,0.108696,5.5,5.0,5.5,4.5,3.0,2.6,6.0,2.833333,3.0,4.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,5.0,6.8,-0.349206,-1,1.0
9,22.0,209,7397497,18.0,357.0,330.0,354.0,351.0,348.0,245.0,342.0,339.0,336.0,333.0,7.0,3.0,7.0,3.0,4.0,1.0,6.0,3.0,3.0,2.0,6.0,3.0,1.0,4.0,5.0,1.0,3.0,3.0,3.0,4.0,2.0,2.0,2.0,3.0,4.0,2.0,3.0,1.0,3.0,3.0,4.0,2.0,15.0,1.0,8.0,1.0,2.0,2.0,0.0,1.0,5.0,5.0,3.0,2.0,4.0,3.0,4.0,4.0,2.0,11.0,4.0,5.0,1.0,1.0,2.0,1.0,8.0,5.0,5.0,3.0,3.0,4.0,4.0,4.0,2.0,6.0,6.0,4.0,2.0,12.0,1.0,75.0,20.0,7.0,4.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,155.000002,2.0,33.0,14,15,17,19,18,16,30.0,36.0,35.0,8,11,12,13,23.0,31.0,21,22,32.0,34.0,37.0,28.0,29.0,0,27.0,6.0,3,5.0,4,26,25,1,9,4,14,31,35,31.0,35.0,0,25,0.108696,0.108696,5.0,6.0,7.0,5.5,4.0,2.933333,3.0,3.0,4.0,5.0,5.0,3.0,0.0,1.0,0.0,1.0,0.0,2.0,3.8,-0.18254,-1,0.0


In [8]:
# Column names treated as numerical; the training loop uses each of them as a regression target.
numerical_features = ['Participant_ID', 'RowNumber', 'session_id', 'age', 'backcount1', 'backcount10', 'backcount2', 'backcount3', 'backcount4', 'backcount5', 'backcount6', 'backcount7', 'backcount8', 'backcount9', 'kratio', 'lratio', 'nratio', 'rratio', 'vratio', 'Temperatureinlab', 'NumberofDays', 'Persistence', 'anagrams_order', 'attention_order', 'availinstruct_order', 'availk_order', 'availl_order', 'availn_order', 'availr_order', 'availv_order', 'bigfive_order', 'debrief_order', 'demographics_order', 'elmques_order', 'filler1_order', 'filler2_order', 'galinskyvignette_order', 'inlab_order', 'intrinsic_order', 'mcfiller_order', 'moninvignette_order', 'mood_order', 'nfc_order', 'participantid_order', 'participation_order', 'selfesteem_order', 'startpage_order', 'stress_order', 'stroop_order', 'stroopinstructions_order', 'stroopinstructionstest_order', 'stroopprac_order', 'tempestimate_order', 'tempfollowup_order', 'welcome_order', 'MonthComputer', 'DayComputer', 'YearComputer', 'DaysSinceMonthComputer', 'DaysSinceAugComputer', 'DaysSinceMonthLab', 'DaysSinceAugLab', 'DaysSinceMonthStart', 'DaysSinceAugStart', 'DaysInComp', 'DaysInLab', 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism', 'Intrinsic', 'Mood', 'NFC', 'ReportedAttention', 'ReportedEffort', 'SelfEsteem', 'Stress', 'ArgumentQuality']
# Column names treated as categorical; these are the columns passed to one_hot_encoding.
categorical_features = ['big5_01', 'big5_02', 'big5_03', 'big5_04', 'big5_05', 'big5_06', 'big5_07', 'big5_08', 'big5_09', 'big5_10', 'elm_01', 'elm_02', 'elm_03', 'elm_04', 'elm_05', 'gender', 'intrinsic_01', 'intrinsic_02', 'intrinsic_03', 'intrinsic_04', 'intrinsic_05', 'intrinsic_06', 'intrinsic_07', 'intrinsic_08', 'intrinsic_09', 'intrinsic_10', 'intrinsic_11', 'intrinsic_12', 'intrinsic_13', 'intrinsic_14', 'intrinsic_15', 'kposition', 'lposition', 'mcdv1', 'mcdv2', 'mcfiller1', 'mcfiller2', 'mcfiller3', 'mood_01', 'mood_02', 'nfc_01', 'nfc_02', 'nfc_03', 'nfc_04', 'nfc_05', 'nfc_06', 'nposition', 'pate_01', 'pate_02', 'pate_03', 'pate_04', 'pate_05', 'rposition', 'sarcasm', 'selfesteem_01', 'stress_01', 'stress_02', 'stress_03', 'stress_04', 'tempest2', 'tempest3', 'tempfollowup1', 'tempfollowup2', 'tempfollowup3', 'vposition', 'year', 'ClipboardWeight', 'IIResponse', 'SRConfidenceResponse', 'Pool2a', 'Pool2b', 'Pool2c', 'Pool2d', 'Pool3', 'Pool4', 'Pool5a', 'Pool6', 'Pool7b', 'Pool7c', 'Pool7d', 'Pool8', 'Pool9', 'Pool10', 'Pool11', 'Pool12', 'Pool13', 'Pool14', 'Pool15', 'Pool16a', 'Pool16b', 'Pool17', 'Pool18', 'Pool19a', 'Pool19b', 'K1st', 'L1st', 'N1st', 'R1st', 'V1st', 'AvailFirst', 'ELMCond', 'CBReject']

In [9]:
# list(df) yields the column labels of the frame, in order.
feature_list = list(df)
print(feature_list)

['Participant_ID', 'RowNumber', 'session_id', 'age', 'backcount1', 'backcount10', 'backcount2', 'backcount3', 'backcount4', 'backcount5', 'backcount6', 'backcount7', 'backcount8', 'backcount9', 'big5_01', 'big5_02', 'big5_03', 'big5_04', 'big5_05', 'big5_06', 'big5_07', 'big5_08', 'big5_09', 'big5_10', 'elm_01', 'elm_02', 'elm_03', 'elm_04', 'elm_05', 'gender', 'intrinsic_01', 'intrinsic_02', 'intrinsic_03', 'intrinsic_04', 'intrinsic_05', 'intrinsic_06', 'intrinsic_07', 'intrinsic_08', 'intrinsic_09', 'intrinsic_10', 'intrinsic_11', 'intrinsic_12', 'intrinsic_13', 'intrinsic_14', 'intrinsic_15', 'kposition', 'kratio', 'lposition', 'lratio', 'mcdv1', 'mcdv2', 'mcfiller1', 'mcfiller2', 'mcfiller3', 'mood_01', 'mood_02', 'nfc_01', 'nfc_02', 'nfc_03', 'nfc_04', 'nfc_05', 'nfc_06', 'nposition', 'nratio', 'pate_01', 'pate_02', 'pate_03', 'pate_04', 'pate_05', 'rposition', 'rratio', 'sarcasm', 'selfesteem_01', 'stress_01', 'stress_02', 'stress_03', 'stress_04', 'tempest2', 'tempest3', 'tempf

In [10]:
# Work on a copy so that the loaded frame `df` stays untouched.
sampledf = df.copy()

In [30]:
#PCA
from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig
import matplotlib.pyplot as plt
%matplotlib inline
# define a matrix
#A = array([[1, 2], [3, 4], [5, 6]])
#print(A)
sampledf = df.copy()
sampledf = sampledf.dropna()

"""means = []
sampledf = sampledf.fillna(sampledf.mean())
for c in list(sampledf):
    mean = sampledf[c].mean()
    sampledf[c] /= mean
    means.append(mean) """

A = np.nan_to_num(sampledf.values.astype(float))
#print(A.shape)
# calculate the mean of each column
M = mean(A.T, axis=1)
#print(M)
# center columns by subtracting column means
C = A - M
#print(C)
# calculate covariance matrix of centered matrix
V = cov(C.T)
#print(V)
# eigendecomposition of covariance matrix
values, vectors = eig(V)
#print(vectors)
#print(values)
# project data
P = vectors.T.dot(C.T)
#print(P.T)
significance = [np.abs(i)/np.sum(values) for i in values]

68
['Participant_ID', 'RowNumber', 'session_id', 'age', 'backcount1', 'backcount10', 'backcount2', 'backcount3', 'backcount4', 'backcount5', 'backcount7', 'backcount8', 'backcount9', 'big5_01', 'big5_02']


# Applying Ridge regression

Steps:
    - Select a target column (`Participant_ID` in the example below).
    - Drop the rows of the dataframe where the target column is NA.
    - Split the remaining rows into train and test sets.
    - Train the model on the train set and evaluate it on the test set.
    - If the error is low enough, predict and impute the target values for the rows that had NA.

In [20]:
test_train_split_ratio = 0.8
feature = feature_list[0]
models = {}
# Build the frame to impute from the raw data, not from `sampledf`: the PCA
# cell rebinds `sampledf` to a dropna() subset, which has no missing values
# left to fill when the notebook is run top to bottom.
predict_na = df.copy()
predict_na = one_hot_encoding(dataframe=predict_na, features=categorical_features)



def training(sampledf=None, feature=None, test_train_split_ratio=0.8, cross_value=5, models=None, random_state=None):
    """Fit ridge models for one target column and impute its missing values.

    If ``feature`` has no missing values in ``sampledf`` nothing is done.
    Otherwise the rows where ``feature`` is present are one-hot encoded,
    ``cross_value`` random train/test splits are drawn, a ridge model is
    fitted on each, and the model with the smallest absolute difference
    between test and train RMSE is kept. That model is then used to fill the
    missing values of ``feature`` in the notebook-level ``predict_na`` frame.

    Relies on the notebook globals ``categorical_features`` and
    ``predict_na``; ``predict_na`` is modified in place.

    Parameters
    ----------
    sampledf : pandas.DataFrame
        Source frame (not modified).
    feature : str
        Name of the target column.
    test_train_split_ratio : float
        Fraction of rows used for training in each split.
    cross_value : int
        Number of random splits to try.
    models : dict, optional
        Registry updated in place with
        ``models[feature] = {'model': weights, 'rmse': gap}``.
        A new dict is created when omitted.
    random_state : int, optional
        Seed for the shuffles. ``None`` keeps using numpy's global random
        state.

    Returns
    -------
    dict
        The ``models`` registry.

    Raises
    ------
    ValueError
        If ``cross_value`` is smaller than 1.
    """
    # A shared mutable default would silently accumulate entries across calls.
    if models is None:
        models = {}
    if cross_value < 1:
        raise ValueError('cross_value must be at least 1')

    # Nothing to impute when the target column is complete.
    if not sampledf[feature].isna().to_numpy().any():
        return models

    #remove rows with NA in the target column
    complete_rows = sampledf.dropna(subset=[feature])
    encoded = one_hot_encoding(dataframe=complete_rows, features=categorical_features)

    shuffle_state = None if random_state is None else np.random.RandomState(random_state)

    candidate_weights = []
    generalization_gaps = []
    for _ in range(cross_value):
        #shuffle your data frame
        shuffled = encoded.sample(frac=1, random_state=shuffle_state)

        #split the sample dataframe
        split_index = int(shuffled.shape[0] * test_train_split_ratio)
        train = shuffled.iloc[:split_index, :]
        test = shuffled.iloc[split_index:, :]

        #train model
        weights = fit_ridge_regression(train, target=feature, steps=10)

        test_rmse = compute_error(test, target=feature, what=weights)
        train_rmse = compute_error(train, target=feature, what=weights)
        # NOTE(review): the selection score is the train/test RMSE gap, not
        # the test RMSE itself, yet it is stored under the key 'rmse' - confirm
        # that this is the intended criterion.
        generalization_gaps.append(np.abs(test_rmse - train_rmse))
        candidate_weights.append(weights)

    best = int(np.argmin(generalization_gaps))
    models[feature] = {
        'model': candidate_weights[best],
        'rmse': generalization_gaps[best],
    }

    fill_missing_using_regression(dataframe=predict_na, target=feature, weights=candidate_weights[best])

    return models

In [None]:
sampledf = df.copy()
models = {}

for feature in numerical_features:
    print(feature)
    # Pass the shared registry so the fitted models are kept; a fresh `{}`
    # per call discards them. cross_value=5 matches the five splits the
    # training loop has always run.
    training(sampledf=sampledf, feature=feature, cross_value=5, models=models)

# `predict_na` was already one-hot encoded when it was created; encoding it a
# second time raises KeyError because the categorical columns no longer exist.

In [32]:
# Write predict_na to CSV; index=False leaves the row index out of the file.
predict_na.to_csv('ml3_after_data_imputation.csv', index=False)

In [None]:
#sampledf = df.copy()
# Count the rows of sampledf whose first column is NaN.
feature = list(sampledf)[0]
na = np.where(np.isnan(sampledf[feature]))[0]
len(na)


In [None]:
# Frequency of each Participant_ID value in predict_na; dropna=False also counts NaN.
predict_na['Participant_ID'].value_counts(dropna=False)

In [None]:
# Scratch check: fill missing ages with the column mean, then divide by the
# mean of the filled column.
# NOTE(review): after this cell `sampledf` is a Series (the 'age' column), not
# a DataFrame; cells that expect the full frame must re-create it from `df`.
sampledf = df.copy()
sampledf = sampledf['age'].fillna(sampledf['age'].mean())
sampledf /= sampledf.mean()
sampledf