In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)
# importing some basic libraries
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np

In [86]:
def one_hot_encoding(dataframe, features=[]):

    for feature in features:
        temp_dataframe = pd.get_dummies(dataframe[feature], prefix=feature)
        dataframe = pd.concat([dataframe, temp_dataframe], axis=1)
        dataframe_dash = dataframe
        
    dataframe = dataframe.drop(columns=features)

    return dataframe


def one_hot_decoding(original_dataframe_features=None, encoded_dataframe=None):
    encoded_list = list(encoded_dataframe)
    
    encoded_feature_list = [x.split('_')[-1] for x in encoded_list]
    encoded_feature_list = np.array(encoded_feature_list, dtype='float')
    
    output = encoded_dataframe.values.astype(float)
    output = np.multiply(output, encoded_feature_list)
    
    decode_df = pd.DataFrame(output, columns=encoded_list)

    for feature in original_dataframe_features:
        l = [x for x in encoded_list if feature in x]
        decode_df[feature] = decode_df[l].max(axis=1)

    return decode_df.drop(columns=encoded_list)

def accuracy(y=None, y_pred=None):
    return np.mean(y == y_pred)

In [2]:
# Random Forest Algorithm on Sonar Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt
import pandas as pd

# Load a CSV file
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for i in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds,*args):
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		#print("The train set is ",train_set)
		#print("The test set is ",test_set)
		predicted,trees = algorithm(train_set, test_set,*args)
		actual = [row[-1] for row in fold]
		accuracy = accuracy_metric(actual, predicted)
		scores.append(accuracy)
	return scores,trees

# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
	# count all samples at split point
	n_instances = float(sum([len(group) for group in groups]))
	# sum weighted Gini index for each group
	gini = 0.0
	for group in groups:
		size = float(len(group))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# score the group based on the score for each class
		for class_val in classes:
			p = [row[-1] for row in group].count(class_val) / size
			score += p * p
		# weight the group score by its relative size
		gini += (1.0 - score) * (size / n_instances)
	return gini

# Select the best split point for a dataset
def get_split(dataset, n_features):
	class_values = list(set(row[-1] for row in dataset))
	b_index, b_value, b_score, b_groups = 999, 999, 999, None
	features = list()
	while len(features) < n_features:
		index = randrange(len(dataset[0])-1)
		if index not in features:
			features.append(index)
	for index in features:
		for row in dataset:
			groups = test_split(index, row[index], dataset)
			gini = gini_index(groups, class_values)
			if gini < b_score:
				b_index, b_value, b_score, b_groups = index, row[index], gini, groups
	return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Create a terminal node value
def to_terminal(group):
	outcomes = [row[-1] for row in group]
	return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
	left, right = node['groups']
	del(node['groups'])
	# check for a no split
	if not left or not right:
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# check for max depth
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return
	# process left child
	if len(left) <= min_size:
		node['left'] = to_terminal(left)
	else:
		node['left'] = get_split(left, n_features)
		split(node['left'], max_depth, min_size, n_features, depth+1)
	# process right child
	if len(right) <= min_size:
		node['right'] = to_terminal(right)
	else:
		node['right'] = get_split(right, n_features)
		split(node['right'], max_depth, min_size, n_features, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
	root = get_split(train, n_features)
	split(root, max_depth, min_size, n_features, 1)
	return root

# Make a prediction with a decision tree
def predict(node, row):
	if row[node['index']] < node['value']:
		if isinstance(node['left'], dict):
			return predict(node['left'], row)
		else:
			return node['left']
	else:
		if isinstance(node['right'], dict):
			return predict(node['right'], row)
		else:
			return node['right']

# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
	sample = list()
	n_sample = round(len(dataset) * ratio)
	while len(sample) < n_sample:
		index = randrange(len(dataset))
		sample.append(dataset[index])
	return sample

# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
	predictions = [predict(tree, row) for tree in trees]
	return max(set(predictions), key=predictions.count)

# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
	trees = list()
	for i in range(n_trees):
		sample = subsample(train, sample_size)
		tree = build_tree(sample, max_depth, min_size, n_features)
		trees.append(tree)
		#print(trees)
	predictions = [bagging_predict(trees, row) for row in test]
	#print(predictions)
	return(predictions,trees)

# Test the random forest algorithm
seed(2)
# load and prepare data
#dataset = pd.read_csv("ML3AllSites_updated_sampleset.csv",encoding = "ISO-8859-1")
 



In [1]:
from __future__ import division, print_function
import numpy as np
import cvxopt
from mlfromscratch.utils import train_test_split, normalize, accuracy_score
from mlfromscratch.utils.kernels import *
from mlfromscratch.utils import Plot

# Hide cvxopt output
cvxopt.solvers.options['show_progress'] = False

class SupportVectorMachine(object):
    """The Support Vector Machine classifier.
    Uses cvxopt to solve the quadratic optimization problem.
    Parameters:
    -----------
    C: float
        Penalty term.
    kernel: function
        Kernel function. Can be either polynomial, rbf or linear.
    power: int
        The degree of the polynomial kernel. Will be ignored by the other
        kernel functions.
    gamma: float
        Used in the rbf kernel function.
    coef: float
        Bias term used in the polynomial kernel function.
    """
    def __init__(self, C=1, kernel=rbf_kernel, power=4, gamma=None, coef=4):
        self.C = C
        self.kernel = kernel
        self.power = power
        self.gamma = gamma
        self.coef = coef
        self.lagr_multipliers = None
        self.support_vectors = None
        self.support_vector_labels = None
        self.intercept = None

    def fit(self, X, y):

        n_samples, n_features = np.shape(X)

        # Set gamma to 1/n_features by default
        if not self.gamma:
            self.gamma = 1 / n_features

        # Initialize kernel method with parameters
        self.kernel = self.kernel(
            power=self.power,
            gamma=self.gamma,
            coef=self.coef)

        # Calculate kernel matrix
        kernel_matrix = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                kernel_matrix[i, j] = self.kernel(X[i], X[j])

        # Define the quadratic optimization problem
        P = cvxopt.matrix(np.outer(y, y) * kernel_matrix, tc='d')
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y, (1, n_samples), tc='d')
        b = cvxopt.matrix(0, tc='d')

        if not self.C:
            G = cvxopt.matrix(np.identity(n_samples) * -1)
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            G_max = np.identity(n_samples) * -1
            G_min = np.identity(n_samples)
            G = cvxopt.matrix(np.vstack((G_max, G_min)))
            h_max = cvxopt.matrix(np.zeros(n_samples))
            h_min = cvxopt.matrix(np.ones(n_samples) * self.C)
            h = cvxopt.matrix(np.vstack((h_max, h_min)))

        # Solve the quadratic optimization problem using cvxopt
        minimization = cvxopt.solvers.qp(P, q, G, h, A, b)

        # Lagrange multipliers
        lagr_mult = np.ravel(minimization['x'])

        # Extract support vectors
        # Get indexes of non-zero lagr. multipiers
        idx = lagr_mult > 1e-7
        # Get the corresponding lagr. multipliers
        self.lagr_multipliers = lagr_mult[idx]
        # Get the samples that will act as support vectors
        self.support_vectors = X[idx]
        # Get the corresponding labels
        self.support_vector_labels = y[idx]

        # Calculate intercept with first support vector
        self.intercept = self.support_vector_labels[0]
        for i in range(len(self.lagr_multipliers)):
            self.intercept -= self.lagr_multipliers[i] * self.support_vector_labels[
                i] * self.kernel(self.support_vectors[i], self.support_vectors[0])

    def predict(self, X):
        y_pred = []
        # Iterate through list of samples and make predictions
        for sample in X:
            prediction = 0
            # Determine the label of the sample by the support vectors
            for i in range(len(self.lagr_multipliers)):
                prediction += self.lagr_multipliers[i] * self.support_vector_labels[
                    i] * self.kernel(self.support_vectors[i], sample)
            prediction += self.intercept
            y_pred.append(np.sign(prediction))
        return np.array(y_pred)

In [152]:
numerical_features = ['Participant_ID', 'RowNumber', 'session_id', 'age', 'backcount1', 'backcount10', 'backcount2', 'backcount3', 'backcount4', 'backcount5', 'backcount6', 'backcount7', 'backcount8', 'backcount9', 'kratio', 'lratio', 'nratio', 'rratio', 'vratio', 'Temperatureinlab', 'NumberofDays', 'Persistence', 'anagrams_order', 'attention_order', 'availinstruct_order', 'availk_order', 'availl_order', 'availn_order', 'availr_order', 'availv_order', 'bigfive_order', 'debrief_order', 'demographics_order', 'elmques_order', 'filler1_order', 'filler2_order', 'galinskyvignette_order', 'inlab_order', 'intrinsic_order', 'mcfiller_order', 'moninvignette_order', 'mood_order', 'nfc_order', 'participantid_order', 'participation_order', 'selfesteem_order', 'startpage_order', 'stress_order', 'stroop_order', 'stroopinstructions_order', 'stroopinstructionstest_order', 'stroopprac_order', 'tempestimate_order', 'tempfollowup_order', 'welcome_order', 'MonthComputer', 'DayComputer', 'YearComputer', 'DaysSinceMonthComputer', 'DaysSinceAugComputer', 'DaysSinceMonthLab', 'DaysSinceAugLab', 'DaysSinceMonthStart', 'DaysSinceAugStart', 'DaysInComp', 'DaysInLab', 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism', 'Intrinsic', 'Mood', 'NFC', 'ReportedAttention', 'ReportedEffort', 'SelfEsteem', 'Stress', 'ArgumentQuality']
categorical_features = ['big5_01', 'big5_02', 'big5_03', 'big5_04', 'big5_05', 'big5_06', 'big5_07', 'big5_08', 'big5_09', 'big5_10', 'elm_01', 'elm_02', 'elm_03', 'elm_04', 'elm_05', 'gender', 'intrinsic_01', 'intrinsic_02', 'intrinsic_03', 'intrinsic_04', 'intrinsic_05', 'intrinsic_06', 'intrinsic_07', 'intrinsic_08', 'intrinsic_09', 'intrinsic_10', 'intrinsic_11', 'intrinsic_12', 'intrinsic_13', 'intrinsic_14', 'intrinsic_15', 'kposition', 'lposition', 'mcdv1', 'mcdv2', 'mcfiller1', 'mcfiller2', 'mcfiller3', 'mood_01', 'mood_02', 'nfc_01', 'nfc_02', 'nfc_03', 'nfc_04', 'nfc_05', 'nfc_06', 'nposition', 'pate_01', 'pate_02', 'pate_03', 'pate_04', 'pate_05', 'rposition', 'sarcasm', 'selfesteem_01', 'stress_01', 'stress_02', 'stress_03', 'stress_04', 'tempest2', 'tempest3', 'tempfollowup1', 'tempfollowup2', 'tempfollowup3', 'vposition', 'year', 'ClipboardWeight', 'IIResponse', 'SRConfidenceResponse', 'Pool2a', 'Pool2b', 'Pool2c', 'Pool2d', 'Pool3', 'Pool4', 'Pool5a', 'Pool6', 'Pool7b', 'Pool7c', 'Pool7d', 'Pool8', 'Pool9', 'Pool10', 'Pool11', 'Pool12', 'Pool13', 'Pool14', 'Pool15', 'Pool16a', 'Pool16b', 'Pool17', 'Pool18', 'Pool19a', 'Pool19b', 'K1st', 'L1st', 'N1st', 'R1st', 'V1st', 'AvailFirst', 'ELMCond', 'CBReject']


In [88]:
print(list(df))

['Participant_ID', 'RowNumber', 'session_id', 'age', 'backcount1', 'backcount10', 'backcount2', 'backcount3', 'backcount4', 'backcount5', 'backcount6', 'backcount7', 'backcount8', 'backcount9', 'big5_01', 'big5_02', 'big5_03', 'big5_04', 'big5_05', 'big5_06', 'big5_07', 'big5_08', 'big5_09', 'big5_10', 'elm_01', 'elm_02', 'elm_03', 'elm_04', 'elm_05', 'gender', 'intrinsic_01', 'intrinsic_02', 'intrinsic_03', 'intrinsic_04', 'intrinsic_05', 'intrinsic_06', 'intrinsic_07', 'intrinsic_08', 'intrinsic_09', 'intrinsic_10', 'intrinsic_11', 'intrinsic_12', 'intrinsic_13', 'intrinsic_14', 'intrinsic_15', 'kposition', 'kratio', 'lposition', 'lratio', 'mcdv1', 'mcdv2', 'mcfiller1', 'mcfiller2', 'mcfiller3', 'mood_01', 'mood_02', 'nfc_01', 'nfc_02', 'nfc_03', 'nfc_04', 'nfc_05', 'nfc_06', 'nposition', 'nratio', 'pate_01', 'pate_02', 'pate_03', 'pate_04', 'pate_05', 'rposition', 'rratio', 'sarcasm', 'selfesteem_01', 'stress_01', 'stress_02', 'stress_03', 'stress_04', 'tempest2', 'tempest3', 'tempf

In [196]:
#df = pd.read_csv('data/ml3_generate_data_approach_2.csv', encoding = 'ISO-8859-1')
df = pd.read_csv('data/ml3_numeric_removed_na_rows.csv', encoding = 'ISO-8859-1')
df.head()

Unnamed: 0,Participant_ID,RowNumber,session_id,age,backcount1,backcount10,backcount2,backcount3,backcount4,backcount5,backcount6,backcount7,backcount8,backcount9,big5_01,big5_02,big5_03,big5_04,big5_05,big5_06,big5_07,big5_08,big5_09,big5_10,elm_01,elm_02,elm_03,elm_04,elm_05,gender,intrinsic_01,intrinsic_02,intrinsic_03,intrinsic_04,intrinsic_05,intrinsic_06,intrinsic_07,intrinsic_08,intrinsic_09,intrinsic_10,intrinsic_11,intrinsic_12,intrinsic_13,intrinsic_14,intrinsic_15,kposition,kratio,lposition,lratio,mcdv1,mcdv2,mcfiller1,mcfiller2,mcfiller3,mood_01,mood_02,nfc_01,nfc_02,nfc_03,nfc_04,nfc_05,nfc_06,nposition,nratio,pate_01,pate_02,pate_03,pate_04,pate_05,rposition,rratio,sarcasm,selfesteem_01,stress_01,stress_02,stress_03,stress_04,tempest2,tempest3,tempfollowup1,tempfollowup2,tempfollowup3,vposition,vratio,year,Temperatureinlab,ClipboardWeight,IIResponse,SRConfidenceResponse,NumberofDays,Pool2a,Pool2b,Pool2c,Pool2d,Pool3,Pool4,Pool5a,Pool6,Pool7b,Pool7c,Pool7d,Pool8,Pool9,Pool10,Pool11,Pool12,Pool13,Pool14,Pool15,Pool16a,Pool16b,Pool17,Pool18,Pool19a,Pool19b,Persistence,anagrams_order,attention_order,availinstruct_order,availk_order,availl_order,availn_order,availr_order,availv_order,bigfive_order,debrief_order,demographics_order,elmques_order,filler1_order,filler2_order,galinskyvignette_order,inlab_order,intrinsic_order,mcfiller_order,moninvignette_order,mood_order,nfc_order,participantid_order,participation_order,selfesteem_order,startpage_order,stress_order,stroop_order,stroopinstructions_order,stroopinstructionstest_order,stroopprac_order,tempestimate_order,tempfollowup_order,welcome_order,MonthComputer,DayComputer,YearComputer,DaysSinceMonthComputer,DaysSinceAugComputer,DaysSinceMonthLab,DaysSinceAugLab,DaysSinceMonthStart,DaysSinceAugStart,DaysInComp,DaysInLab,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,Intrinsic,Mood,NFC,ReportedAttention,ReportedEffort,SelfEsteem,Stress,K1st,L1st,N1st,R1st,V1st,AvailFirst,ArgumentQuality,NFCcenter,ELMCond,CBReject
0,12.0,170,7385046,19.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,5.0,3.0,6.0,5.0,5.0,3.0,5.0,1.0,5.0,5.0,7.0,7.0,7.0,6.0,6.0,1.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,2.0,14.0,2.0,18.0,0.0,3.0,1.0,0.0,2.0,3.0,3.0,3.0,4.0,4.0,3.0,2.0,3.0,1.0,6.0,4.0,4.0,1.0,1.0,1.0,2.0,15.0,4.0,4.0,3.0,2.0,3.0,4.0,6.0,2.0,7.0,7.0,6.0,1.0,3.0,2.0,74.0,20.0,6.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,113.000002,17.0,28.0,7,12,11,8,9,10,33.0,36.0,35.0,26,22,23,24,16.0,30.0,14,15,32.0,29.0,37.0,34.0,31.0,0,27.0,6.0,3,5.0,4,20,19,1,8,29,14,0,29,0.0,29.0,0,25,0.043478,0.043478,4.0,6.5,5.0,5.0,4.0,2.666667,5.0,2.5,4.0,4.0,4.0,3.5,0.0,0.0,1.0,0.0,1.0,2.0,6.6,-0.68254,1,0.0
1,13.0,173,7385155,18.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,3.0,5.0,7.0,7.0,2.0,7.0,2.0,1.0,1.0,7.0,9.0,9.0,9.0,9.0,9.0,1.0,1.0,1.0,1.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,2.0,4.0,1.0,4.0,2.0,4.0,2.0,8654.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,5.0,1.0,3.0,2.0,1.0,1.0,5.0,5.0,1.0,1.0,2.0,1.0,8.0,3.0,1.0,4.0,2.0,2.0,5.0,6.0,4.0,7.0,7.0,1.0,1.0,8.0,1.0,74.0,10.0,7.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,153.999999,2.0,30.0,13,16,18,17,14,15,32.0,36.0,35.0,5,20,21,22,23.0,28.0,25,26,33.0,27.0,37.0,31.0,29.0,0,34.0,9.0,6,8.0,7,12,11,1,8,29,14,0,29,0.0,29.0,0,25,0.043478,0.043478,1.5,7.0,2.0,2.5,7.0,2.533333,7.0,2.333333,5.0,5.0,1.0,4.25,0.0,0.0,1.0,1.0,1.0,3.0,9.0,-0.849206,1,0.0
2,14.0,179,7391990,18.0,357.0,130.0,354.0,351.0,248.0,245.0,242.0,139.0,136.0,133.0,7.0,6.0,7.0,5.0,7.0,3.0,7.0,1.0,7.0,2.0,9.0,9.0,9.0,9.0,7.0,1.0,2.0,2.0,1.0,4.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,4.0,2.0,30.0,2.0,30.0,3.0,3.0,2.0,1.0,1.0,2.0,1.0,5.0,5.0,1.0,5.0,5.0,5.0,2.0,30.0,4.0,5.0,1.0,1.0,1.0,2.0,30.0,1.0,7.0,1.0,5.0,5.0,3.0,7.0,3.0,7.0,7.0,6.0,2.0,30.0,1.0,72.0,20.0,7.0,5.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,239.999993,26.0,29.0,20,24,25,22,23,21,27.0,36.0,35.0,19,7,8,9,16.0,28.0,14,15,33.0,30.0,37.0,32.0,34.0,0,31.0,5.0,2,4.0,3,12,11,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,6.5,7.0,6.0,4.5,3.0,3.133333,6.5,3.666667,4.0,5.0,7.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,8.6,0.484127,1,0.0
3,15.0,180,7392153,18.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,7.0,4.0,5.0,5.0,5.0,1.0,7.0,3.0,5.0,2.0,5.0,6.0,5.0,5.0,5.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,2.0,3.0,2.0,7.0,0.0,3.0,1.0,0.0,1.0,3.0,2.0,4.0,4.0,4.0,1.0,1.0,1.0,2.0,4.0,4.0,3.0,1.0,1.0,2.0,1.0,3.0,3.0,2.0,5.0,2.0,2.0,4.0,4.0,2.0,6.0,6.0,5.0,2.0,5.0,1.0,73.0,10.0,7.0,4.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,79.000004,6.0,28.0,14,19,17,16,15,18,34.0,36.0,35.0,25,21,22,23,10.0,33.0,8,9,30.0,27.0,37.0,29.0,32.0,0,31.0,5.0,2,4.0,3,13,12,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,5.5,5.0,7.0,5.5,4.0,2.2,5.5,2.5,4.0,3.0,2.0,4.25,0.0,0.0,0.0,1.0,0.0,1.0,5.2,-0.68254,1,0.0
4,16.0,185,7392233,18.0,357.0,330.0,354.0,351.0,348.0,345.0,342.0,339.0,336.0,333.0,5.0,5.0,7.0,3.0,6.0,5.0,7.0,2.0,5.0,3.0,5.0,3.0,6.0,3.0,4.0,1.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,4.0,2.0,3.0,2.0,1.0,2.0,2.0,3.0,1.0,3.0,2.0,35.0,0.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,20.0,5.0,5.0,1.0,1.0,2.0,2.0,40.0,6.0,5.0,3.0,4.0,4.0,2.0,6.0,2.0,7.0,7.0,7.0,2.0,30.0,1.0,73.0,20.0,7.0,4.0,92,1,0,0,0,3.0,0.25,0.0,0,0,0,0,1,1.0,1.0,3.0,15.0,160,500.0,30.0,1,0,1,0,1,0.125,240.000001,6.0,30.0,21,23,25,24,26,22,34.0,36.0,35.0,19,9,10,11,20.0,29.0,16,17,33.0,31.0,37.0,27.0,32.0,0,28.0,5.0,2,4.0,3,14,13,1,9,2,14,31,33,31.0,33.0,0,25,0.086957,0.086957,5.5,6.5,4.0,5.0,3.0,2.666667,6.0,3.5,5.0,5.0,5.0,2.25,1.0,0.0,0.0,0.0,0.0,1.0,4.2,0.31746,1,0.0


In [186]:
rp2 = []

synthetic_df = df.copy()
synthetic_df = synthetic_df.sample(frac=1)
synthetic_df = synthetic_df.iloc[:int(synthetic_df.shape[0]/4),]
synthetic_df = synthetic_df.mask(np.random.random(synthetic_df.shape) < .1)
synthetic_df = synthetic_df.add(np.random.uniform())

#remaining_features = ['rratio', 'Temperatureinlab', 'Pool4', 'Persistence', 'Pool19b', 'DaysInComp', 'DaysInLab', 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism', 'Intrinsic', 'Mood', 'NFC', 'Stress', 'ArgumentQuality', 'NFCcenter']
count =0
feature_list = list(df)
for target in feature_list:
    try:
        imp_feature_list = ['Participant_ID', 'RowNumber', 'session_id', 'age', 'backcount1', 'backcount10', 'backcount2', 'backcount3', 'backcount4', 'backcount5', 'backcount6', 'backcount7', 'backcount8', 'backcount9', 'big5_01', 'big5_02', 'big5_03', 'big5_04', 'big5_05', 'big5_06', 'big5_07', 'big5_08', 'big5_09', 'big5_10', 'elm_01', 'elm_02', 'elm_03', 'elm_04', 'elm_05', 'gender', 'intrinsic_01', 'intrinsic_02', 'intrinsic_03', 'intrinsic_04', 'intrinsic_05', 'intrinsic_06', 'intrinsic_07', 'intrinsic_08', 'intrinsic_09', 'intrinsic_10', 'intrinsic_11', 'intrinsic_12', 'intrinsic_13', 'intrinsic_14', 'intrinsic_15', 'kposition', 'kratio', 'lposition', 'lratio', 'mcdv1', 'mcdv2', 'mcfiller1', 'mcfiller2', 'mcfiller3', 'mood_01', 'mood_02', 'nfc_01', 'nfc_02', 'nfc_03', 'nfc_04', 'nfc_05', 'nfc_06', 'nposition', 'nratio', 'pate_01', 'pate_02', 'pate_03', 'pate_04', 'pate_05', 'rposition', 'rratio', 'sarcasm', 'selfesteem_01', 'stress_01', 'stress_02', 'stress_03', 'stress_04', 'tempest2', 'tempest3', 'tempfollowup1', 'tempfollowup2', 'tempfollowup3', 'vposition', 'vratio', 'year', 'Temperatureinlab', 'ClipboardWeight', 'IIResponse', 'SRConfidenceResponse', 'NumberofDays', 'Pool2a', 'Pool2b', 'Pool2c', 'Pool2d', 'Pool3', 'Pool4', 'Pool5a', 'Pool6', 'Pool7b', 'Pool7c', 'Pool7d', 'Pool8', 'Pool9', 'Pool10', 'Pool11', 'Pool12', 'Pool13', 'Pool14', 'Pool15', 'Pool16a', 'Pool16b', 'Pool17', 'Pool18', 'Pool19a', 'Pool19b', 'Persistence', 'anagrams_order', 'attention_order', 'availinstruct_order', 'availk_order', 'availl_order', 'availn_order', 'availr_order', 'availv_order', 'bigfive_order']
        
        if target not in imp_feature_list:
            features = imp_feature_list
        else:
            imp_feature_list.remove(target)
            features = imp_feature_list
 
        #synthetic_X = np.nan_to_num(synthetic_df.iloc[:, synthetic_df.columns != target].values).astype(int) 
        #X = np.nan_to_num(df.iloc[:, df.columns != target].values).astype(int)
        #y = np.nan_to_num(df.iloc[:, df.columns == target].values).astype(int)
                
        X = np.nan_to_num(df[features].values).astype(int)
        y = np.nan_to_num(df[target].values).astype(int)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
        sc = StandardScaler()  
        n_folds = 5
        max_depth = 10
        min_size = 1
        sample_size = 1.0
        n_features = int(sqrt(len(dataset[0])-1))


        for n_trees in [1, 5, 10]:
        #for n_trees in [1]:
            scores,trees = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
            print('Trees: %d' % n_trees)
            print('Scores: %s' % scores)
            print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores)))) 
        y_pred = y_pred.astype(int)
        if accuracy_score(y_test, y_pred) > 0.0:
            rp2.append(str(accuracy_score(y_test, y_pred)))
            print(target,': ',accuracy_score(y_test, y_pred)) 
            #na = np.where(np.isnan(df[target]))[0]
            #for idx in na:
            #    X = np.nan_to_num(df.iloc[idx, df.columns != target].values).astype(int)
            #    df.iloc[idx, df.columns == target] = regressor.predict(X.reshape(1, -1)).astype(int)
            #synthetic_na = np.where(np.isnan(synthetic_df[target]))[0]
            #for idx in synthetic_na:
            #    synthetic_X = np.nan_to_num(synthetic_df.iloc[idx, synthetic_df.columns != target].values).astype(int)  
            #    synthetic_df.iloc[idx, synthetic_df.columns == target] = regressor.predict(synthetic_X.reshape(1, -1)).astype(int)
            count += 1
        
    except Exception as e:
        print(str(e))
        pass
    
print(count)

Participant_ID :  0.14613778705636743
RowNumber :  0.13569937369519833
session_id :  0.0041753653444676405
age :  0.534446764091858
backcount10 :  0.5386221294363257
backcount2 :  0.824634655532359
backcount3 :  0.8643006263048016
backcount4 :  0.8475991649269311
backcount5 :  0.824634655532359
backcount6 :  0.872651356993737
backcount7 :  0.8100208768267223
backcount8 :  0.860125260960334
backcount9 :  0.8851774530271399
big5_01 :  0.1941544885177453
big5_02 :  0.20250521920668058
big5_03 :  0.267223382045929
big5_04 :  0.21920668058455114
big5_05 :  0.2985386221294363
big5_06 :  0.18997912317327767
big5_07 :  0.2818371607515658
big5_08 :  0.27348643006263046
big5_09 :  0.2776617954070981
big5_10 :  0.24843423799582465
elm_01 :  0.27348643006263046
elm_02 :  0.29018789144050106
elm_03 :  0.3173277661795407
elm_04 :  0.3068893528183716
elm_05 :  0.3068893528183716
gender :  0.7139874739039666
intrinsic_01 :  0.534446764091858
intrinsic_02 :  0.5281837160751566
intrinsic_03 :  0.5219206

['0.8810020876826722',
 '0.2881002087682672',
 '0.14822546972860126',
 '0.2839248434237996',
 '0.2150313152400835',
 '0.2881002087682672',
 '0.23173277661795408',
 '0.24843423799582465',
 '0.23382045929018788',
 '0.21711899791231734',
 '0.17118997912317327',
 '0.162839248434238',
 '0.2045929018789144',
 '0.20250521920668058',
 '0.7139874739039666',
 '0.4989561586638831',
 '0.4488517745302714',
 '0.3966597077244259',
 '0.3966597077244259',
 '0.35908141962421714',
 '0.4196242171189979',
 '0.42588726513569936',
 '0.407098121085595',
 '0.36116910229645094',
 '0.3465553235908142',
 '0.36116910229645094',
 '0.35908141962421714',
 '0.3778705636743215',
 '0.302713987473904',
 '0.36116910229645094',
 '0.534446764091858',
 '0.05219206680584551',
 '0.5219206680584552',
 '0.05636743215031315',
 '0.6764091858037579',
 '0.18162839248434237',
 '0.5365344467640919',
 '0.534446764091858',
 '0.6597077244258872',
 '0.302713987473904',
 '0.32150313152400833',
 '0.31315240083507306',
 '0.4112734864300626',

In [161]:
synthetic_df.head(10)

Unnamed: 0,Participant_ID,RowNumber,session_id,age,backcount1,backcount10,backcount2,backcount3,backcount4,backcount5,backcount6,backcount7,backcount8,backcount9,big5_01,big5_02,big5_03,big5_04,big5_05,big5_06,big5_07,big5_08,big5_09,big5_10,elm_01,elm_02,elm_03,elm_04,elm_05,gender,intrinsic_01,intrinsic_02,intrinsic_03,intrinsic_04,intrinsic_05,intrinsic_06,intrinsic_07,intrinsic_08,intrinsic_09,intrinsic_10,intrinsic_11,intrinsic_12,intrinsic_13,intrinsic_14,intrinsic_15,kposition,kratio,lposition,lratio,mcdv1,mcdv2,mcfiller1,mcfiller2,mcfiller3,mood_01,mood_02,nfc_01,nfc_02,nfc_03,nfc_04,nfc_05,nfc_06,nposition,nratio,pate_01,pate_02,pate_03,pate_04,pate_05,rposition,rratio,sarcasm,selfesteem_01,stress_01,stress_02,stress_03,stress_04,tempest2,tempest3,tempfollowup1,tempfollowup2,tempfollowup3,vposition,vratio,year,Temperatureinlab,ClipboardWeight,IIResponse,SRConfidenceResponse,NumberofDays,Pool2a,Pool2b,Pool2c,Pool2d,Pool3,Pool4,Pool5a,Pool6,Pool7b,Pool7c,Pool7d,Pool8,Pool9,Pool10,Pool11,Pool12,Pool13,Pool14,Pool15,Pool16a,Pool16b,Pool17,Pool18,Pool19a,Pool19b,Persistence,anagrams_order,attention_order,availinstruct_order,availk_order,availl_order,availn_order,availr_order,availv_order,bigfive_order,debrief_order,demographics_order,elmques_order,filler1_order,filler2_order,galinskyvignette_order,inlab_order,intrinsic_order,mcfiller_order,moninvignette_order,mood_order,nfc_order,participantid_order,participation_order,selfesteem_order,startpage_order,stress_order,stroop_order,stroopinstructions_order,stroopinstructionstest_order,stroopprac_order,tempestimate_order,tempfollowup_order,welcome_order,MonthComputer,DayComputer,YearComputer,DaysSinceMonthComputer,DaysSinceAugComputer,DaysSinceMonthLab,DaysSinceAugLab,DaysSinceMonthStart,DaysSinceAugStart,DaysInComp,DaysInLab,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,Intrinsic,Mood,NFC,ReportedAttention,ReportedEffort,SelfEsteem,Stress,K1st,L1st,N1st,R1st,V1st,AvailFirst,ArgumentQuality,NFCcenter,ELMCond,CBReject
945,8.897923,9.897923,681152.9,18.897923,357.8979,330.897923,354.897923,351.897923,348.897923,,,339.897923,,333.897923,5.897923,4.897923,6.897923,3.897923,6.897923,5.897923,6.897923,3.897923,,4.897923,5.897923,6.897923,6.897923,9.897923,6.897923,1.897923,2.897923,1.897923,1.897923,2.897923,2.897923,4.897923,,,4.897923,3.897923,3.897923,2.897923,4.897923,2.897923,4.897923,1.897923,5.897923,1.897923,5.897923,0.897923,3.897923,2.897923,0.897923,,6.897923,5.897923,1.897923,3.897923,4.897923,2.897923,2.897923,3.897923,2.897923,20.897923,4.897923,5.897923,1.897923,1.897923,4.897923,1.897923,5.897923,1.897923,5.897923,3.897923,5.897923,1.897923,,6.897923,3.897923,7.897923,6.897923,1.897923,2.897923,15.897923,1.897923,72.897923,20.897923,7.897923,5.897923,,1.897923,0.897923,,0.897923,5.897923,1.397923,0.897923,0.897923,0.897923,0.897923,0.897923,0.897923,1.897923,1.897923,2.897923,25.897923,150.897923,750.897923,94.897923,1.897923,0.897923,3.897923,0.897923,0.897923,,134.897923,6.897923,28.897923,11.897923,16.897923,15.897923,12.897923,13.897923,14.897923,33.897923,36.897923,35.897923,26.897923,18.897923,19.897923,20.897923,7.897923,29.897923,9.897923,10.897923,31.897923,32.897923,37.897923,34.897923,,0.897923,,5.897923,2.897923,4.897923,3.897923,24.897923,23.897923,1.897923,11.897923,12.897923,14.897923,92.897923,104.897923,92.897923,104.897923,,27.897923,1.638307,1.638307,5.897923,6.397923,4.897923,5.897923,3.397923,3.164589,3.397923,3.064589,4.897923,5.897923,5.897923,,,1.897923,0.897923,1.897923,0.897923,3.897923,7.297923,-0.11795,-0.102077,0.897923
1460,79.897923,1376.897923,7507840.0,19.897923,357.8979,330.897923,354.897923,351.897923,348.897923,345.897923,,339.897923,336.897923,333.897923,5.897923,4.897923,6.897923,3.897923,5.897923,6.897923,6.897923,2.897923,6.897923,6.897923,7.897923,8.897923,8.897923,7.897923,7.897923,1.897923,,2.897923,,2.897923,,2.897923,2.897923,3.897923,,,3.897923,3.897923,3.897923,,3.897923,1.897923,1.897923,1.897923,,1.897923,1.897923,1.897923,0.897923,2.897923,1.897923,1.897923,4.897923,4.897923,2.897923,4.897923,4.897923,4.897923,1.897923,3.897923,5.897923,4.897923,,1.897923,5.897923,2.897923,7.897923,2.897923,4.897923,,3.897923,3.897923,4.897923,6.897923,2.897923,6.897923,,5.897923,1.897923,2.897923,,,10.897923,,5.897923,85.897923,,0.897923,0.897923,0.897923,7.897923,1.397923,0.897923,0.897923,1.897923,0.897923,0.897923,0.897923,1.897923,1.897923,2.897923,84.897923,2842.897923,19898.897923,1000.897923,1.897923,0.897923,1.897923,1.897923,,0.997923,228.897915,13.897923,34.897923,21.897923,25.897923,22.897923,26.897923,24.897923,23.897923,32.897923,,35.897923,3.897923,18.897923,19.897923,20.897923,4.897923,27.897923,10.897923,11.897923,31.897923,30.897923,,33.897923,29.897923,0.897923,28.897923,8.897923,5.897923,7.897923,6.897923,16.897923,15.897923,1.897923,10.897923,23.897923,,61.897923,,61.897923,84.897923,31.897923,46.897923,1.344981,1.344981,4.397923,6.897923,4.397923,5.897923,3.397923,3.564589,7.897923,,5.897923,4.897923,4.897923,,1.897923,1.897923,1.897923,0.897923,1.897923,4.897923,8.297923,,1.897923,0.897923
2096,60.897923,2101.897923,7581403.0,18.897923,357.8979,330.897923,354.897923,351.897923,348.897923,345.897923,342.897923,339.897923,336.897923,333.897923,3.897923,6.897923,6.897923,5.897923,5.897923,,3.897923,1.897923,6.897923,5.897923,,5.897923,6.897923,7.897923,5.897923,2.897923,1.897923,,1.897923,4.897923,2.897923,3.897923,1.897923,4.897923,3.897923,2.897923,2.897923,1.897923,4.897923,2.897923,3.897923,1.897923,6.897923,2.897923,7.897923,0.897923,3.897923,1.897923,0.897923,3.897923,4.897923,,4.897923,2.897923,3.897923,4.897923,5.897923,2.897923,2.897923,6.897923,3.897923,4.897923,1.897923,1.897923,4.897923,1.897923,7.897923,,3.897923,4.897923,4.897923,3.897923,3.897923,4.897923,3.897923,6.897923,6.897923,5.897923,2.897923,8.897923,1.897923,25.497923,20.897923,7.897923,4.897923,71.897923,,0.897923,0.897923,0.897923,4.897923,,,0.897923,0.897923,0.897923,0.897923,1.897923,1.897923,1.897923,3.897923,,1300.897923,5000.897923,30.897923,1.897923,0.897923,1.897923,,0.897923,,70.897924,24.897923,34.897923,14.897923,17.897923,16.897923,15.897923,19.897923,18.897923,29.897923,36.897923,35.897923,26.897923,3.897923,,5.897923,20.897923,33.897923,12.897923,13.897923,31.897923,32.897923,37.897923,27.897923,,0.897923,,10.897923,,9.897923,8.897923,23.897923,22.897923,1.897923,11.897923,25.897923,14.897923,92.897923,117.897923,92.897923,117.897923,,53.897923,1.799331,1.799331,4.897923,7.397923,4.897923,3.397923,4.397923,3.364589,,4.897923,3.897923,4.897923,3.897923,3.897923,1.897923,0.897923,0.897923,1.897923,0.897923,2.897923,6.697923,1.715383,-0.102077,0.897923
2120,,2228.897923,7603845.0,19.897923,357.8979,330.897923,354.897923,351.897923,348.897923,345.897923,342.897923,339.897923,336.897923,333.897923,4.897923,6.897923,2.897923,6.897923,,5.897923,4.897923,7.897923,2.897923,2.897923,5.897923,5.897923,6.897923,,5.897923,,4.897923,4.897923,4.897923,4.897923,4.897923,4.897923,2.897923,2.897923,,3.897923,3.897923,3.897923,3.897923,3.897923,3.897923,,7.897923,1.897923,5.897923,1.897923,0.897923,3.897923,2.897923,1.897923,4.897923,4.897923,3.897923,3.897923,3.897923,3.897923,3.897923,3.897923,2.897923,,4.897923,4.897923,2.897923,1.897923,5.897923,2.897923,5.897923,3.897923,7.897923,4.897923,5.897923,2.897923,5.897923,6.897923,2.897923,5.897923,5.897923,5.897923,2.897923,4.897923,1.897923,,,,,71.897923,1.897923,0.897923,0.897923,0.897923,4.897923,1.897923,0.897923,0.897923,0.897923,0.897923,0.897923,1.897923,1.897923,1.897923,3.897923,,1300.897923,5000.897923,30.897923,1.897923,0.897923,1.897923,0.897923,0.897923,,41.89792,16.897923,30.897923,21.897923,25.897923,24.897923,26.897923,22.897923,23.897923,31.897923,,35.897923,9.897923,12.897923,13.897923,14.897923,10.897923,28.897923,6.897923,7.897923,27.897923,32.897923,,29.897923,34.897923,0.897923,33.897923,20.897923,17.897923,19.897923,18.897923,,3.897923,1.897923,,2.897923,14.897923,122.897923,124.897923,,,31.897923,53.897923,1.897923,,6.397923,2.397923,,,,4.031256,4.897923,3.897923,4.897923,4.897923,7.897923,,0.897923,,0.897923,0.897923,,1.897923,6.297923,0.715383,-0.102077,
1940,,962.897923,7474666.0,,359.8979,350.897923,358.897923,357.897923,356.897923,355.897923,,353.897923,352.897923,351.897923,6.897923,2.897923,7.897923,1.897923,6.897923,1.897923,6.897923,1.897923,,5.897923,6.897923,4.897923,5.897923,2.897923,4.897923,1.897923,,2.897923,2.897923,4.897923,4.897923,,2.897923,4.897923,3.897923,4.897923,2.897923,2.897923,3.897923,3.897923,4.897923,1.897923,2.897923,1.897923,,0.897923,2.897923,2.897923,1.897923,1.897923,3.897923,2.897923,,2.897923,4.897923,3.897923,4.897923,3.897923,2.897923,15.897923,5.897923,,1.897923,1.897923,5.897923,2.897923,20.897923,2.897923,6.897923,2.897923,4.897923,4.897923,2.897923,,4.897923,7.897923,7.897923,5.897923,1.897923,2.897923,1.897923,73.897923,20.897923,5.897923,3.897923,107.897923,1.897923,0.897923,,0.897923,4.897923,1.397923,0.897923,0.897923,,0.897923,0.897923,0.897923,1.897923,1.897923,2.897923,26.897923,600.897923,,140.897923,,0.897923,1.897923,1.897923,1.897923,1.227923,113.897916,10.897923,31.897923,13.897923,17.897923,15.897923,,14.897923,16.897923,28.897923,36.897923,35.897923,,6.897923,7.897923,8.897923,22.897923,29.897923,20.897923,21.897923,33.897923,34.897923,37.897923,27.897923,32.897923,0.897923,30.897923,26.897923,23.897923,25.897923,24.897923,4.897923,3.897923,1.897923,10.897923,9.897923,14.897923,61.897923,70.897923,61.897923,70.897923,0.897923,27.897923,1.299792,1.299792,5.397923,7.897923,7.397923,6.897923,2.897923,,6.397923,3.897923,5.897923,,6.897923,2.897923,1.897923,,0.897923,,1.897923,3.897923,5.097923,0.715383,,0.897923
1034,75.897923,1864.897923,7557744.0,19.897923,357.8979,330.897923,354.897923,351.897923,,345.897923,342.897923,339.897923,336.897923,333.897923,6.897923,2.897923,,1.897923,6.897923,4.897923,6.897923,1.897923,5.897923,1.897923,,8.897923,9.897923,8.897923,7.897923,1.897923,3.897923,3.897923,2.897923,3.897923,4.897923,3.897923,3.897923,4.897923,,4.897923,4.897923,4.897923,4.897923,,4.897923,1.897923,4.897923,1.897923,5.897923,,,1.897923,1.897923,1.897923,,5.897923,4.897923,2.897923,4.897923,3.897923,4.897923,2.897923,2.897923,16.897923,4.897923,4.897923,1.897923,1.897923,4.897923,2.897923,,5.897923,,4.897923,5.897923,3.897923,5.897923,,,7.897923,7.897923,5.897923,1.897923,3.897923,3.897923,75.897923,,7.897923,5.897923,88.897923,1.897923,1.897923,0.897923,,5.897923,1.397923,0.897923,0.897923,,1.897923,0.897923,0.897923,0.897923,1.897923,2.897923,18.897923,,750.897923,105.897923,1.897923,0.897923,3.897923,1.897923,0.897923,,136.897925,26.897923,30.897923,,16.897923,13.897923,17.897923,15.897923,14.897923,28.897923,36.897923,35.897923,11.897923,23.897923,24.897923,25.897923,2.897923,,20.897923,21.897923,29.897923,,37.897923,33.897923,27.897923,0.897923,31.897923,9.897923,,8.897923,7.897923,5.897923,,1.897923,11.897923,13.897923,14.897923,,105.897923,92.897923,,31.897923,46.897923,1.568377,,7.397923,6.897923,5.897923,6.897923,2.897923,4.164589,3.897923,4.397923,4.897923,4.897923,6.897923,4.147923,1.897923,,,0.897923,1.897923,3.897923,8.697923,1.215383,-0.102077,0.897923
916,,53.897923,681916.9,18.897923,2.97e+84,,,,,,,,,,5.897923,,4.897923,3.897923,4.897923,3.897923,2.897923,2.897923,7.897923,2.897923,7.897923,,6.897923,8.897923,7.897923,2.897923,1.897923,2.897923,1.897923,,,2.897923,3.897923,4.897923,4.897923,3.897923,4.897923,2.897923,3.897923,3.897923,4.897923,,,,8.897923,0.897923,1.897923,2.897923,1.897923,1.897923,4.897923,3.897923,1.897923,3.897923,4.897923,3.897923,5.897923,4.897923,1.897923,5.897923,3.897923,3.897923,2.897923,,5.897923,1.897923,3.897923,1.897923,5.897923,3.897923,2.897923,2.897923,4.897923,7.897923,3.897923,7.897923,6.897923,4.897923,2.897923,7.897923,1.897923,71.897923,10.897923,6.897923,5.897923,104.897923,1.897923,0.897923,0.897923,0.897923,5.897923,1.397923,0.897923,0.897923,0.897923,0.897923,,0.897923,1.897923,1.897923,2.897923,25.897923,150.897923,750.897923,94.897923,1.897923,0.897923,3.897923,0.897923,,,233.897923,3.897923,,7.897923,8.897923,10.897923,12.897923,11.897923,,,36.897923,35.897923,26.897923,19.897923,20.897923,21.897923,17.897923,31.897923,5.897923,6.897923,32.897923,34.897923,37.897923,28.897923,29.897923,,27.897923,16.897923,13.897923,15.897923,,24.897923,23.897923,1.897923,11.897923,,14.897923,92.897923,,92.897923,111.897923,0.897923,27.897923,1.705615,,5.897923,5.897923,5.897923,3.397923,2.897923,3.764589,5.397923,3.564589,3.897923,3.897923,5.897923,4.647923,1.897923,1.897923,1.897923,1.897923,0.897923,4.897923,7.697923,0.38205,1.897923,0.897923
1877,133.897923,2388.897923,7612850.0,,357.8979,330.897923,354.897923,351.897923,348.897923,345.897923,,339.897923,336.897923,333.897923,,2.897923,6.897923,2.897923,6.897923,2.897923,6.897923,2.897923,6.897923,2.897923,7.897923,6.897923,7.897923,7.897923,5.897923,1.897923,2.897923,2.897923,2.897923,3.897923,3.897923,,2.897923,3.897923,3.897923,3.897923,3.897923,2.897923,3.897923,3.897923,3.897923,2.897923,20.897923,1.897923,5.897923,0.897923,0.897923,1.897923,0.897923,2.897923,,2.897923,2.897923,4.897923,2.897923,3.897923,3.897923,2.897923,2.897923,30.897923,4.897923,4.897923,1.897923,1.897923,3.897923,2.897923,40.897923,2.897923,5.897923,4.897923,4.897923,3.897923,3.897923,6.897923,2.897923,6.897923,6.897923,2.897923,2.897923,50.897923,1.897923,75.897923,10.897923,6.897923,5.897923,107.897923,1.897923,0.897923,0.897923,0.897923,4.897923,1.397923,0.897923,0.897923,0.897923,0.897923,0.897923,0.897923,1.897923,1.897923,2.897923,26.897923,600.897923,2500.897923,140.897923,1.897923,0.897923,1.897923,1.897923,1.897923,1.227923,241.897927,20.897923,34.897923,13.897923,,15.897923,14.897923,16.897923,17.897923,30.897923,36.897923,35.897923,8.897923,10.897923,,12.897923,19.897923,28.897923,25.897923,26.897923,32.897923,33.897923,37.897923,27.897923,31.897923,0.897923,29.897923,,2.897923,4.897923,3.897923,23.897923,22.897923,1.897923,12.897923,4.897923,14.897923,122.897923,126.897923,122.897923,126.897923,0.897923,27.897923,1.823156,1.823156,6.897923,6.897923,6.897923,,2.897923,3.631256,6.897923,,4.897923,4.897923,5.897923,3.897923,0.897923,1.897923,0.897923,0.897923,0.897923,1.897923,7.297923,0.715383,-0.102077,
1922,52.897923,602.897923,7437797.0,18.897923,357.8979,331.897923,354.897923,351.897923,349.897923,346.897923,343.897923,340.897923,337.897923,334.897923,5.897923,2.897923,7.897923,2.897923,7.897923,1.897923,7.897923,1.897923,5.897923,2.897923,7.897923,4.897923,6.897923,7.897923,,1.897923,,2.897923,2.897923,3.897923,2.897923,3.897923,3.897923,2.897923,4.897923,4.897923,4.897923,4.897923,4.897923,4.897923,4.897923,2.897923,,1.897923,2.897923,2.897923,2.897923,1.897923,0.897923,1.897923,2.897923,3.897923,1.897923,2.897923,3.897923,,2.897923,4.897923,2.897923,6.897923,4.897923,5.897923,1.897923,1.897923,1.897923,1.897923,4.897923,5.897923,5.897923,,4.897923,4.897923,3.897923,5.897923,3.897923,7.897923,6.897923,5.897923,2.897923,3.897923,,72.897923,20.897923,6.897923,,107.897923,1.897923,,0.897923,0.897923,4.897923,1.397923,0.897923,0.897923,0.897923,0.897923,0.897923,0.897923,1.897923,1.897923,2.897923,26.897923,,2500.897923,140.897923,1.897923,,1.897923,1.897923,1.897923,1.227923,87.897919,16.897923,31.897923,10.897923,15.897923,13.897923,14.897923,11.897923,12.897923,34.897923,36.897923,35.897923,22.897923,18.897923,19.897923,20.897923,,29.897923,3.897923,4.897923,30.897923,32.897923,37.897923,33.897923,28.897923,0.897923,27.897923,26.897923,23.897923,25.897923,24.897923,7.897923,6.897923,1.897923,9.897923,23.897923,14.897923,31.897923,54.897923,31.897923,54.897923,0.897923,27.897923,1.150259,1.150259,7.397923,7.897923,6.897923,7.397923,3.397923,3.964589,6.397923,3.231256,4.897923,5.897923,5.897923,2.897923,0.897923,1.897923,0.897923,1.897923,0.897923,2.897923,7.297923,0.048716,1.897923,0.897923
1487,,1401.897923,7510631.0,21.897923,357.8979,330.897923,354.897923,,,345.897923,342.897923,339.897923,336.897923,,3.897923,1.897923,5.897923,1.897923,,5.897923,6.897923,3.897923,7.897923,3.897923,8.897923,5.897923,7.897923,7.897923,8.897923,1.897923,2.897923,2.897923,2.897923,2.897923,2.897923,,2.897923,3.897923,3.897923,4.897923,3.897923,2.897923,3.897923,,4.897923,2.897923,,1.897923,6.897923,0.897923,2.897923,,0.897923,2.897923,3.897923,2.897923,2.897923,2.897923,2.897923,4.897923,2.897923,2.897923,2.897923,25.897923,4.897923,5.897923,1.897923,1.897923,4.897923,1.897923,,1.897923,5.897923,3.897923,4.897923,3.897923,4.897923,5.897923,3.897923,,6.897923,5.897923,2.897923,7.897923,3.897923,,,,,85.897923,1.897923,0.897923,0.897923,0.897923,7.897923,1.397923,,0.897923,1.897923,0.897923,0.897923,0.897923,1.897923,1.897923,2.897923,84.897923,,19898.897923,1000.897923,1.897923,0.897923,1.897923,1.897923,1.897923,0.997923,168.897927,5.897923,31.897923,11.897923,15.897923,13.897923,12.897923,16.897923,14.897923,27.897923,36.897923,35.897923,,18.897923,,20.897923,,28.897923,23.897923,24.897923,33.897923,34.897923,37.897923,30.897923,29.897923,0.897923,32.897923,9.897923,6.897923,8.897923,7.897923,,3.897923,1.897923,10.897923,24.897923,14.897923,61.897923,,,,31.897923,46.897923,1.356746,,,5.897923,,7.397923,1.897923,3.697923,6.397923,4.231256,4.897923,5.897923,5.897923,3.897923,0.897923,1.897923,0.897923,1.897923,0.897923,2.897923,7.897923,1.048716,-0.102077,


In [119]:
synthetic_df.to_csv('data/ml3_generate_data_approach_2.csv', index=False)

In [102]:
df.to_csv('ml3_sci_imputation_random_forest_2.csv', index=False)

In [71]:
len(remaining_features)

19

In [51]:
s = []
for target in list(df):
    try:
        X = np.nan_to_num(df.iloc[:, df.columns != target].values)
        y = np.nan_to_num(df.iloc[:, df.columns == target].values)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
        svm = SVM() # Linear Kernel
        data_dict = {}
        svm.fit(data=data_dict)
        svm.visualize() 
        y_pred = model.predict(X_test)  
        y_pred = y_pred.astype(int)
        if accuracy_score(y_test, y_pred) > 0.5:
            count += 1
            print(target,': ',accuracy_score(y_test, y_pred))
        s.append(str(accuracy_score(y_test, y_pred)))
    except:
        s.append('')
        print(target)
        pass

age
backcount1
backcount10
backcount2 :  0.9206680584551148
backcount3
backcount4
backcount5 :  0.860125260960334
backcount6
backcount7
backcount8
backcount9
gender :  0.7682672233820459
intrinsic_01 :  0.6471816283924844
intrinsic_02 :  0.6450939457202505
intrinsic_03 :  0.5532359081419624
intrinsic_04 :  0.5824634655532359
intrinsic_05 :  0.5198329853862212
intrinsic_06 :  0.5052192066805845
intrinsic_07 :  0.5365344467640919
intrinsic_08 :  0.6033402922755741
intrinsic_10 :  0.6805845511482255
intrinsic_11 :  0.5135699373695198
intrinsic_13 :  0.6096033402922756
intrinsic_15 :  0.6430062630480167
kposition :  0.964509394572025
kratio
lposition :  0.9686847599164927
lratio
mcdv1 :  0.6931106471816284
mcfiller1 :  0.5991649269311065
mcfiller2 :  0.605427974947808
mcfiller3 :  0.7139874739039666
mood_01 :  0.5762004175365344
mood_02 :  0.534446764091858
nfc_02 :  0.5365344467640919
nfc_03 :  0.5219206680584552
nfc_04 :  0.5260960334029228
nposition :  0.9561586638830898
nratio
pate_01 

In [122]:
#data = {'RandomTrees':r, 'RandomTrees with prediction': rp,'SVM':s, 'Ridge':rr, 'Lasso':l} 
data = {'Approach 1':rp, 'Approach 2': rp2}
  
# Create DataFrame 
d = pd.DataFrame(data) 
  
# Print the output. 
#d

In [123]:
f = d.copy()
f.index = list(df)
#f['RandomTrees With Back Prediction'] = rp
#f

In [124]:
f

Unnamed: 0,Approach 1,Approach 2
Participant_ID,0.008333333333333333,0.0
RowNumber,0.008333333333333333,0.0
session_id,0.0,0.0
age,0.18333333333333332,0.09166666666666666
backcount1,0.06666666666666667,0.06666666666666667
backcount10,0.058333333333333334,0.058333333333333334
backcount2,0.06666666666666667,0.2
backcount3,0.15,0.13333333333333333
backcount4,0.21666666666666667,0.08333333333333333
backcount5,0.175,0.10833333333333334


In [125]:
f.to_csv('synthetic_data_goodness.csv')

In [126]:
val =  pd.read_csv('data/model_validations.csv', encoding = 'ISO-8859-1')
val2 = pd.read_csv('data/synthetic_data_goodness.csv', encoding = 'ISO-8859-1')

In [132]:
val2['per1'] = val2['Approach 1']/val2['Original']
val2['per2'] = val2['Approach 2']/val2['Original']

In [133]:
val2

Unnamed: 0.1,Unnamed: 0,Approach 1,Approach 2,Original,per1,per2
0,Participant_ID,0.008333,0.000000,0.131524,0.063360,0.000000
1,RowNumber,0.008333,0.000000,0.146138,0.057024,0.000000
2,session_id,0.000000,0.000000,0.008351,0.000000,0.000000
3,age,0.183333,0.091667,0.565762,0.324047,0.162023
4,backcount1,0.066667,0.066667,0.000000,inf,inf
5,backcount10,0.058333,0.058333,,,
6,backcount2,0.066667,0.200000,,,
7,backcount3,0.150000,0.133333,,,
8,backcount4,0.216667,0.083333,,,
9,backcount5,0.175000,0.108333,,,


In [199]:
df.shape

(2391, 182)