In [None]:
def get_clean_dataset(dataset,category,target_binary=1):
    dataset = dataset.dropna()
    if target_binary:
        dataset["difficulty"].replace(regex={"hard": 1, "easy": 0},
                                      inplace=True)
    clean_dataset = dataset[dataset["category"] == category]
    target = clean_dataset["difficulty"]
    clean_dataset = clean_dataset.drop(labels=["participant_id","difficulty","category","order","turn"], 
                                       axis=1).reset_index(drop=True)
    return clean_dataset,target

In [145]:
def accuracy_print(X_test,y_test,y_pred):
    print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
    print(accuracy_score(y_test,y_pred))

In [140]:
def naive_bayes(X_train,y_train,X_test):
    gnb = GaussianNB()
    y_pred = gnb.fit(X_train, y_train).predict(X_test)
    return y_pred

In [160]:
def desicion_tree(X_train,y_train,X_test,grid_search=1):
    dtc = DecisionTreeClassifier() # Create Decision Tree classifer object
    if grid_search:
        parameters = {"criterion":("gini", "entropy"),
                  "splitter":("best","random"),
                  "max_depth":[None,5,10,15,20,25,30,35,40],
                  "max_features":("auto","sqrt","log2"),
                  "random_state":[None,0,10,100]}
        clf = GridSearchCV(dtc, parameters)
        y_pred = clf.fit(X_train, y_train).predict(X_test) # Train Decision Tree Classifer and Predict the response
        return y_pred
    y_pred = dtc.fit(X_train, y_train).predict(X_test)
    return y_pred

In [158]:
def svm_class(X_train,y_train,X_test,grid_search=1):
    svc = SVC()
    if grid_search:
        parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 
                  'C':[1, 10],
                  'degree':[1,2,3,4],
                  'gamma':('scale','auto'),
                  'shrinking':[1,0],
                  'decision_function_shape':("ovo","ovr")}
        clf = GridSearchCV(svc, parameters)
        y_pred = clf.fit(X_train, y_train).predict(X_test)
        return y_pred
    
    y_pred = svc.fit(X_train, y_train).predict(X_test)
    return y_pred

## MODELS

In [1]:
import pandas as pd

In [4]:
path = "./../archivos/"
filename = "dataset.pkl"

In [67]:
dataset = pd.read_pickle(path+filename)

In [169]:
numeric,target_numeric = get_clean_dataset(dataset,"numeric")

In [170]:
sequence, target_sequence = get_clean_dataset(dataset,"sequence")

In [182]:
verbal,target_verbal = get_clean_dataset(dataset,"verbal")

In [133]:
from sklearn.model_selection import train_test_split

In [203]:
size_test = 0.2
#verbal
X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(verbal, target_verbal, test_size=size_test, random_state=0,
                                                           stratify=target_verbal)
#sequence
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(sequence, target_sequence, test_size=size_test, random_state=0,
                                                           stratify=target_sequence)
#numeric
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(numeric, target_numeric, test_size=size_test, random_state=0,
                                                           stratify=target_numeric)

In [89]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

## Verbal

#### Naive Bayes

In [136]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [204]:
y_pred_naive_v = naive_bayes(X_train_v,y_train_v,X_test_v)
accuracy_print(X_test_v,y_test_v,y_pred_naive_v)

Number of mislabeled points out of a total 2 points : 1
0.5


#### SVM

In [102]:
from sklearn.svm import SVC

In [205]:
y_pred_svm_v = svm_class(X_train_v,y_train_v,X_test_v)
accuracy_print(X_test_v,y_test_v,y_pred_svm_v)



Number of mislabeled points out of a total 2 points : 1
0.5


#### Desicion Tree

In [114]:
from sklearn.tree import DecisionTreeClassifier

In [207]:
y_pred_dt_v = desicion_tree(X_train_v,y_train_v,X_test_v,grid_search=0)
accuracy_print(X_test_v,y_test_v,y_pred_dt_v)

Number of mislabeled points out of a total 2 points : 0
1.0


## Sequence

#### Naive Bayes

In [200]:
y_pred_svm_s = svm_class(X_train_s,y_train_s,X_test_s)
accuracy_print(X_test_s,y_test_s,y_pred_svm_s)



Number of mislabeled points out of a total 2 points : 1
0.5




#### SVM

In [199]:
y_pred_svm_s = svm_class(X_train_s,y_train_s,X_test_s)
accuracy_print(X_test_s,y_test_s,y_pred_svm_s)



Number of mislabeled points out of a total 2 points : 1
0.5




#### Desicion Tree

In [202]:
y_pred_dt_s = desicion_tree(X_train_s,y_train_s,X_test_s,grid_search=0)
accuracy_print(X_test_s,y_test_s,y_pred_dt_s)

Number of mislabeled points out of a total 2 points : 1
0.5


## Numeric

#### Naive Bayes

In [208]:
y_pred_svm_n = svm_class(X_train_n,y_train_n,X_test_n)
accuracy_print(X_test_n,y_test_n,y_pred_svm_n)



Number of mislabeled points out of a total 2 points : 1
0.5




#### SVM

In [209]:
y_pred_svm_n = svm_class(X_train_n,y_train_n,X_test_n)
accuracy_print(X_test_n,y_test_n,y_pred_svm_n)



Number of mislabeled points out of a total 2 points : 1
0.5




#### Desicion Tree

In [210]:
y_pred_dt_n = desicion_tree(X_train_n,y_train_n,X_test_n,grid_search=0)
accuracy_print(X_test_n,y_test_n,y_pred_dt_n)

Number of mislabeled points out of a total 2 points : 1
0.5
