In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import pandas as pd
import pickle
from sklearn import metrics, tree, svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,cross_val_score,train_test_split,LeaveOneOut
from sklearn.naive_bayes import MultinomialNB

from data_load import *
from dictionaries import *

# Define the Model Parameters

`model_naming_convention = model-type_encoding_directory_datastructure_column-set_version`


Example: `nb_le_f0_d0_c0_v0` is a Naive Bayes (`nb`) model which is label encoded, using data set 1, column set 1, version 1, on the basic untreated data set, for the family of multi-label classifiers.

`model_type` will be prepended to the model name as it is run through each of the model cells below.

Link to model building log: https://docs.google.com/spreadsheets/d/1py4RVZ0er_JDeJo-oxY29QT6__EWHIeU6zBgp-q8Wog/edit?usp=sharing

In [None]:
# Location of the quiz-response CSV handed to the data loaders.
directory = 'data/quiz_data.csv'
# Full naming template:
#   model_name = 'model-type_encoding_directory_datastructure_column-set_version'
# Each model cell below prepends its own prefix to this experiment suffix.
experiment_model_name = 'test'

# Columns used for modelling; 'program' (last entry) is the prediction target
# that the model cells split off from the features.
column_list = [
    'problem_type', 'creative', 'outdoors', 'career',
    'group_work', 'liked_courses', 'disliked_courses', 'programming',
    'join_clubs', 'not_clubs', 'liked_projects', 'disliked_projects',
    'tv_shows', 'alternate_degree', 'expensive_equipment', 'drawing',
    'essay', 'architecture', 'automotive', 'business',
    'construction', 'health', 'environment', 'manufacturing',
    'technology', 'program',
]

# Balancing value per program code; currently the same (10) for every program.
data_balance = {
    program_code: 10
    for program_code in (
        'mech', 'bmed', 'swe', 'tron', 'cive',
        'chem', 'syde', 'msci', 'ce', 'elec',
        'nano', 'geo', 'env', 'arch-e', 'arch',
    )
}
# Use `data_balance = False` instead only when working with the untreated
# data (code d0).

# Ratio of other programs to the target program in the binary classifiers:
# 2 means double the other programs, 0.5 means half.
data_balance_multiple = 1

# All-zero probe row, shape (1, n_features): one entry for every column in
# column_list except the 'program' target.
test_vector = np.array([[0] * (len(column_list) - 1)])

### Define Encoding

 For each new type of encoding defined (other than the default label encoding) we need to define a new list of variables which are to be one hot encoded. This list name should match the encoding code that you will place in the dictionary in the model building google sheet.

For each new type of encoding created, a new code block needs to be added for each model in each classifier family. Then copy the code for the one hot encoded models and change the `one_hot_encode` list to the new list you created for this type of encoding. Once all the code blocks are added, you can run those cells!

In [None]:
# Columns that get one-hot encoded under the default "ohe" encoding code.
ohe = [
    'problem_type', 'creative', 'outdoors', 'career',
    'group_work', 'liked_courses', 'disliked_courses', 'programming',
    'join_clubs', 'not_clubs', 'liked_projects', 'disliked_projects',
    'tv_shows', 'alternate_degree', 'expensive_equipment', 'drawing',
    'essay',
]

# Encoding codes "m0" and "m1" currently list exactly the same columns as
# `ohe`. Each one is an independent copy, so editing one list never changes
# another.
m0 = list(ohe)
m1 = list(ohe)

## Multilabel Classifiers

### Naive Bayes - Label Encoded

In [None]:
# Multilabel Naive Bayes trained on the label-encoded quiz answers.
model_name = 'nb_le_f0_' + experiment_model_name
data = get_label_encoded_data(directory, model_name, column_list, 'H')[0]
data = data[column_list]  # keep exactly the configured columns, in column_list order

# "program" is the target; every remaining column is a feature.
y_df = data["program"]
x_df = data.drop(columns=["program"])

# The estimator is fitted on plain numpy arrays.
X, Y = np.array(x_df), np.array(y_df)

mnb = MultinomialNB()
model = mnb.fit(X, Y)

# Feature name -> column position in X; handed to save_model with the model.
cat = {column: position for position, column in enumerate(x_df.columns)}

# Sanity check: predict on the all-zero probe row and look the predicted
# label up in INV_INDEX_PROGRAM.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

save_model(model, cat, model_name)
test_model(model_name, test_vector)

### Naive Bayes - One Hot Encoded

In [None]:
# Multilabel Naive Bayes trained on the merged (one-hot + label) encoding.
model_name = 'nb_ohe_f0_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
)

# "program" is the target; every remaining column is a feature.
y_df = data["program"]
x_df = data.drop(columns=["program"])

# The estimator is fitted on plain numpy arrays.
X, Y = np.array(x_df), np.array(y_df)

mnb = MultinomialNB()
model = mnb.fit(X, Y)

# Feature name -> column position in X; handed to save_model with the model.
cat = {column: position for position, column in enumerate(x_df.columns)}

save_model(model, cat, model_name)

### Logistic Regression - Label Encoded

In [None]:
# Multilabel logistic regression trained on the label-encoded quiz answers.
model_name = 'lrr_le_f0_' + experiment_model_name
data = get_label_encoded_data(directory, model_name, column_list, 'H')[0]
# Pin the frame to the configured columns, as the Naive Bayes label-encoded
# cell does. test_vector has len(column_list) - 1 entries, so the feature
# matrix must have exactly those columns for the prediction below to line up.
data = data[column_list]

# "program" is the target; every remaining column is a feature.
x_df = data.drop(columns=["program"])
y_df = data["program"]

X = np.array(x_df)  # feature matrix
Y = np.array(y_df)  # target labels

# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument - confirm against the pinned scikit-learn version.
LRR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model = LRR.fit(X, Y)

# Feature name -> column position in X; handed to save_model with the model.
cat = {column: position for position, column in enumerate(x_df.columns)}

# Sanity check: predict on the all-zero probe row and look the predicted
# label up in INV_INDEX_PROGRAM.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

save_model(model, cat, model_name)
test_model(model_name, test_vector)

### Logistic Regression - One Hot Encoded

In [None]:
# Multilabel logistic regression trained on the merged (one-hot + label) encoding.
model_name = 'lrr_ohe_f0_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
)

# "program" is the target; every remaining column is a feature.
y_df = data["program"]
x_df = data.drop(columns=["program"])

# The estimator is fitted on plain numpy arrays.
X, Y = np.array(x_df), np.array(y_df)

LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
model = LRR.fit(X, Y)

# Feature name -> column position in X; handed to save_model with the model.
cat = {column: position for position, column in enumerate(x_df.columns)}

save_model(model, cat, model_name)

### Support Vector Machine - Label Encoded

In [None]:
# Multilabel support vector machine trained on the label-encoded quiz answers.
model_name = 'svm_le_f0_' + experiment_model_name
data = get_label_encoded_data(directory, model_name, column_list, 'H')[0]
# Pin the frame to the configured columns, as the Naive Bayes label-encoded
# cell does. test_vector has len(column_list) - 1 entries, so the feature
# matrix must have exactly those columns for the prediction below to line up.
data = data[column_list]

# "program" is the target; every remaining column is a feature.
x_df = data.drop(columns=["program"])
y_df = data["program"]

X = np.array(x_df)  # feature matrix
Y = np.array(y_df)  # target labels

# probability=True makes SVC fit its probability estimates with an internal,
# randomly shuffled cross-validation. Seed it (same seed as the logistic
# regression cells) so the saved model is reproducible across runs.
SVM = svm.SVC(probability=True, random_state=0)
model = SVM.fit(X, Y)

# Feature name -> column position in X; handed to save_model with the model.
cat = {column: position for position, column in enumerate(x_df.columns)}

# Sanity check: predict on the all-zero probe row and look the predicted
# label up in INV_INDEX_PROGRAM.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

save_model(model, cat, model_name)
test_model(model_name, test_vector)

### Support Vector Machine - One Hot Encoded

In [None]:
# Multilabel support vector machine trained on the merged (one-hot + label) encoding.
model_name = 'svm_ohe_f0_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
)

# "program" is the target; every remaining column is a feature.
x_df = data.drop(columns=["program"])
y_df = data["program"]

X = np.array(x_df)  # feature matrix
Y = np.array(y_df)  # target labels

# probability=True makes SVC fit its probability estimates with an internal,
# randomly shuffled cross-validation. Seed it (same seed as the logistic
# regression cells) so the saved model is reproducible across runs.
SVM = svm.SVC(probability=True, random_state=0)
model = SVM.fit(X, Y)

# Feature name -> column position in X; handed to save_model with the model.
cat = {column: position for position, column in enumerate(x_df.columns)}

save_model(model, cat, model_name)

## Binary Classifiers

### Naive Bayes - Label Encoded

In [None]:
# Binary-classifier family: Naive Bayes on label-encoded data.
model_name = 'nb_le_f1_' + experiment_model_name
data = get_label_encoded_data(directory, model_name, column_list, 'H')[0]

mnb = MultinomialNB()
model_type = mnb  # second name bound to the same estimator instance

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, mnb)

### Naive Bayes - One Hot Encoded

In [None]:
# Binary-classifier family: Naive Bayes on the merged (one-hot + label) encoding.
model_name = 'nb_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
)

mnb = MultinomialNB()
model_type = mnb  # second name bound to the same estimator instance

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, mnb)

### Logistic Regression - Label Encoded

In [None]:
# Binary-classifier family: logistic regression on label-encoded data.
model_name = 'lrr_le_f1_' + experiment_model_name
data = get_label_encoded_data(directory, model_name, column_list, 'H')[0]

LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, LRR)

### Logistic Regression - One Hot Encoded

In [None]:
# Binary-classifier family: logistic regression on the merged (one-hot + label) encoding.
model_name = 'lrr_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
)

LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, LRR)

### Support Vector Machine - Label Encoded

In [None]:
# Binary-classifier family: support vector machine on label-encoded data.
model_name = 'svm_le_f1_' + experiment_model_name
data = get_label_encoded_data(directory, model_name, column_list, 'H')[0]

# probability=True makes SVC fit its probability estimates with an internal,
# randomly shuffled cross-validation. Seed it (same seed as the logistic
# regression cells) so repeated runs give the same estimator behaviour.
SVM = svm.SVC(probability=True, random_state=0)

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, SVM)

### Support Vector Machine - One Hot Encoded

In [None]:
# Binary-classifier family: support vector machine on the merged (one-hot + label) encoding.
model_name = 'svm_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
)

# probability=True makes SVC fit its probability estimates with an internal,
# randomly shuffled cross-validation. Seed it (same seed as the logistic
# regression cells) so repeated runs give the same estimator behaviour.
SVM = svm.SVC(probability=True, random_state=0)

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, SVM)

### Decision Tree - Label Encoded

In [None]:
# Binary-classifier family: decision tree on label-encoded data.
model_name = 'tree_le_f1_' + experiment_model_name
data = get_label_encoded_data(directory, model_name, column_list, 'H')[0]

# Decision trees permute the candidate features randomly at every split, so
# an unseeded tree can differ between runs when splits tie. Seed it (same
# seed as the logistic regression cells) to keep the fitted trees reproducible.
ent = tree.DecisionTreeClassifier(random_state=0)

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, ent)

### Decision Tree - One Hot Encoded

In [None]:
# Binary-classifier family: decision tree on the merged (one-hot + label) encoding.
model_name = 'tree_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
)

# Decision trees permute the candidate features randomly at every split, so
# an unseeded tree can differ between runs when splits tie. Seed it (same
# seed as the logistic regression cells) to keep the fitted trees reproducible.
ent = tree.DecisionTreeClassifier(random_state=0)

# data_balance_multiple is the ratio of other programs to the target program.
binary_classifier(data, model_name, data_balance_multiple, ent)