In [2]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import pandas as pd
import pickle
from sklearn import metrics, tree, svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,cross_val_score,train_test_split,LeaveOneOut
from sklearn.naive_bayes import MultinomialNB

from data_load import *
from dictionaries import *

# Define the Model Parameters

model_naming_convention = model-type_encoding_classifier-family_data-set_data-balance_column-set_data-balance-multiple


Example: `nb_le_f0_d0_b0_c0_v0` is a Naive Bayes model (`nb`) that is label encoded (`le`), belongs to the family of multi-label classifiers (`f0`), and is built from data set `d0` on the basic untreated data with no data balancing (`b0`), using column set `c0` and data balance multiple `v0`.

The model type, encoding and classifier family are prepended to the experiment name as the experiment is run through each of the model cells below.

Link to model building log: https://docs.google.com/spreadsheets/d/1py4RVZ0er_JDeJo-oxY29QT6__EWHIeU6zBgp-q8Wog/edit?usp=sharing

### Define Directory

In [3]:
# CSV file for data-set code d0 (relative path); selected as `directory` in the
# experiment set-up cell and passed to the data loading helpers.
d0 = 'data/d0.csv'

### Define Columns

In [4]:
# Column sets. Every set ends up containing the 'program' target column.
# The sets are composed from c1 so the shared columns are spelled out once;
# each concatenation/comprehension yields a new, independent list.

# c1: industry-interest columns plus the 'program' target.
c1 = [
    'architecture',
    'automotive',
    'business',
    'construction',
    'health',
    'environment',
    'manufacturing',
    'technology',
    'program',
]

# c0: every questionnaire column followed by the c1 columns.
c0 = [
    'problem_type',
    'creative',
    'outdoors',
    'career',
    'group_work',
    'liked_courses',
    'disliked_courses',
    'programming',
    'join_clubs',
    'not_clubs',
    'liked_projects',
    'disliked_projects',
    'tv_shows',
    'alternate_degree',
    'expensive_equipment',
    'drawing',
    'essay',
] + c1

# c2: c0 without the 'programming' column (order otherwise unchanged).
c2 = [name for name in c0 if name != 'programming']

# c3: the c1 columns followed by a reduced set of questionnaire columns.
c3 = c1 + [
    'outdoors',
    'career',
    'liked_courses',
    'disliked_courses',
    'join_clubs',
    'not_clubs',
    'liked_projects',
    'disliked_projects',
    'drawing',
]

# c4: 'alternate_degree' followed by the c1 columns.
c4 = ['alternate_degree'] + c1


### Define Data Balance Dictionary

In [5]:
# b0: no data balancing; only relevant when the untreated data (code d0) is used.
b0 = False

# b1: the same value (100) for every program code.
# NOTE(review): presumably the per-program row count used when balancing —
# confirm against the data loading helpers.
b1 = dict.fromkeys(
    [
        'mech', 'bmed', 'swe', 'tron', 'cive',
        'chem', 'syde', 'msci', 'ce', 'elec',
        'nano', 'geo', 'env', 'arch-e', 'arch',
    ],
    100,
)

### Define Data Balance Multiple

In [6]:
# Data balance multiple v0: ratio of rows from other programs to rows from the
# target program in each binary classifier. 2 means twice as many rows from
# other programs as from the target program; 0.5 means half as many.
v0 = 1

 # <font color='red'> Set Up Parameters for the Current Experiment</font> 

In [7]:
# Naming templates:
#   model_name            = 'model-type_encoding_directory_datastructure_column-set_version'
#   experiment_model_name = 'dataSet_dataBalance_columnSet_dataBalanceMultiple'
#
# experiment_model_name is a hand-written label: keep it in sync with the four
# selections below, because it is not derived from them.
experiment_model_name = 'd0_b0_c4_v0'

directory = d0                # data set
data_balance = b0             # data balance setting
column_list = c4              # column set
data_balance_multiple = v0    # ratio of other programs to program in binary classifier (2 = double, 0.5 = half)

# Single all-zero row with one entry fewer than column_list (the model cells
# drop the 'program' column before fitting); used to try out the fitted models.
test_vector = np.array([[0] * (len(column_list) - 1)])

### Define Encoding

 For each new type of encoding defined (other than the default label encoding) we need to define a new list of variables which are to be one hot encoded. This list name should match the encoding code that you will place in the dictionary in the model building google sheet.

 For each new type of encoding created, a new code block needs to be added under each model in each classifier family. To do so, copy the code of the corresponding one hot encoded model and change the `one_hot_encode` list to the new list you created for this type of encoding. Once all the code blocks are added, you can run those cells!

In [8]:
# Candidate columns to one-hot encode for encoding code 'ohe'.
ohe = [
    'problem_type', 'creative', 'outdoors', 'career', 'group_work',
    'liked_courses', 'disliked_courses', 'programming',
    'join_clubs', 'not_clubs',
    'liked_projects', 'disliked_projects',
    'tv_shows', 'alternate_degree', 'expensive_equipment',
    'drawing', 'essay',
]

# Encoding code 'm0' currently uses the same candidates as 'ohe'. It is copied
# here (before filtering) so the two lists remain independent objects; replace
# this with an explicit list when 'm0' needs different columns.
m0 = list(ohe)

# Keep only the candidates that are part of this experiment's column_list,
# preserving the candidate order.
ohe = [column for column in ohe if column in column_list]
m0 = [column for column in m0 if column in column_list]

# m1 =  [
#         'problem_type', 
#         'creative', 
#         'outdoors', 
#         'career',
#         'group_work', 
#         'liked_courses', 
#         'disliked_courses', 
#         'programming',
#         'join_clubs', 
#         'not_clubs', 
#         'liked_projects',
#         'disliked_projects',
#         'tv_shows', 
#         'alternate_degree', 
#         'expensive_equipment', 
#         'drawing',
#         'essay'
#         ] 

## Multilabel Classifiers

### Naive Bayes - Label Encoded

In [9]:
# Multi-label Naive Bayes on label-encoded data.
model_name = 'nb_le_f0_' + experiment_model_name
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]  # only the first element of the helper's result is used here

# Separate the feature columns from the 'program' target.
x_df = data.drop("program", axis=1)
y_df = data["program"]
X, Y = np.array(x_df), np.array(y_df)  # plain numpy arrays for scikit-learn

# Fit on the full data set (no train/test split in this cell).
mnb = MultinomialNB()
model = mnb.fit(X, Y)

# Feature column name -> positional index of that column in X.
cat = {column: position for position, column in enumerate(x_df.columns)}

# Predicted program for the all-zero test vector.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

# Persist the model with its column map, then run test_model on the saved name.
save_model(data, model, cat, model_name)
test_model(model_name, test_vector)

tron
nb_le_f0_d0_b0_c4_v0 created..
Loading CAT file...
Loading model...
Results:
{'arch': 0.0419, 'arch-e': 0.034, 'bmed': 0.0672, 'ce': 0.1041, 'chem': 0.0629, 'cive': 0.0658, 'elec': 0.0521, 'env': 0.0521, 'geo': 0.0188, 'mech': 0.1142, 'msci': 0.0853, 'nano': 0.0651, 'swe': 0.0564, 'syde': 0.0607, 'tron': 0.1193}


### Naive Bayes - One Hot Encoded

In [10]:
# Multi-label Naive Bayes on one-hot encoded data.
model_name = 'nb_ohe_f0_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# Separate the feature columns from the 'program' target.
x_df = data.drop("program", axis=1)
y_df = data["program"]
X, Y = np.array(x_df), np.array(y_df)  # plain numpy arrays for scikit-learn

# Fit on the full data set (no train/test split in this cell).
mnb = MultinomialNB()
model = mnb.fit(X, Y)

# Feature column name -> positional index of that column in X.
cat = {column: position for position, column in enumerate(x_df.columns)}

save_model(data, model, cat, model_name)

nb_ohe_f0_d0_b0_c4_v0 created..


### Logistic Regression - Label Encoded

In [11]:
# Multi-label (multinomial) Logistic Regression on label-encoded data.
model_name = 'lrr_le_f0_' + experiment_model_name
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]  # only the first element of the helper's result is used here

# Separate the feature columns from the 'program' target.
x_df = data.drop("program", axis=1)
y_df = data["program"]
X, Y = np.array(x_df), np.array(y_df)  # plain numpy arrays for scikit-learn

# random_state is fixed so repeated runs build the same model.
LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
model = LRR.fit(X, Y)  # fit on the full data set

# Feature column name -> positional index of that column in X.
cat = {column: position for position, column in enumerate(x_df.columns)}

# Predicted program for the all-zero test vector.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

# Persist the model with its column map, then run test_model on the saved name.
save_model(data, model, cat, model_name)
test_model(model_name, test_vector)

mech
lrr_le_f0_d0_b0_c4_v0 created..
Loading CAT file...
Loading model...
Results:
{'arch': 0.0392, 'arch-e': 0.0361, 'bmed': 0.0362, 'ce': 0.0639, 'chem': 0.0874, 'cive': 0.1392, 'elec': 0.0451, 'env': 0.0354, 'geo': 0.0303, 'mech': 0.1724, 'msci': 0.059, 'nano': 0.1212, 'swe': 0.0351, 'syde': 0.0321, 'tron': 0.0674}




### Logistic Regression - One Hot Encoded

In [12]:
# Multi-label (multinomial) Logistic Regression on one-hot encoded data.
model_name = 'lrr_ohe_f0_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# Separate the feature columns from the 'program' target.
x_df = data.drop("program", axis=1)
y_df = data["program"]
X, Y = np.array(x_df), np.array(y_df)  # plain numpy arrays for scikit-learn

# random_state is fixed so repeated runs build the same model.
LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
model = LRR.fit(X, Y)  # fit on the full data set

# Feature column name -> positional index of that column in X.
cat = {column: position for position, column in enumerate(x_df.columns)}

save_model(data, model, cat, model_name)

lrr_ohe_f0_d0_b0_c4_v0 created..


### Support Vector Machine - Label Encoded

In [13]:
# Multi-label Support Vector Machine on label-encoded data.
model_name = 'svm_le_f0_' + experiment_model_name
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]  # only the first element of the helper's result is used here

# Separate the feature columns from the 'program' target.
x_df = data.drop(axis=1, columns=["program"])
y_df = data["program"]

X = np.array(x_df)  # convert dataframe into np array
Y = np.array(y_df)  # convert dataframe into np array

# probability=True makes SVC fit an internal cross-validated probability
# calibration that shuffles the data. Without a fixed random_state the saved
# model's probabilities change from run to run, so seed it (0, matching the
# Logistic Regression cells) to keep Restart & Run All reproducible.
SVM = svm.SVC(probability=True, random_state=0)
model = SVM.fit(X, Y)  # fit the model on the full data set

# Feature column name -> positional index of that column in X.
cat = data.drop('program', axis=1)
cat = dict(zip(cat.columns, range(cat.shape[1])))

# Predicted program for the all-zero test vector.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

# Persist the model with its column map, then run test_model on the saved name.
save_model(data, model, cat, model_name)
test_model(model_name, test_vector)



nano
svm_le_f0_d0_b0_c4_v0 created..
Loading CAT file...
Loading model...
Results:
{'arch': 0.025, 'arch-e': 0.0187, 'bmed': 0.0401, 'ce': 0.0524, 'chem': 0.1186, 'cive': 0.0654, 'elec': 0.0662, 'env': 0.0772, 'geo': 0.0499, 'mech': 0.133, 'msci': 0.0594, 'nano': 0.1824, 'swe': 0.0168, 'syde': 0.0367, 'tron': 0.0581}


### Support Vector Machine - One Hot Encoded

In [14]:
# Multi-label Support Vector Machine on one-hot encoded data.
model_name = 'svm_ohe_f0_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# Separate the feature columns from the 'program' target.
x_df = data.drop(axis=1, columns=["program"])
y_df = data["program"]

X = np.array(x_df)  # convert dataframe into np array
Y = np.array(y_df)  # convert dataframe into np array

# probability=True makes SVC fit an internal cross-validated probability
# calibration that shuffles the data. Without a fixed random_state the saved
# model's probabilities change from run to run, so seed it (0, matching the
# Logistic Regression cells) to keep Restart & Run All reproducible.
SVM = svm.SVC(probability=True, random_state=0)
model = SVM.fit(X, Y)  # fit the model on the full data set

# Feature column name -> positional index of that column in X.
cat = data.drop('program', axis=1)
cat = dict(zip(cat.columns, range(cat.shape[1])))

save_model(data, model, cat, model_name)



svm_ohe_f0_d0_b0_c4_v0 created..


## Binary Classifiers

### Naive Bayes -  Label Encoded

In [15]:
# Binary Naive Bayes classifiers on label-encoded data (the recorded output
# shows one saved model per program).
model_name = 'nb_le_f1_' + experiment_model_name
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]  # only the first element of the helper's result is used here

# One estimator instance, bound to both names used by this notebook.
mnb = MultinomialNB()
model_type = mnb

binary_classifier(data, model_name, data_balance_multiple, mnb)

nb_le_f1_d0_b0_c4_v0_mech created..
nb_le_f1_d0_b0_c4_v0_bmed created..
nb_le_f1_d0_b0_c4_v0_swe created..
nb_le_f1_d0_b0_c4_v0_ce created..
nb_le_f1_d0_b0_c4_v0_tron created..
nb_le_f1_d0_b0_c4_v0_cive created..
nb_le_f1_d0_b0_c4_v0_chem created..
nb_le_f1_d0_b0_c4_v0_syde created..
nb_le_f1_d0_b0_c4_v0_msci created..
nb_le_f1_d0_b0_c4_v0_elec created..
nb_le_f1_d0_b0_c4_v0_nano created..
nb_le_f1_d0_b0_c4_v0_geo created..
nb_le_f1_d0_b0_c4_v0_env created..
nb_le_f1_d0_b0_c4_v0_arch-e created..
nb_le_f1_d0_b0_c4_v0_arch created..


### Naive Bayes - One Hot Encoded

In [16]:
# Binary Naive Bayes classifiers on one-hot encoded data (the recorded output
# shows one saved model per program).
model_name = 'nb_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# One estimator instance, bound to both names used by this notebook.
mnb = MultinomialNB()
model_type = mnb

binary_classifier(data, model_name, data_balance_multiple, mnb)

nb_ohe_f1_d0_b0_c4_v0_mech created..
nb_ohe_f1_d0_b0_c4_v0_bmed created..
nb_ohe_f1_d0_b0_c4_v0_swe created..
nb_ohe_f1_d0_b0_c4_v0_ce created..
nb_ohe_f1_d0_b0_c4_v0_tron created..
nb_ohe_f1_d0_b0_c4_v0_cive created..
nb_ohe_f1_d0_b0_c4_v0_chem created..
nb_ohe_f1_d0_b0_c4_v0_syde created..
nb_ohe_f1_d0_b0_c4_v0_msci created..
nb_ohe_f1_d0_b0_c4_v0_elec created..
nb_ohe_f1_d0_b0_c4_v0_nano created..
nb_ohe_f1_d0_b0_c4_v0_geo created..
nb_ohe_f1_d0_b0_c4_v0_env created..
nb_ohe_f1_d0_b0_c4_v0_arch-e created..
nb_ohe_f1_d0_b0_c4_v0_arch created..


### Logistic Regression - Label Encoded

In [17]:
# Binary Logistic Regression classifiers on label-encoded data (the recorded
# output shows one saved model per program).
model_name = 'lrr_le_f1_' + experiment_model_name
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]  # only the first element of the helper's result is used here

# NOTE(review): each binary model presumably sees only two classes — confirm
# that multi_class='multinomial' is intended here.
LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)

binary_classifier(data, model_name, data_balance_multiple, LRR)

lrr_le_f1_d0_b0_c4_v0_mech created..
lrr_le_f1_d0_b0_c4_v0_bmed created..
lrr_le_f1_d0_b0_c4_v0_swe created..
lrr_le_f1_d0_b0_c4_v0_ce created..
lrr_le_f1_d0_b0_c4_v0_tron created..
lrr_le_f1_d0_b0_c4_v0_cive created..
lrr_le_f1_d0_b0_c4_v0_chem created..
lrr_le_f1_d0_b0_c4_v0_syde created..
lrr_le_f1_d0_b0_c4_v0_msci created..
lrr_le_f1_d0_b0_c4_v0_elec created..
lrr_le_f1_d0_b0_c4_v0_nano created..
lrr_le_f1_d0_b0_c4_v0_geo created..
lrr_le_f1_d0_b0_c4_v0_env created..
lrr_le_f1_d0_b0_c4_v0_arch-e created..
lrr_le_f1_d0_b0_c4_v0_arch created..


### Logistic Regression - One Hot Encoded

In [18]:
# Binary Logistic Regression classifiers on one-hot encoded data (the recorded
# output shows one saved model per program).
model_name = 'lrr_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# NOTE(review): each binary model presumably sees only two classes — confirm
# that multi_class='multinomial' is intended here.
LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)

binary_classifier(data, model_name, data_balance_multiple, LRR)

lrr_ohe_f1_d0_b0_c4_v0_mech created..
lrr_ohe_f1_d0_b0_c4_v0_bmed created..
lrr_ohe_f1_d0_b0_c4_v0_swe created..
lrr_ohe_f1_d0_b0_c4_v0_ce created..
lrr_ohe_f1_d0_b0_c4_v0_tron created..
lrr_ohe_f1_d0_b0_c4_v0_cive created..
lrr_ohe_f1_d0_b0_c4_v0_chem created..
lrr_ohe_f1_d0_b0_c4_v0_syde created..
lrr_ohe_f1_d0_b0_c4_v0_msci created..
lrr_ohe_f1_d0_b0_c4_v0_elec created..
lrr_ohe_f1_d0_b0_c4_v0_nano created..
lrr_ohe_f1_d0_b0_c4_v0_geo created..
lrr_ohe_f1_d0_b0_c4_v0_env created..
lrr_ohe_f1_d0_b0_c4_v0_arch-e created..
lrr_ohe_f1_d0_b0_c4_v0_arch created..


### Support Vector Machine - Label Encoded

In [19]:
# Binary Support Vector Machine classifiers on label-encoded data (the recorded
# output shows one saved model per program).
model_name = 'svm_le_f1_' + experiment_model_name
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]  # only the first element of the helper's result is used here

# probability=True makes SVC fit an internal cross-validated probability
# calibration that shuffles the data; a fixed random_state (0, matching the
# Logistic Regression cells) keeps the saved models reproducible across runs.
SVM = svm.SVC(probability=True, random_state=0)

binary_classifier(data, model_name, data_balance_multiple, SVM)



svm_le_f1_d0_b0_c4_v0_mech created..
svm_le_f1_d0_b0_c4_v0_bmed created..
svm_le_f1_d0_b0_c4_v0_swe created..
svm_le_f1_d0_b0_c4_v0_ce created..
svm_le_f1_d0_b0_c4_v0_tron created..
svm_le_f1_d0_b0_c4_v0_cive created..
svm_le_f1_d0_b0_c4_v0_chem created..
svm_le_f1_d0_b0_c4_v0_syde created..
svm_le_f1_d0_b0_c4_v0_msci created..
svm_le_f1_d0_b0_c4_v0_elec created..
svm_le_f1_d0_b0_c4_v0_nano created..
svm_le_f1_d0_b0_c4_v0_geo created..
svm_le_f1_d0_b0_c4_v0_env created..
svm_le_f1_d0_b0_c4_v0_arch-e created..
svm_le_f1_d0_b0_c4_v0_arch created..




### Support Vector Machine - One Hot Encoded

In [20]:
# Binary Support Vector Machine classifiers on one-hot encoded data (the
# recorded output shows one saved model per program).
model_name = 'svm_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# probability=True makes SVC fit an internal cross-validated probability
# calibration that shuffles the data; a fixed random_state (0, matching the
# Logistic Regression cells) keeps the saved models reproducible across runs.
SVM = svm.SVC(probability=True, random_state=0)

binary_classifier(data, model_name, data_balance_multiple, SVM)



svm_ohe_f1_d0_b0_c4_v0_mech created..
svm_ohe_f1_d0_b0_c4_v0_bmed created..
svm_ohe_f1_d0_b0_c4_v0_swe created..
svm_ohe_f1_d0_b0_c4_v0_ce created..
svm_ohe_f1_d0_b0_c4_v0_tron created..
svm_ohe_f1_d0_b0_c4_v0_cive created..
svm_ohe_f1_d0_b0_c4_v0_chem created..




svm_ohe_f1_d0_b0_c4_v0_syde created..
svm_ohe_f1_d0_b0_c4_v0_msci created..
svm_ohe_f1_d0_b0_c4_v0_elec created..
svm_ohe_f1_d0_b0_c4_v0_nano created..
svm_ohe_f1_d0_b0_c4_v0_geo created..
svm_ohe_f1_d0_b0_c4_v0_env created..
svm_ohe_f1_d0_b0_c4_v0_arch-e created..
svm_ohe_f1_d0_b0_c4_v0_arch created..




### Decision Tree -  Label Encoded

In [21]:
# Binary Decision Tree classifiers on label-encoded data (the recorded output
# shows one saved model per program).
model_name = 'tree_le_f1_' + experiment_model_name
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]  # only the first element of the helper's result is used here

# DecisionTreeClassifier permutes the features at every split, so equally good
# splits are chosen at random unless random_state is fixed. Seed it (0,
# matching the Logistic Regression cells) so rebuilt trees are identical.
ent = tree.DecisionTreeClassifier(random_state=0)

binary_classifier(data, model_name, data_balance_multiple, ent)

tree_le_f1_d0_b0_c4_v0_mech created..
tree_le_f1_d0_b0_c4_v0_bmed created..
tree_le_f1_d0_b0_c4_v0_swe created..
tree_le_f1_d0_b0_c4_v0_ce created..
tree_le_f1_d0_b0_c4_v0_tron created..
tree_le_f1_d0_b0_c4_v0_cive created..
tree_le_f1_d0_b0_c4_v0_chem created..
tree_le_f1_d0_b0_c4_v0_syde created..
tree_le_f1_d0_b0_c4_v0_msci created..
tree_le_f1_d0_b0_c4_v0_elec created..
tree_le_f1_d0_b0_c4_v0_nano created..
tree_le_f1_d0_b0_c4_v0_geo created..
tree_le_f1_d0_b0_c4_v0_env created..
tree_le_f1_d0_b0_c4_v0_arch-e created..
tree_le_f1_d0_b0_c4_v0_arch created..


### Decision Tree - One Hot Encoded

In [22]:
# Binary Decision Tree classifiers on one-hot encoded data (the recorded output
# shows one saved model per program).
model_name = 'tree_ohe_f1_' + experiment_model_name
data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# DecisionTreeClassifier permutes the features at every split, so equally good
# splits are chosen at random unless random_state is fixed. Seed it (0,
# matching the Logistic Regression cells) so rebuilt trees are identical.
ent = tree.DecisionTreeClassifier(random_state=0)

binary_classifier(data, model_name, data_balance_multiple, ent)

tree_ohe_f1_d0_b0_c4_v0_mech created..
tree_ohe_f1_d0_b0_c4_v0_bmed created..
tree_ohe_f1_d0_b0_c4_v0_swe created..
tree_ohe_f1_d0_b0_c4_v0_ce created..
tree_ohe_f1_d0_b0_c4_v0_tron created..
tree_ohe_f1_d0_b0_c4_v0_cive created..
tree_ohe_f1_d0_b0_c4_v0_chem created..
tree_ohe_f1_d0_b0_c4_v0_syde created..
tree_ohe_f1_d0_b0_c4_v0_msci created..
tree_ohe_f1_d0_b0_c4_v0_elec created..
tree_ohe_f1_d0_b0_c4_v0_nano created..
tree_ohe_f1_d0_b0_c4_v0_geo created..
tree_ohe_f1_d0_b0_c4_v0_env created..
tree_ohe_f1_d0_b0_c4_v0_arch-e created..
tree_ohe_f1_d0_b0_c4_v0_arch created..


# 