In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import pandas as pd
import pickle
from sklearn import metrics, tree, svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,cross_val_score,train_test_split,LeaveOneOut
from sklearn.naive_bayes import MultinomialNB

from data_load import *
from dictionaries import *

# Define Model Parameters

### Define Directory

In [None]:
# Relative path to the d1 survey dataset (CSV).
d1 = "data/d1.csv"

### Define Columns

In [None]:
# nc0: the full set of survey columns, with the 'program' target last.
nc0 = [
    'problem_type', 'creative', 'outdoors', 'career',
    'group_work', 'liked_courses', 'disliked_courses', 'programming',
    'join_clubs', 'not_clubs', 'liked_projects', 'disliked_projects',
    'tv_shows', 'alternate_degree', 'expensive_equipment', 'drawing',
    'essay', 'architecture', 'automotive', 'business',
    'construction', 'health', 'environment', 'manufacturing',
    'technology', 'new_programming', 'program',
]

# nc36: nc0 without four of its columns; the relative order of the
# remaining columns is the same as in nc0.
nc36 = [
    column
    for column in nc0
    if column not in ('problem_type', 'programming', 'tv_shows', 'expensive_equipment')
]

### Define Data Balance Dictionary

In [None]:
# b0: only relevant when we want to use untreated data for code d0.
b0 = False

# b1: the same value (100) for every program code.
b1 = dict.fromkeys(
    (
        'mech', 'bmed', 'swe', 'tron', 'cive',
        'chem', 'syde', 'msci', 'ce', 'elec',
        'nano', 'geo', 'env', 'arch-e', 'arch',
    ),
    100,
)

# b4: identical to b1 except that 'swe' is set to 30.
b4 = {**b1, 'swe': 30}

v0 = 1

 # <font color='red'> Set Up Parameters for the Current Experiment</font> 

In [None]:
# Identifier suffix shared by every model trained in this experiment.
experiment_model_name = 'd1_b0_nc36_v0'

# Select the dataset, balancing scheme and column set for this run.
directory = d1
data_balance = b0
column_list = nc36

# Ratio of other programs to program in binary classifier.
# 2 means double of other programs, 0.5 means half.
data_balance_multiple = v0

# A single all-zero row with one entry per column of column_list minus one,
# already shaped (1, n) so it can be passed straight to model.predict.
test_vector = np.array([[0] * (len(column_list) - 1)])

### Define Encodings

In [None]:
# Columns to one-hot encode: the candidates listed below, keeping only those
# that are present in the active column_list (candidate order is preserved).
ohe = [
    column
    for column in (
        'problem_type',
        'creative',
        'outdoors',
        'career',
        'group_work',
        'liked_courses',
        'disliked_courses',
        'programming',
        'join_clubs',
        'not_clubs',
        'liked_projects',
        'disliked_projects',
        'tv_shows',
        'alternate_degree',
        'expensive_equipment',
        'drawing',
        'new_programming',
        'essay',
    )
    if column in column_list
]


## Multiclass Classifiers

### Naive Bayes - Label Encoded

In [None]:
# Multinomial Naive Bayes fitted on the label-encoded data.
model_name = 'nb_le_f0_' + experiment_model_name

# Only element 0 of the loader's return value is used as the training frame.
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]

# Separate the feature columns from the 'program' target and convert both
# to numpy arrays for scikit-learn.
x_df = data.drop(columns=['program'])
y_df = data['program']
X, Y = np.array(x_df), np.array(y_df)

mnb = MultinomialNB()
model = mnb.fit(X, Y)

# Feature name -> column position; handed to save_model with the model.
cat = {name: position for position, name in enumerate(x_df.columns)}

# Quick check: decode the class predicted for the all-zero test vector.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

save_model(data, model, cat, model_name)
test_model(model_name, test_vector)

### Naive Bayes - One Hot Encoded

In [None]:
# Multinomial Naive Bayes fitted on the merged (one-hot) encoded data.
model_name = 'nb_ohe_f0_' + experiment_model_name

data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# Separate the feature columns from the 'program' target and convert both
# to numpy arrays for scikit-learn.
x_df = data.drop(columns=['program'])
y_df = data['program']
X, Y = np.array(x_df), np.array(y_df)

mnb = MultinomialNB()
model = mnb.fit(X, Y)

# Feature name -> column position; handed to save_model with the model.
cat = {name: position for position, name in enumerate(x_df.columns)}

save_model(data, model, cat, model_name)

### Logistic Regression - Label Encoded

In [None]:
# Multinomial logistic regression fitted on the label-encoded data.
model_name = 'lrr_le_f0_' + experiment_model_name

# Only element 0 of the loader's return value is used as the training frame.
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]

# Separate the feature columns from the 'program' target and convert both
# to numpy arrays for scikit-learn.
x_df = data.drop(columns=['program'])
y_df = data['program']
X, Y = np.array(x_df), np.array(y_df)

LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
model = LRR.fit(X, Y)

# Feature name -> column position; handed to save_model with the model.
cat = {name: position for position, name in enumerate(x_df.columns)}

# Quick check: decode the class predicted for the all-zero test vector.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

save_model(data, model, cat, model_name)
test_model(model_name, test_vector)

### Logistic Regression - One Hot Encoded

In [None]:
# Multinomial logistic regression fitted on the merged (one-hot) encoded data.
model_name = 'lrr_ohe_f0_' + experiment_model_name

data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# Separate the feature columns from the 'program' target and convert both
# to numpy arrays for scikit-learn.
x_df = data.drop(columns=['program'])
y_df = data['program']
X, Y = np.array(x_df), np.array(y_df)

LRR = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
model = LRR.fit(X, Y)

# Feature name -> column position; handed to save_model with the model.
cat = {name: position for position, name in enumerate(x_df.columns)}

save_model(data, model, cat, model_name)

### Support Vector Machine - Label Encoded


In [None]:
# Support vector classifier fitted on the label-encoded data.
model_name = 'svm_le_f0_' + experiment_model_name

# Only element 0 of the loader's return value is used as the training frame.
data = get_label_encoded_data(
    directory, model_name, column_list, 'H', data_balance=data_balance
)[0]

# Separate the feature columns from the 'program' target and convert both
# to numpy arrays for scikit-learn.
x_df = data.drop(columns=['program'])
y_df = data['program']
X, Y = np.array(x_df), np.array(y_df)

# probability=True makes SVC calibrate probabilities with an internal,
# randomly shuffled cross-validation. Fixing random_state makes the saved
# model's probability estimates reproducible across runs, matching the
# seed already used by the logistic regression models.
SVM = svm.SVC(probability=True, random_state=0)
model = SVM.fit(X, Y)

# Feature name -> column position; handed to save_model with the model.
cat = {name: position for position, name in enumerate(x_df.columns)}

# Quick check: decode the class predicted for the all-zero test vector.
print(INV_INDEX_PROGRAM[model.predict(test_vector)[0]])

save_model(data, model, cat, model_name)
test_model(model_name, test_vector)

### Support Vector Machine - One Hot Encoded


In [None]:
# Support vector classifier fitted on the merged (one-hot) encoded data.
model_name = 'svm_ohe_f0_' + experiment_model_name

data = get_merged_encoded_data(
    directory,
    model_name,
    one_hot_encode=ohe,
    column_list=column_list,
    drop_not_happy='H',
    data_balance=data_balance,
)

# Separate the feature columns from the 'program' target and convert both
# to numpy arrays for scikit-learn.
x_df = data.drop(columns=['program'])
y_df = data['program']
X, Y = np.array(x_df), np.array(y_df)

# probability=True makes SVC calibrate probabilities with an internal,
# randomly shuffled cross-validation. Fixing random_state makes the saved
# model's probability estimates reproducible across runs, matching the
# seed already used by the logistic regression models.
SVM = svm.SVC(probability=True, random_state=0)
model = SVM.fit(X, Y)

# Feature name -> column position; handed to save_model with the model.
cat = {name: position for position, name in enumerate(x_df.columns)}

save_model(data, model, cat, model_name)