In [47]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import pandas as pd
import pickle
from sklearn import metrics, tree, svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,cross_val_score,train_test_split,LeaveOneOut
from sklearn.naive_bayes import MultinomialNB
from statistics import mean

from data_load import *
from dictionaries import *

# Set the model experiment names and all of the used suffixes

keep the suffixes and the experiment model names updated to view all scores

In [34]:
experiment_model_names = [
#                         'd0_b0_c0_v0',
                        'd0_b0_c1_v0'
                        ]

# It takes about 10 mins to score each model?

In [17]:
column_list = [
                'problem_type', 
                'creative', 
                'outdoors', 
                'career',
                'group_work', 
                'liked_courses', 
                'disliked_courses', 
                'programming',
                'join_clubs', 
                'not_clubs', 
                'liked_projects',
                'disliked_projects',
                'tv_shows', 
                'alternate_degree', 
                'expensive_equipment', 
                'drawing',
                'essay', 
                'architecture', 
                'automotive', 
                'business', 
                'construction',
                'health',
                'environment', 
                'manufacturing', 
                'technology',
                'program'
                ]

In [18]:
multi_class_suffixes = [
                        'nb_le_f0_',
                        'nb_ohe_f0_',
                        'lrr_le_f0_',
                        'lrr_ohe_f0_',
                        'svm_le_f0_',
                        'svm_ohe_f0_'
                       ]

binary_class_suffixes = [
                        'nb_le_f1_',
                        'nb_ohe_f1_',
                        'lrr_le_f1_',
                        'lrr_ohe_f1_',
                        'svm_le_f1_',
                        'svm_ohe_f1_',
                        'tree_le_f1_',
                        'tree_ohe_f1_'
]

In [33]:
# This block does all of the scoring, last one to update, only cell that needs running
test_data_t7 = get_label_encoded_data('data/testing_data_t7.csv',model_name='t7',column_list=column_list,drop_not_happy='H',data_balance=False)[0]

for experiment in experiment_model_names:
    scoring_dictionary = {}
    
    for mclass in multi_class_suffixes:
        temp_model_name = mclass+experiment
        mclass_t3 = get_mclass_t3(temp_model_name)
        mclass_RR = get_mclass_rr(temp_model_name)
        mclass_accuracy = get_mclass_accuracy(temp_model_name)
        mclass_loo = get_mclass_loo(temp_model_name)
        mclass_5x = get_mclass_5x(temp_model_name)
        scoring_dictionary[mclass+experiment] = {'t3':mclass_t3,'RR':mclass_RR,'accuracy':mclass_accuracy,'loo':mclass_loo,'5x':mclass_5x}
   
    for bclass in binary_class_suffixes:
        temp_model_name = bclass+experiment
        bclass_t3 = get_bclass_t3(temp_model_name)
        bclass_RR = get_bclass_rr(temp_model_name)
        bclass_accuracy = get_bclass_accuracy(temp_model_name)
        bclass_loo = get_bclass_loo(temp_model_name)
        bclass_5x = get_bclass_5x(temp_model_name)
        scoring_dictionary[bclass+experiment] = {'t3':bclass_t3,'RR':bclass_RR,'accuracy':bclass_accuracy,'loo':bclass_loo,'5x':bclass_5x}
        
    save_scores(scoring_dictionary,experiment)    

# Functions to be exported later

In [1]:
def sort_probability_dict(p_df):
    ordered_probabilties = sorted(p_df.values(),reverse=True)
    ordered_programs = sorted(p_df, key=p_df.get,reverse=True)
    return [p_df, ordered_probabilties, ordered_programs]

In [2]:
def binary_predict_proba(vector,temp_model_name):
    return_probabilities_dict = {}
    for program in list(INDEX_PROGRAM.keys()):
        # Loading data used to build the model
        model_name = temp_model_name+'_'+program
        model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
        test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

        # Converting program labels to their appropriate binary label BIN_CLAS
        temp_dictionary = INV_INDEX_PROGRAM.copy()
        for key in INV_INDEX_PROGRAM.keys():
            if str(key) != str(INDEX_PROGRAM[program]):
                temp_dictionary[key] = -1
            else:
                temp_dictionary[key] = INDEX_PROGRAM[program]
        test_data_t7_temp.program = test_data_t7_temp.program.map(temp_dictionary)

        # Loading model files
        pkl_file = open('exported_model_files/metadata/'+model_name+'_cat', 'rb')
        index_dict = pickle.load(pkl_file)
        new_vector = np.zeros(len(index_dict))

        pkl_file = open('exported_model_files/models/'+model_name+'.pkl', 'rb')
        model = pickle.load(pkl_file)

        return_probabilities_dict[program] = model.predict_proba([vector])[0][1]

    return (return_probabilities_dict)

In [29]:
def save_scores(scoring_dictionary,experiment_model_name):
    df = pd.DataFrame(scoring_dictionary) 
    df = df.T
    df.to_csv("exported_model_files/scores/"+experiment_model_name+".csv", header=True)

In [5]:
def get_mclass_accuracy(temp_model_name):
    model_name = temp_model_name
    model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
    test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

    # Loading model files
    pkl_file = open('exported_model_files/metadata/'+model_name+'_cat', 'rb')
    index_dict = pickle.load(pkl_file)
    new_vector = np.zeros(len(index_dict))

    pkl_file = open('exported_model_files/models/'+model_name+'.pkl', 'rb')
    model = pickle.load(pkl_file)

    # Getting average accuracy score
    test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
    test_actual = np.array(test_data_t7_temp["program"])
    test_pred = []

    for i in range(len(test_array)):
        test_pred.append(model.predict([test_array[i]]))

    accuracy = metrics.accuracy_score(test_pred,test_actual)
    return accuracy

In [54]:
model_name = 'lrr_le_f0_d0_b0_c0_v0'
model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

# Loading model files
pkl_file = open('exported_model_files/metadata/'+model_name+'_cat', 'rb')
index_dict = pickle.load(pkl_file)
new_vector = np.zeros(len(index_dict))

pkl_file = open('exported_model_files/models/'+model_name+'.pkl', 'rb')
model = pickle.load(pkl_file)

# Getting average accuracy score
test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
test_actual = np.array(test_data_t7_temp["program"])
t3_scores = []
for i in range(len(test_array)):
    prediction = model.predict_proba([test_array[i]])
    probs = sort_probability_dict(retrieve_prediction_labels(model,prediction))[2][:3]
    n_probs = []
    for prob in probs:
        n_probs.append(INDEX_PROGRAM[prob])
    try:
        t3 = (1/n_probs.index(test_actual[i]))
    except:
        t3 = 0
    t3_scores.append(t3)
    
return mean(t3_scores)

0.24047619047619048
[0, 0.5, 1.0, 0, 0.5, 0, 0, 1.0, 0, 0, 0.5, 0, 1.0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0.5, 0, 0, 0, 0, 0.5, 0, 0.5, 0.5, 0, 0, 0, 1.0, 1.0, 0, 1.0, 0, 1.0, 0, 1.0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 1.0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 1.0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0.5, 0, 1.0, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 0, 0, 0.5, 0, 0.5, 0, 0.5, 1.0, 0.5, 1.0, 0, 0, 0, 0.5, 1.0, 0, 1.0, 0.5, 0, 0, 1.0, 1.0, 0, 1.0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0.5, 1.0, 1.0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0.5, 0.5, 0.5, 0.5, 0, 1.0, 0, 0, 0, 0, 0.5, 0, 0.5, 0, 0, 0]


In [6]:
def get_mclass_t3(temp_model_name):
    model_name = temp_model_name
    model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
    test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

    # Loading model files
    pkl_file = open('exported_model_files/metadata/'+model_name+'_cat', 'rb')
    index_dict = pickle.load(pkl_file)
    new_vector = np.zeros(len(index_dict))

    pkl_file = open('exported_model_files/models/'+model_name+'.pkl', 'rb')
    model = pickle.load(pkl_file)

    # Getting average accuracy score
    test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
    test_actual = np.array(test_data_t7_temp["program"])
    t3_scores = []
    for i in range(len(test_array)):
        prediction = model.predict_proba([test_array[i]])
        probs = sort_probability_dict(retrieve_prediction_labels(model,prediction))[2][:3]
        n_probs = []
        for prob in probs:
            n_probs.append(INDEX_PROGRAM[prob])
        try:
            t3 = (1/n_probs.index(test_actual[i]))
        except:
            t3 = 0
        t3_scores.append(t3)

    return mean(t3_scores)

In [7]:
def get_mclass_rr(temp_model_name):
    model_name = temp_model_name
    model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
    test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

    # Loading model files
    pkl_file = open('exported_model_files/metadata/'+model_name+'_cat', 'rb')
    index_dict = pickle.load(pkl_file)
    new_vector = np.zeros(len(index_dict))

    pkl_file = open('exported_model_files/models/'+model_name+'.pkl', 'rb')
    model = pickle.load(pkl_file)

    # Getting average accuracy score
    test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
    test_actual = np.array(test_data_t7_temp["program"])
    rr_scores = []
    for i in range(len(test_array)):
        prediction = model.predict_proba([test_array[i]])
        probs = sort_probability_dict(retrieve_prediction_labels(model,prediction))[2]
        n_probs = []
        for prob in probs:
            n_probs.append(INDEX_PROGRAM[prob])
        try:
            rr = (1/n_probs.index(test_actual[i]))
        except:
            rr = 0
        rr_scores.append(rr)

    return mean(rr_scores)

In [8]:
def get_mclass_loo(temp_model_name):
    mean_loo = 0
    return mean_loo

In [9]:
def get_mclass_5x(temp_model_name):
    mean_5x = 0
    return mean_5x

In [55]:
def get_bclass_accuracy(temp_model_name):
    model_name = temp_model_name

    model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
    test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

    # Getting average accuracy score
    test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
    test_actual = np.array(test_data_t7_temp["program"])
    test_pred = []
    for i in range(len(test_array)):
        predicted = INDEX_PROGRAM[sort_probability_dict(binary_predict_proba(test_array[i],model_name))[2][0]]
        test_pred.append(predicted)

    accuracy = metrics.accuracy_score(test_pred,test_actual)

    return accuracy

In [None]:
def get_bclass_t3(temp_model_name):
    model_name = temp_model_name

    model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
    test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

    # Getting average accuracy score
    test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
    test_actual = np.array(test_data_t7_temp["program"])
    test_pred = []

    for i in range(len(test_array)):
            probs = sort_probability_dict(binary_predict_proba(test_array[i],model_name))[2][:3]
            n_probs = []
            for prob in probs:
                n_probs.append(INDEX_PROGRAM[prob])
            try:
                t3 = (1/n_probs.index(test_actual[i]))
            except:
                t3 = 0
            t3_scores.append(t3)

    return mean(t3_scores)

In [11]:
def get_bclass_rr(temp_model_name):
    model_name = temp_model_name

    model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
    test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]

    # Getting average accuracy score
    test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
    test_actual = np.array(test_data_t7_temp["program"])
    test_pred = []

    for i in range(len(test_array)):
            probs = sort_probability_dict(binary_predict_proba(test_array[i],model_name))[2]
            n_probs = []
            for prob in probs:
                n_probs.append(INDEX_PROGRAM[prob])
            try:
                rr = (1/n_probs.index(test_actual[i]))
            except:
                rr = 0
            rr_scores.append(rr)

    return mean(rr_scores)

In [13]:
def get_bclass_loo(temp_model_name):
    mean_loo = 0
    return mean_loo

In [14]:
def get_bclass_5x(temp_model_name):
    mean_5x = 0
    return mean_5x

# b1_accuracy_score

In [21]:
'''
inputs:
- test_data
- experiment_model_name
'''
model_name = 'nb_le_f1_d0_b0_c1_v0_nano'
model_scores = {}
model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]
test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
test_actual = np.array(test_data_t7_temp["program"])


In [22]:
pkl_file = open('exported_model_files/metadata/'+model_name+'_cat', 'rb')
index_dict = pickle.load(pkl_file)
new_vector = np.zeros(len(index_dict))

pkl_file = open('exported_model_files/models/'+model_name+'.pkl', 'rb')
model = pickle.load(pkl_file)

In [23]:
# for i in range(len(test_array)):
#     print(model.predict([test_array[i]]),test_actual[i])

print(model.score(test_array,test_actual))

0.0


### Delete later

In [15]:
# This info should be under score_model_family()
experiment_model_name = 'd0_b0_c1_v0'
test_data_t7 = get_label_encoded_data('data/testing_data_t7.csv',model_name='t7',column_list=column_list,drop_not_happy='H',data_balance=False)[0]

for experiment in experiment_model_names:
    for mclass in multi_class_suffixes:
        print(mclass+experiment)
        x = 1
    experiment = 'd0_b0_c1_v0'
    for bclass in binary_class_suffixes:
        for program in list(INDEX_PROGRAM.keys()):
            # Loading data used to build the model
            model_name = (bclass+experiment+'_'+program)
            model_data = pd.read_csv('exported_model_files/dataframes/'+model_name+'.csv',dtype=str)
            test_data_t7_temp = test_data_t7.copy()[list(model_data.columns)]
            
            # Converting program labels to their appropriate binary label
            temp_dictionary = INV_INDEX_PROGRAM.copy()
            for key in INV_INDEX_PROGRAM.keys():
                if str(key) != str(INDEX_PROGRAM[program]):
                    temp_dictionary[key] = -1
                else:
                    temp_dictionary[key] = INDEX_PROGRAM[program]
            
            # Loading model files
            pkl_file = open('exported_model_files/metadata/'+model_name+'_cat', 'rb')
            index_dict = pickle.load(pkl_file)
            new_vector = np.zeros(len(index_dict))

            pkl_file = open('exported_model_files/models/'+model_name+'.pkl', 'rb')
            model = pickle.load(pkl_file)
            
            # Getting average accuracy score
            test_data_t7_temp.program = test_data_t7_temp.program.map(temp_dictionary)
            test_array = np.array(test_data_t7_temp.drop(axis=1,columns=["program"]))
            test_actual = np.array(test_data_t7_temp["program"])
            test_pred = []
            
            for i in range(len(test_array)):
                test_pred.append(model.predict([test_array[i]]))
            
            print(metrics.accuracy_score(test_pred,test_actual))
    break

nb_le_f0_d0_b0_c1_v0
nb_ohe_f0_d0_b0_c1_v0
lrr_le_f0_d0_b0_c1_v0
lrr_ohe_f0_d0_b0_c1_v0
svm_le_f0_d0_b0_c1_v0
svm_ohe_f0_d0_b0_c1_v0
nb_le_f1_d0_b0_c1_v0_mech
0.8857142857142857
nb_le_f1_d0_b0_c1_v0_bmed
0.8285714285714286
nb_le_f1_d0_b0_c1_v0_swe
0.8285714285714286
nb_le_f1_d0_b0_c1_v0_tron
0.7857142857142857
nb_le_f1_d0_b0_c1_v0_cive
0.819047619047619
nb_le_f1_d0_b0_c1_v0_chem
0.861904761904762
nb_le_f1_d0_b0_c1_v0_syde
0.7857142857142857
nb_le_f1_d0_b0_c1_v0_msci
0.7904761904761904
nb_le_f1_d0_b0_c1_v0_ce
0.8857142857142857
nb_le_f1_d0_b0_c1_v0_elec
0.8333333333333334
nb_le_f1_d0_b0_c1_v0_nano
0.6714285714285714
nb_le_f1_d0_b0_c1_v0_geo
0.8285714285714286
nb_le_f1_d0_b0_c1_v0_env
0.8523809523809524
nb_le_f1_d0_b0_c1_v0_arch-e
0.8047619047619048
nb_le_f1_d0_b0_c1_v0_arch
0.8571428571428571
nb_ohe_f1_d0_b0_c1_v0_mech
0.8857142857142857
nb_ohe_f1_d0_b0_c1_v0_bmed
0.8285714285714286
nb_ohe_f1_d0_b0_c1_v0_swe
0.8285714285714286
nb_ohe_f1_d0_b0_c1_v0_tron
0.7523809523809524
nb_ohe_f1_d0_b

In [None]:
for program in list(INDEX_PROGRAM.keys()):