# Individual Coding Exercise (ICE) 7

In this ICE 7, diagnostic metrics are covered. 1) identify the correct model diagnostic metric(s) for performance 2) implement a model diagnostic metric for a model that my team had built for Analysis Challenge Assignment 2.

In [1]:
#importing necessary packages

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

In [2]:
#Uploading the validation and training datasets

val = pd.read_csv("aca2_dataset_validation.csv")
train = pd.read_csv("aca2_dataset_training.csv")

In [3]:
#Data Cleaning by turning categorical variables into dummy variables 

df_val = pd.get_dummies(val, columns = ['SCHOOL', 'GRADE', 'Activity', 'ONTASK'])
df_train = pd.get_dummies(train, columns = ['SCHOOL', 'GRADE', 'Activity', 'ONTASK'])

In [4]:
#Turning the separate ONTASK columns into one

df_val = df_val.rename(columns = {'ONTASK_Y': 'ONTASK'})
df_train = df_train.rename(columns = {'ONTASK_Y': 'ONTASK'})

In [5]:
#All data greater than the average is coded as 1 and others are 0

df_val['NUMACTIVITIES_HIGH'] = (df_val['NumACTIVITIES'] > df_val['NumACTIVITIES'].mean()).astype(int)
df_train['NUMACTIVITIES_HIGH'] = (df_train['NumACTIVITIES'] > df_train['NumACTIVITIES'].mean()).astype(int)
df_val['NUMFORMATS_HIGH'] = (df_val['NumFORMATS'] > df_val['NumFORMATS'].mean()).astype(int)
df_train['NUMFORMATS_HIGH'] = (df_train['NumFORMATS'] > df_train['NumFORMATS'].mean()).astype(int)
df_val['OBSV/ACT_HIGH'] = (df_val['Obsv/act'] > df_val['Obsv/act'].mean()).astype(int)
df_train['OBSV/ACT_HIGH'] = (df_train['Obsv/act'] > df_train['Obsv/act'].mean()).astype(int)

In [6]:
df_val = df_val.drop(['UNIQUEID',
                    'Class',
                    'STUDENTID',
                    'ONTASK_N',
                    'totalobs-forsession',
                    'NumACTIVITIES',
                    'TRANSITIONS',
                    'NumFORMATS',
                    'FORMATchanges',
                    'Obsv/act',
                    'Transitions/Durations',
                    'Total Time'], axis=1)
df_train = df_train.drop(['UNIQUEID',
                    'Class',
                    'STUDENTID',
                    'CODER',
                    'OBSNUM',
                    'ONTASK_N',
                    'totalobs-forsession',
                    'NumACTIVITIES',
                    'TRANSITIONS',
                    'NumFORMATS',
                    'FORMATchanges',
                    'Obsv/act',
                    'Transitions/Durations',
                    'Total Time'], axis=1)

In [7]:
#Creating a function to create a decision tree model

y_tr = df_train['ONTASK']
def new_dec_tree (x_cols):
    Xs_tr = df_train[x_cols]
    dec_tree_model = DecisionTreeClassifier()
    dec_tree_model.fit(Xs_tr, y_tr)
    return dec_tree_model

In [8]:
#Function to test logistic regression model

y_va = df_val['ONTASK']
def test_dec_tree(x_cols, model):
    Xs_va = df_va[x_cols]
    pred = model.predict(Xs_va)
    print(confusion_matrix(y_va, pred))
    print(f"Accuracy Score: {accuracy_score(y_va, pred)}")

In [9]:
def dec_tree_tester (x_cols):
    new_model = new_dec_tree(x_cols)
    test_dec_tree(x_cols, new_model)

In [10]:
#finding model with highest accuracy score

x_cols = [
'school_a',
'school_b',
'school_c',
'school_d',
'school_e',
'grade_0',
'grade_1',
'grade_2',
'grade_3',
'grade_4',
'activity_dancing',
'activity_smallgroup',
'activity_testing',
'activity_wholedesks',
'numactivities_high',
'obsv/act_high'
]
dec_tree_tester(x_cols)
[[174 1675]
 [132 3566]]

SyntaxError: invalid syntax (<ipython-input-10-30c1bb9ffd04>, line 22)

In [None]:
#function to create a decision tree model

def new_dec_tree (x_cols):
    Xs_tr = df_tr[x_cols]
    dec_tree_model = DecisionTreeClassifier()
    dec_tree_model.fit(Xs_tr, y_tr)
    return dec_tree_model

In [None]:
#function to test logistic regression model

def test_dec_tree_2(x_cols, model):
    Xs_va = df_va[x_cols]
    pred = model.predict(Xs_va)
    print(confusion_matrix(y_va, pred))
    print(f"Accuracy Score: {accuracy_score(y_va, pred)}")
    print(f"Precision Score: {precision_score(y_va, pred)}")
    print(f"Recall Score: {recall_score(y_va, pred)}")
    print(f"F1 Score: {f1_score(y_va, pred)}")

In [None]:
def dec_tree_tester_2 (x_cols):
    new_model = new_dec_tree(x_cols)
    test_dec_tree_2(x_cols, new_model)

dec_tree_tester_2(x_cols)
[[174 1675]
 [132 3566]]

In [14]:
#create function to calculate the AUC score and the ROC plot

def new_dec_tree (x_cols):
    Xs_tr = df_tr[x_cols]
    dec_tree_model = DecisionTreeClassifier()
    dec_tree_model.fit(Xs_tr, y_tr)
    return dec_tree_model

def auc_roc (x_cols, model):
    Xs_va = df_va[x_cols]
    pred_proba = model.predict_proba(Xs_va)
    pred_proba_positive = pred_proba[:, 1]
    
    #Find AUC score
    auc = roc_auc_score(y_va, pred_proba_positive)
    print(f'AUC Score: {auc}')
    
    #Create ROC curve
    fpr, npr, _ = roc_curve(y_va, pred_proba_positive)
    pyplot.plot(fpr, npr)
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.show()
    
def auc_roc_generator (x_cols):
    new_model = new_dec_tree(x_cols)
    auc_roc(x_cols, new_model)