# Project McNulty - Classifying Stack Overflow Data

Paul Lim

05/17/2017

## Libraries and Default Settings

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import create_engine


% matplotlib inline

sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

cnx = create_engine('postgresql://plim0793:metis@54.215.141.213:5432/plim0793')

## Functions

In [1]:
def get_scores(model_list, X_train, X_test, y_train, y_test, binary=True):
    """Fit every model on the training split and score it on the test split.

    For each model the accuracy, precision, recall and F1 of its test-set
    predictions are computed, printed, and collected.

    Parameters
    ----------
    model_list : iterable
        Estimator objects exposing ``fit(X, y)`` and ``predict(X)``.
        ``fit`` is called on each one.
    X_train, y_train
        Features and labels passed to ``fit``.
    X_test, y_test
        Features passed to ``predict`` and the labels the predictions are
        compared against.
    binary : bool, default True
        When True, precision/recall/F1 are computed with the metric
        functions' default averaging. When False they are computed with
        ``average='macro'``.

    Returns
    -------
    dict
        Keys ``'scores'`` (accuracy), ``'models'`` (the objects returned by
        ``fit``), ``'precision'``, ``'recall'`` and ``'f1'``. Each value is
        a list with one entry per model, in ``model_list`` order.
    """
    # Only the averaging mode differs between the two cases, so express it
    # once as keyword arguments shared by all three metric calls.
    metric_kwargs = {} if binary else {'average': 'macro'}

    list_dict = {
        'scores': [],
        'models': [],
        'precision': [],
        'recall': [],
        'f1': [],
    }

    for model in model_list:
        # Keep whatever fit() returns, as that is the object used for
        # prediction and stored in the result.
        model = model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, **metric_kwargs)
        rec = recall_score(y_test, y_pred, **metric_kwargs)
        # f1_score must be imported from sklearn.metrics alongside the
        # other metric functions.
        f1 = f1_score(y_test, y_pred, **metric_kwargs)

        list_dict['scores'].append(acc)
        list_dict['models'].append(model)
        list_dict['precision'].append(pre)
        list_dict['recall'].append(rec)
        list_dict['f1'].append(f1)

        print('Model: ', model)
        print('Score: ', acc)
        print('Precision: ', pre)
        print('Recall: ', rec)
        print('F1: ', f1)
        print('\n')

    return list_dict

def get_cross_val_score(model_list, X, y, binary=True):
    """Cross-validate every model and report its mean metrics.

    For each model, ``cross_val_score`` is run once per metric (accuracy,
    precision, recall, F1) with its default fold settings, and the mean of
    the per-fold scores is printed and collected.

    Parameters
    ----------
    model_list : iterable
        Estimator objects accepted by ``cross_val_score``.
    X, y
        Features and labels passed to ``cross_val_score``.
    binary : bool, default True
        When True, the ``'precision'``, ``'recall'`` and ``'f1'`` scorers
        are used (the original behaviour). When False, their ``'_macro'``
        variants are used so multiclass targets can be scored, mirroring
        the ``binary`` switch of ``get_scores``.

    Returns
    -------
    dict
        Keys ``'mean_acc'``, ``'mean_pre'``, ``'mean_rec'`` and
        ``'mean_f1'``. Each value is a list with one mean score per model,
        in ``model_list`` order.
    """
    suffix = '' if binary else '_macro'

    # (result key, printed label, scorer name), in reporting order.
    metrics = [
        ('mean_acc', 'Accuracy: ', 'accuracy'),
        ('mean_pre', 'Precision: ', 'precision' + suffix),
        ('mean_rec', 'Recall: ', 'recall' + suffix),
        ('mean_f1', 'F1: ', 'f1' + suffix),
    ]

    list_dict = {key: [] for key, _, _ in metrics}

    for model in model_list:
        # Compute every metric before recording any of them, so a failing
        # scorer cannot leave the result lists with unequal lengths.
        means = [
            np.mean(cross_val_score(model, X, y, scoring=scorer))
            for _, _, scorer in metrics
        ]

        for (key, _, _), mean_value in zip(metrics, means):
            list_dict[key].append(mean_value)

        print('Model: ', model)
        for (_, label, _), mean_value in zip(metrics, means):
            print(label, mean_value)
        print('\n')

    return list_dict