In [1]:
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.grid_search import ParameterGrid



In [2]:
# import data
df = pd.read_pickle('data/clean_speech_approvals.pkl')
df.set_index('president')

feat = ['approval', 'compound', 'positive', 'negative',
        'neutral', 'year', 'party', 'approve_mean', 'approve_median',
        'approve_min', 'approve_max', 'approve_std', 'disapprove_mean',
        'disapprove_median', 'disapprove_min', 'disapprove_max',
        'disapprove_std', 'num_words', 'num_sentences']

In [3]:
def binary_approval(approve):
    if approve > 50:
        return 1
    else:
        return 0

def to_binary(df, col, val='t'):
    '''
    Converts column to binary based on value
    '''
    df[col] = df[col].apply(lambda x: 1 if x == val else 0)

In [4]:
to_binary(df, 'party', 'D')

df['approval'] = df['approve_mean'].apply(binary_approval)

In [5]:
# create X, y vectors
X = df.filter(feat)
X = X.drop('approval', axis=1)
y = df['approval'].to_frame()

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

# validation split
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5, random_state=3)

# set model parameters
params = {'penalty': ['l1','l2'], 'C': [0.001,0.1,1,10]}

In [6]:
# Evaluation functions
# calculate precision, recall and auc metrics

def plot_roc(name, probs, true, output_type):
    fpr, tpr, thresholds = roc_curve(true, probs)
    roc_auc = auc(fpr, tpr)
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    if (output_type == 'save'):
        plt.savefig(name)
    elif (output_type == 'show'):
        plt.show()
    else:
        plt.show()

def generate_binary_at_k(y_scores, k):
    cutoff_index = int(len(y_scores) * (k / 100.0))
    predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))]
    return predictions_binary

def precision_at_k(y_true, y_scores, k):
    #y_scores_sorted, y_true_sorted = zip(*sorted(zip(y_scores, y_true), reverse=True))
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k)
    #precision = precision[1]  # only interested in precision for label 1
    precision = precision_score(y_true_sorted, preds_at_k)
    return precision

def recall_at_k(y_true, y_scores, k):
    #y_scores_sorted, y_true_sorted = zip(*sorted(zip(y_scores, y_true), reverse=True))
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    #precision, _, _, _ = metrics.precision_recall_fscore_support(y_true, preds_at_k)
    #precision = precision[1]  # only interested in precision for label 1
    recall = recall_score(y_true_sorted, preds_at_k)
    return recall

def f1_at_k(y_true, y_scores, k):
    preds_at_k = generate_binary_at_k(y_scores, k)
    return f1_score(y_true, preds_at_k, average='binary')

def get_feature_importance(clf, model_name):
    clfs = {'RF':'feature_importances',
            'LR': 'coef',
            'SVM': 'coef',
            'DT': 'feature_importances',
            'KNN': None,
            'AB': 'feature_importances',
            'GB': None,
            'linear.SVC': 'coef',
            'ET': 'feature_importances'
            }

    if clfs[model_name] == 'feature_importances':
        return  list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return  list(clf.coef_.tolist())
    else:
        return None

def plot_precision_recall_n(y_true, y_prob, model_name, output_type):
    from sklearn.metrics import precision_recall_curve
    y_score = y_prob
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score>=value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)

    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    ax1.set_ylim([0,1])
    ax1.set_ylim([0,1])
    ax2.set_xlim([0,1])

    name = model_name
    plt.title(name)
    if (output_type == 'save'):
        plt.savefig(name)
    elif (output_type == 'show'):
        plt.show()
    else:
        plt.show()

In [7]:
def clf_loop(X_train, X_test, y_train, y_test, params):
    '''
    Function to loop through hyperparameters in a model
    '''
    results_df = pd.DataFrame(columns=('clf','parameters','auc-roc','accuracy'))
    for p in ParameterGrid(params):
        try:
            clf = LogisticRegression()
            clf.set_params(**p)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            y_pred_probs = list(clf.predict_proba(X_test)[:,1])
            results_df.loc[len(results_df)] = [clf,p,roc_auc_score(y_test,y_pred_probs),
                                               accuracy_score(y_test,y_pred)]
            #plot_precision_recall_n(y_test, y_pred_probs, 'LR', 'show')
        except IndexError:
            print('IndexError')
            continue
    return results_df

In [8]:
# run methods loop
df_validation = clf_loop(X_train, X_test, y_train, y_test, params)

# obtain params for model with max auc-roc
best_model_params = dict(df_validation.sort_values('auc-roc', ascending=False)[:1]['parameters'])
key = list(best_model_params.keys())[0]

# evaluate using test data
lr = LogisticRegression()
lr.set_params(**best_model_params[key])
lr.fit(X_validation, y_validation)

# get predicted scores for test set
y_pred = lr.predict_proba(X_test)[:, 1]
y_pred

array([0.07987864, 0.55178373, 0.55328869, 0.32227806, 0.73842045,
       0.83837443, 0.20360718, 0.26207714, 0.57778655, 0.25874635,
       0.35862067, 0.95566582, 0.41597738])