### Load packages

In [None]:
%pylab inline
import pandas as pd
import psycopg2
import sklearn
import seaborn as sns
from sklearn import preprocessing,cross_validation,svm,metrics,tree,decomposition
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression, SGDClassifier,Perceptron,OrthogonalMatchingPursuit,RandomizedLogisticRegression
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from sqlalchemy import create_engine
# Apply seaborn's "white" style to the plots drawn below.
sns.set_style("white")

### Load data

In [None]:
# Create the SQLAlchemy engine used by every query in this notebook.
# NOTE(review): the database host is hard-coded in the URL; consider reading it from configuration.
engine = create_engine('postgresql://10.10.2.10/appliedda')

# List schema names from information_schema (the query is capped at 10 rows, so this may not be all of them).
pd.read_sql("SELECT schema_name FROM information_schema.schemata LIMIT 10;", engine)

In [None]:
# Load c6.partial_evaluate into `df`, keeping only rows whose oldSpell_end lies
# between 2010-05-31 and 2012-12-31 (both bounds inclusive).
df=pd.read_sql("SELECT * FROM c6.partial_evaluate WHERE (oldSpell_end>='2010-05-31' AND oldSpell_end<='2012-12-31');",engine)
# Report (n_rows, n_columns) of the loaded frame.
print(df.shape)

In [None]:
isnan_training_rows = df.isnull().any(axis=1) # Boolean mask: True for every row holding at least one NaN
# Preview the first few rows that contain missing values.
df[isnan_training_rows].head()

In [None]:
# Boolean Series indexed by column name: True where the column holds at least one NaN.
isnan_training_columns = df.isnull().any(axis=0)
# Show the names of the columns that contain missing values.
# (The previous `.item` was an uncalled bound method, so the cell displayed a method repr.)
isnan_training_columns[isnan_training_columns].index.tolist()

In [None]:
# Expand the categorical columns 'quarter_t', 'edlevel', 'workexp', 'district',
# 'race', 'sex', 'rootrace' and 'foreignbn' into boolean indicator columns.
# Each entry is (source column, [(indicator column, category code), ...]);
# a list (not a dict) is used so the new columns are appended in a fixed order.
dummy_specs = [
    ('quarter_t', [('t_q1', 1), ('t_q2', 2), ('t_q3', 3)]),
    ('edlevel', [('edu_hs', 1), ('edu_hsgrad', 2), ('edu_coll', 3), ('edu_collgrad', 4)]),
    ('workexp', [('work_prof', 2), ('work_othermgr', 3), ('work_clerical', 4),
                 ('work_sales', 5), ('work_crafts', 6), ('work_oper', 7),
                 ('work_service', 8), ('work_labor', 9)]),
    ('district', [('dist_cookcty', 1), ('dist_downstate', 0)]),
    # race from assistance case: 1,2,3
    ('race', [('race_1', 1), ('race_2', 2)]),
    ('sex', [('male', 1), ('female', 2)]),
    ('rootrace', [('hh_white', 1), ('hh_black', 2), ('hh_native', 3),
                  ('hh_hispanic', 6), ('hh_asian', 7)]),
    # foreignbn: 0,1,2,3,4,5
    ('foreignbn', [('foreignbn_1', 1), ('foreignbn_2', 2), ('foreignbn_3', 3),
                   ('foreignbn_4', 4), ('foreignbn_5', 5)]),
]
for source_col, indicators in dummy_specs:
    for dummy_col, code in indicators:
        # True where the source column equals this category code
        df[dummy_col] = (df[source_col] == code)



In [None]:
#df[(df['oldspell_end'] >=datetime.date(2010,5,31)) & (df['oldspell_end'] <=datetime.date(2010,12,31))]

# Replace missing 'age' values with the mean of trainset['age'].
# NOTE(review): `trainset` is not defined in the visible notebook; presumably it was
# created in a cell that has been removed -- confirm before running.
trainset['age']=trainset['age'].fillna(trainset['age'].mean())

# (cell contents deleted due to disclosure control)

# Share of each return_1yr value among the rows where spell_cancel == 1.
trainset[trainset['spell_cancel']==1]['return_1yr'].value_counts(normalize=True)

In [None]:
#testset.crosstab['return_1yr','spell_cancel']

### Mask the data and normalize

### split into features and labels

In [None]:
# Globals: the label column and the feature-name lists used to build model inputs.
sel_label = 'return_1yr'

# Full feature list. The order of names is preserved exactly, because it fixes
# the column order of any array built with df[sel_features].
sel_features = [
    'foodstamp', 'tanf', 'spell_cancel',
    # *_tp4 .. *_tp1 columns
    'num_emp_tp4', 'wage_sum_tp4', 'wage_high_tp4',
    'num_emp_tp3', 'wage_sum_tp3', 'wage_high_tp3',
    'num_emp_tp2', 'wage_sum_tp2', 'wage_high_tp2',
    'num_emp_tp1', 'wage_sum_tp1', 'wage_high_tp1',
    # *_tm1 .. *_tm4 columns
    'num_emp_tm1', 'wage_sum_tm1', 'wage_high_tm1',
    'num_emp_tm2', 'wage_sum_tm2', 'wage_high_tm2',
    'num_emp_tm3', 'wage_sum_tm3', 'wage_high_tm3',
    'num_emp_tm4', 'wage_sum_tm4', 'wage_high_tm4',
    'wage_sum_tp1t4', 'wage_sum_tm1t4',
    # spell history
    'spell_length', 'n_prespells', 'max_spell_length',
    'min_spell_length', 'avg_spell_length',
    'total_foodstamp_utlnow', 'total_tanf_utlnow',
    # household columns and the indicator columns
    'marstat', 'homeless', 'hh_counts',
    't_q1', 't_q2', 't_q3',
    'edu_hs', 'edu_hsgrad', 'edu_coll', 'edu_collgrad',
    'work_prof', 'work_othermgr', 'work_clerical', 'work_sales',
    'work_crafts', 'work_oper', 'work_service', 'work_labor',
    'dist_cookcty', 'dist_downstate',
    'race_1', 'race_2',
    'male', 'female',
    'hh_white', 'hh_black', 'hh_native', 'hh_hispanic', 'hh_asian',
    'foreignbn_1', 'foreignbn_2', 'foreignbn_3', 'foreignbn_4', 'foreignbn_5',
]

# Spell-related subset.
sel_features_spell = [
    'foodstamp', 'tanf', 'spell_cancel',
    'spell_length', 'n_prespells',
    'max_spell_length', 'min_spell_length', 'avg_spell_length',
    'total_foodstamp_utlnow', 'total_tanf_utlnow',
]

# Wage subset (only the *_tm* columns).
sel_features_wage = [
    'num_emp_tm1', 'wage_sum_tm1', 'wage_high_tm1',
    'num_emp_tm2', 'wage_sum_tm2', 'wage_high_tm2',
    'num_emp_tm3', 'wage_sum_tm3', 'wage_high_tm3',
    'num_emp_tm4', 'wage_sum_tm4', 'wage_high_tm4',
    'wage_sum_tm1t4',
]

# Demographic subset.
# NOTE(review): 'age' appears here but not in sel_features -- confirm that is intended.
sel_features_demo = [
    'age', 'marstat', 'homeless', 'hh_counts',
    't_q1', 't_q2', 't_q3',
    'edu_hs', 'edu_hsgrad', 'edu_coll', 'edu_collgrad',
    'work_prof', 'work_othermgr', 'work_clerical', 'work_sales',
    'work_crafts', 'work_oper', 'work_service', 'work_labor',
    'dist_cookcty', 'dist_downstate',
    'race_1', 'race_2',
    'male', 'female',
    'hh_white', 'hh_black', 'hh_native', 'hh_hispanic', 'hh_asian',
    'foreignbn_1', 'foreignbn_2', 'foreignbn_3', 'foreignbn_4', 'foreignbn_5',
]

Accuracy is the ratio of the correct predictions (both positive and negative) to all predictions. 
$$ Accuracy = \frac{TP+TN}{TP+TN+FP+FN} $$

Two additional metrics that are often used are **precision** and **recall**. 

Precision measures the accuracy of the classifier when it predicts an example to be positive. It is the ratio of correctly predicted positive examples to examples predicted to be positive. 

$$ Precision = \frac{TP}{TP+FP}$$

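Recall measures the ability of the classifier to find the positive examples in the data. It is the ratio of correctly predicted positive examples to all examples that are actually positive.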

$$ Recall = \frac{TP}{TP+FN} $$

By selecting different thresholds we can vary and tune the precision and recall of a given classifier. A conservative classifier (threshold 0.99) will classify a case as 1 only when it is *very sure*, leading to high precision. On the other end of the spectrum, a low threshold (e.g. 0.01) will lead to higher recall.

from sklearn.metrics import precision_score, recall_score  # same names are imported in the first cell; repeated here
precision = precision_score(expected, predicted)  # NOTE(review): `expected` / `predicted` are not defined in the visible notebook -- confirm where they are created
recall = recall_score(expected, predicted)
print( "Precision = " + str( precision ) )
print( "Recall= " + str(recall))

In [None]:
def plot_precision_recall(y_true,y_score):
    """
    Draw precision against recall for a scored sample and print the
    area under that curve.

    Parameters
    ----------
    y_true: ls
        ground truth labels
    y_score: ls
        score output from model

    Returns
    -------
    None. The curve is shown with pyplot and the area is printed.
    """
    # thresholds are returned by precision_recall_curve but not needed here
    precisions, recalls, _ = precision_recall_curve(y_true,y_score)
    plt.plot(recalls, precisions)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    print('AUC-PR: {0:1f}'.format(auc(recalls,precisions)))
    plt.show()
    plt.clf()

plot_precision_recall(expected, y_scores)

### Precision and recall at k%

In [None]:
def plot_precision_recall_n(y_true, y_prob, model_name):
    """
    Plot precision and recall against the share of the population that is
    scored at or above each threshold.

    Parameters
    ----------
    y_true: ls
        ls of ground truth labels
    y_prob: ls
        ls of predic proba from model (a plain list or a numpy array)
    model_name: str
        str of model name (e.g, LR_123); used as the figure title

    Returns
    -------
    None. The figure is shown and then closed.
    """
    # Accept plain lists as well as arrays: the counting below needs an ndarray.
    y_score = np.asarray(y_prob)
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    # precision/recall have one more entry than the thresholds; drop the last
    # entry so all three arrays line up.
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]

    # For each threshold, the number of scores >= threshold equals
    # n - (index of the first sorted score that is >= threshold).
    number_scored = len(y_score)
    sorted_scores = np.sort(y_score)
    num_above_thresh = number_scored - np.searchsorted(sorted_scores, pr_thresholds, side='left')
    pct_above_per_thresh = num_above_thresh / float(number_scored)

    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax1.set_ylim(0,1.05)
    # second y-axis sharing the same x-axis, for recall
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    ax2.set_ylim(0,1.05)

    # str() so that a non-string (e.g. an estimator object) is also a valid title
    plt.title(str(model_name))
    plt.show()
    # Close the figure so repeated calls in a loop do not pile up open figures.
    plt.close(fig)

In [None]:
def precision_at_k(y_true, y_scores,k):
    """
    Precision among the top-scoring fraction `k` of cases.

    NOTE: in this definition `k` is a FRACTION (0.01 means the top 1%).
    A function with the same name is defined again further down the notebook
    and takes `k` as a percentage; that later definition replaces this one
    once its cell has run.

    Parameters
    ----------
    y_true: array-like
        ground truth labels
    y_scores: array-like
        scores from the model, higher meaning more likely positive
    k: float
        fraction of cases to flag, between 0 and 1

    Returns
    -------
    float, as returned by precision_score

    Raises
    ------
    ValueError
        if y_scores is empty
    """
    scores = np.asarray(y_scores)
    n_scored = len(scores)
    if n_scored == 0:
        raise ValueError('y_scores is empty')
    # Flag the top int(k*n) cases: at least one, at most all of them.
    # (Indexing at int(k*n) itself flagged one case too many and ran past the
    # end of the array for k == 1.0.)
    n_flagged = min(max(int(k*n_scored), 1), n_scored)
    threshold = np.sort(scores)[::-1][n_flagged - 1]
    # scores tied with the threshold are flagged as well
    y_pred = (scores >= threshold).astype(int)
    return precision_score(y_true, y_pred)

plot_precision_recall_n(expected,y_scores, 'LR')

p_at_1 = precision_at_k(expected,y_scores, 0.01)
print('Precision at 1%: {:.2f}'.format(p_at_1))

### magic loops

In [None]:
def define_clfs_params(grid_size):
    """
    Build the candidate classifiers and the parameter grid to search for each.

    Parameters
    ----------
    grid_size: str
        name of the grid to return; only 'small' is defined

    Returns
    -------
    (clfs, grid): tuple of two dicts keyed by the same short model codes
        clfs maps code -> estimator instance,
        grid maps code -> dict of parameter name -> list of values to try

    Raises
    ------
    ValueError
        if grid_size is not a known grid name (previously the function fell
        through and returned None, which failed later when unpacked)
    """
    clfs={'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
       'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB':AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm='SAMME',n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        # probability must be the boolean True (it was the string 'true')
        'SVM':svm.SVC(kernel='linear',probability=True,random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6,n_estimators=10),
        'NB': GaussianNB(),
        'DT':DecisionTreeClassifier(),
        'SGD':SGDClassifier(loss='hinge',penalty='l2'),
        'KNN':KNeighborsClassifier(n_neighbors=3)
        }
    small_grid={
        'RF':{'n_estimators':[100,10000],'max_depth':[5,50],'max_features':['sqrt','log2'],'min_samples_split':[2,10]},
        # single-setting alternative for quick runs:
        #'LR':{'penalty':['l1'],'C':[0.1]},
        'LR':{'penalty':['l1','l2'],'C':[0.00001,0.001,0.1,1,10]},
        'SGD':{'loss':['hinge','log','perceptron'],'penalty':['l2','l1','elasticnet']},
        'ET':{'n_estimators':[10,100],'criterion':['gini','entropy'],'max_depth':[5,50],'max_features':['sqrt','log2'],
             'min_samples_split':[2,10]},
        'AB':{'algorithm':['SAMME','SAMME.R'],'n_estimators':[1,10,100,1000,10000]},
        'GB':{'n_estimators':[10,100],'learning_rate':[0.001,0.1,0.5],'subsample':[0.1,0.5,1.0],'max_depth':[5,50]},
        'NB':{},
        'DT':{'criterion':['entropy'],'max_depth':[1,10,50],
              'min_samples_split':[2,5,10]},
        'SVM':{'C':[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
        'KNN':{'n_neighbors':[1,5,10,25,50,100],'weights':['uniform','distance'],
               'algorithm':['auto','ball_tree','kd_tree']}
    }
    grids={'small':small_grid}
    if grid_size not in grids:
        raise ValueError('unknown grid_size {!r}; expected one of {}'.format(grid_size,sorted(grids)))
    return clfs,grids[grid_size]

In [None]:
def joint_sort_descending(l1,l2):
    """
    Sort l1 in descending order and reorder l2 the same way.

    Parameters
    ----------
    l1: array-like
        values to sort by
    l2: array-like
        values carried along; must have the same length as l1

    Returns
    -------
    (l1_sorted, l2_reordered): tuple of numpy arrays
    """
    # Convert up front so lists and tuples work too; arrays pass through as-is.
    l1=np.asarray(l1)
    l2=np.asarray(l2)
    # ascending argsort, reversed -> positions of l1 from largest to smallest
    idx=np.argsort(l1)[::-1]
    return l1[idx],l2[idx]

In [None]:
def generate_binary_at_k(y_scores,k):
    """
    Build 0/1 predictions that flag the first k percent of positions.

    Only the length of y_scores is used, so the caller is expected to pass
    scores that are already ordered from highest to lowest.

    Parameters
    ----------
    y_scores: sequence
        scores; only len(y_scores) is read
    k: number
        percentage of positions to flag (e.g. 5.0 for 5%)

    Returns
    -------
    list of int: 1 for the leading positions, 0 for the rest
    """
    total=len(y_scores)
    cutoff_index=int(total*(k/100.0))
    # positions 0..cutoff_index-1 are flagged; clamp to the valid range
    n_flagged=min(max(cutoff_index,0),total)
    return [1]*n_flagged+[0]*(total-n_flagged)

In [None]:
def precision_at_k(y_true, y_scores,k):
    """
    Precision among the k percent of cases with the highest scores.

    Here `k` is a PERCENTAGE (5.0 means the top 5%). This definition replaces
    the earlier function of the same name in this notebook.

    Parameters
    ----------
    y_true: sequence
        ground truth labels
    y_scores: sequence
        scores from the model
    k: number
        percentage of cases to flag

    Returns
    -------
    float, as returned by precision_score
    """
    # rank both sequences together, highest score first
    ranked_scores,ranked_truth=joint_sort_descending(np.array(y_scores),np.array(y_true))
    return precision_score(ranked_truth,generate_binary_at_k(ranked_scores,k))

In [None]:
def clf_loop(models_to_run,clfs,grid):
    """
    Fit every requested model over its parameter grid and collect test metrics.

    Reads the module-level names df, sel_label, sel_features and NOTEBOOK.

    Parameters
    ----------
    models_to_run: list of str
        keys into clfs and grid, in the order they should be run
    clfs: dict
        model code -> estimator instance
    grid: dict
        model code -> parameter grid accepted by ParameterGrid

    Returns
    -------
    pandas.DataFrame with one row per (model, parameter setting) and the
    columns model_type, clf, parameters, auc-roc, p_at_5, p_at_10, p_at_20
    """
    # The last column was a second 'p_at_10'; it holds precision at 20%.
    results_df=pd.DataFrame(columns=('model_type','clf','parameters','auc-roc','p_at_5','p_at_10','p_at_20'))
    for year in range(2011,2012):

        #create training and test set
        year_minus_one=year-1
        print(year)
        print(year_minus_one)
        # Train on spells ending from 2010-05-31 through the end of the previous year.
        df_training=df[(df['oldspell_end']>=datetime.date(2010,5,31)) &
                       (df['oldspell_end']<=datetime.date(year_minus_one,12,31))].copy()
        # Test on spells ending within `year` only. The test window used to start
        # at 2010-05-31 too, so every training row was also used for evaluation.
        df_testing=df[(df['oldspell_end']>=datetime.date(year,1,1)) &
                      (df['oldspell_end']<=datetime.date(year,12,31))].copy()
        print(df_training.shape)
        print(df_testing.shape)

        print('Imputed with mean')
        # Assign the result instead of chained fillna(inplace=True), which is not
        # guaranteed to write back into the frame.
        # NOTE(review): 'age' is imputed here but is not in sel_features -- confirm.
        df_training['age']=df_training['age'].fillna(df_training['age'].mean())
        df_testing['age']=df_testing['age'].fillna(df_testing['age'].mean())

        print("create np.array")
        y_train=df_training[sel_label].values
        X_train=df_training[sel_features].values
        y_test=df_testing[sel_label].values
        X_test=df_testing[sel_features].values

        print('Scaling training and testing sets')
        # Fit the scaler on the training data only and apply it to both sets.
        # (A stray `scalerStandardScaler(with_mean=False)` line here raised NameError.)
        scaler=StandardScaler().fit(X_train)
        scaled_X_train=scaler.transform(X_train)
        scaled_X_test=scaler.transform(X_test)
        print ("mean of un-scaled features for X_train:{}".format(np.mean(X_train)))
        print ("mean of un-scaled features for X_test:{}".format(np.mean(X_test)))
        print ("mean of normalized features for X_train:{}".format(np.mean(scaled_X_train)))
        print ("mean of normalized features for X_test:{}".format(np.mean(scaled_X_test)))
        X_train=scaled_X_train
        X_test=scaled_X_test
        print('baseline: Number of rows: {}'.format(df_training.shape[0]))
        print(df_training['return_1yr'].value_counts(normalize=True))

        for model_name in models_to_run:
            clf=clfs[model_name]
            print(model_name)
            for p in ParameterGrid(grid[model_name]):
                try:
                    clf.set_params(**p)
                    # probability of the positive class for each test row
                    y_pred_probs=clf.fit(X_train,y_train).predict_proba(X_test)[:,1]
                    #you can also store the model, feature importances, and prediction scores
                    #we're only storing the metrics for now
                    y_pred_probs_sorted,y_test_sorted=zip(*sorted(zip(y_pred_probs,y_test),reverse=True))
                    # NOTE(review): the same estimator object is reused for every
                    # setting, so the 'clf' cells of one model all refer to it;
                    # rely on the 'parameters' column for the setting of each row.
                    results_df.loc[len(results_df)]=[model_name,clf,p,
                                                     roc_auc_score(y_test,y_pred_probs),
                                                     precision_at_k(y_test_sorted,y_pred_probs_sorted,5.0),
                                                     precision_at_k(y_test_sorted,y_pred_probs_sorted,10.0),
                                                     precision_at_k(y_test_sorted,y_pred_probs_sorted,20.0)]
                    if NOTEBOOK==1:
                        plot_precision_recall_n(y_test,y_pred_probs,clf)
                except IndexError as e:
                    # skip this parameter setting and carry on with the next
                    print('Error: {}'.format(e))
                    continue
    return results_df

In [None]:
def main():
    """
    Run the model comparison on the small grid and save the metrics.

    Writes the results table to 'results.csv' in the working directory.

    Returns
    -------
    pandas.DataFrame: the table produced by clf_loop. Returning it lets a
    notebook cell that ends with `main()` display the table; the former bare
    `results_df` expression inside this function displayed nothing.
    """
    grid_size='small'
    clfs,grid=define_clfs_params(grid_size)
    # other model codes defined in clfs include 'KNN','ET','AB','GB','NB'
    models_to_run=['LR','RF','DT']
    results_df=clf_loop(models_to_run,clfs,grid)
    results_df.to_csv('results.csv',index=True)
    return results_df

    

In [None]:
# Flag checked by clf_loop and main: when it equals 1, notebook-only output (plots) is produced.
NOTEBOOK=1

In [None]:
# Run the whole pipeline. (For script use, the guard would be: if __name__ == '__main__':)
main()