# Compare models

## Loading feature sets

In [None]:
# defining constants
corpus_folder='corpus'  # corpus directory name (not referenced again in the cells shown here)
metadata_fn='corpus/metadata.txt'  # tab-separated metadata table; indexed by its 'fn' column when loaded
target_col='sample_name'  # metadata column that holds the class label to predict
dir_featuresets='featuresets'  # directory scanned for tab-separated '*.txt' featureset files
kf_n_splits=5  # default number of folds for the K-fold cross-validation in classify()

In [None]:
# imports
import pandas as pd,os
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)

In [None]:
# Open the metadata table and every featureset file.
# ALL_DFs holds (filename, dataframe) pairs with the metadata first;
# name2df maps each featureset name (filename minus '.txt') to its dataframe.
df_meta=pd.read_csv(metadata_fn,encoding='utf-8',sep='\t').set_index('fn')
ALL_DFs = [(metadata_fn,df_meta)]
name2df={}
# os.listdir returns entries in arbitrary order; sorting makes the load
# order (and everything derived from it) reproducible between runs
for fn in sorted(os.listdir(dir_featuresets)):
    if not fn.endswith('.txt'): continue
    name=fn.replace('.txt','')
    # missing feature values are treated as 0
    df=pd.read_csv(os.path.join(dir_featuresets,fn),encoding='utf-8',sep='\t').fillna(0)
    # the filename column is called either '_fn' or 'fn' depending on the file
    df = df.set_index('_fn' if '_fn' in df else 'fn')
    ALL_DFs.append((fn,df))
    name2df[name]=df

In [None]:
# the metadata
df_meta.shape

In [None]:
# The feature dfs: one line per featureset with its name and shape
for name,df in name2df.items():
    # a single pre-formatted argument prints identically on Python 2 and 3
    print('%s %s' % (name, df.shape))

### Synchronize all dataframes (metadata + featuresets)

In [None]:
# Index labels (filenames) present in the metadata and in every featureset,
# kept as a sorted list
index_sets = list(map(lambda pair: set(pair[1].index), ALL_DFs))
common_labels = set.intersection(*index_sets)
shared_index = sorted(common_labels)

In [None]:
# How many shared indices are there?
len(shared_index)

In [None]:
# Rebalance all dfs to just the shared index.
# Duplicate index labels are dropped first (keeping the first occurrence);
# .loc[shared_index] then both filters the rows and puts them in
# shared_index order, so no separate reindex step is needed.
df_meta=df_meta[~df_meta.index.duplicated(keep='first')]
df_meta=df_meta.loc[shared_index]
for i,(fn,df) in enumerate(ALL_DFs):
    name=fn.replace('.txt','')
    shape_before=df.shape
    df=df[~df.index.duplicated(keep='first')]
    df=df.loc[shared_index]
    ALL_DFs[i]=(fn,df)
    # entry 0 is the metadata table, which is not a featureset
    if i>0: name2df[name]=df
    # one line per dataframe: filename, shape before, shape after
    # (a single pre-formatted argument prints identically on Python 2 and 3)
    print('%s %s %s' % (fn,shape_before,df.shape))

In [None]:
# The metadata now
df_meta.shape

In [None]:
# The feature data frames and their shapes
[(name,df.shape) for name,df in name2df.items()]

## Fine-tune

In [None]:
# Poesy columns: the numeric poesy features whose name starts with 'rhymes_'
qcols_poesy=[
    colname
    for colname in name2df['poesy'].select_dtypes('number').columns
    if colname.startswith('rhymes_')
]
qcols_poesy

In [None]:
# extra featureset: poesy restricted to the columns listed in qcols_poesy
name2df['poesy (just rhyme)']=name2df['poesy'][qcols_poesy]

In [None]:
# add combinations: poesy columns with the wordfreqs columns joined on by index
# NOTE(review): DataFrame.join raises if the two frames share a column name — confirm they do not
name2df['poesy + wordfreqs']=name2df['poesy'].join(name2df['wordfreqs'])

## Classify!

In [None]:
# imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
import numpy as np

# basic classifier
def classify(Xdf,y,return_probs=False,standardize=True,leave_one_out=False,n_splits=kf_n_splits):
    """
    Cross-validated logistic-regression classifier.

    Every row of Xdf is predicted exactly once, by a model trained without
    that row: with shuffled K-fold cross-validation by default, or with
    leave-one-out cross-validation if leave_one_out is True.

    Xdf: dataframe of numerical features (rows = samples, columns = features)
    y: true classes for the rows of Xdf, in the same order (Sonnets, NotSonnets, ...)
    return_probs: accepted for backward compatibility; it has no effect,
        because predictions and probabilities are always both returned
    standardize: z-score every column before fitting
    leave_one_out: use leave-one-out instead of K-fold cross-validation
    n_splits: number of folds for K-fold cross-validation

    Returns a tuple (all_predictions, all_probs, all_coeffs):
    all_predictions: list of predicted classes, in the row order of Xdf
    all_probs: list with column 1 of predict_proba for each row, in the row
        order of Xdf (the probability of 'Sonnets' when the two classes are
        NotSonnets/Sonnets)
    all_coeffs: mapping from column name to that column's coefficient
        (first row of clf.coef_), averaged over all folds
    """
    from collections import defaultdict

    y=np.asarray(y)
    if standardize:
        from scipy.stats import zscore
        # a constant column has zero variance, so its z-scores come out as
        # NaN; zero them so the column is merely uninformative instead of
        # making the fit fail
        X=np.nan_to_num(zscore(Xdf.values))
    else:
        # use the raw values (X used to be left undefined on this path)
        X=Xdf.values

    if leave_one_out:
        splitter=LeaveOneOut().split(X)
    else:
        splitter=KFold(n_splits=n_splits,shuffle=True,random_state=11).split(X)

    # results are stored by row position, so the output order always matches
    # Xdf no matter how the folds were shuffled
    n_rows=X.shape[0]
    all_predictions=[None]*n_rows
    all_probs=[None]*n_rows
    all_coeffs=defaultdict(list)

    for train_index,test_index in splitter:
        # build a new model for every fold
        clf = LogisticRegression(C=0.001)
        clf.fit(X[train_index],y[train_index])
        fold_predictions=clf.predict(X[test_index])
        fold_probs=clf.predict_proba(X[test_index])[:,1]
        for pos,prediction,prob in zip(test_index,fold_predictions,fold_probs):
            all_predictions[pos]=prediction
            all_probs[pos]=prob
        # collect this fold's feature coefficients
        for col,coef in zip(Xdf.columns,clf.coef_[0]):
            all_coeffs[col].append(coef)

    # avg feature coefficients over the folds
    for col in all_coeffs: all_coeffs[col]=np.mean(all_coeffs[col])
    # return all this data
    return (all_predictions,all_probs,all_coeffs)

In [None]:
# Run classifier on a featureset
def classify_feat_df(df_feat,return_probs=False):
    """
    Run classify() on one featureset.

    df_feat: featureset dataframe whose index labels (filenames) all appear
        in the index of df_meta
    return_probs: passed through to classify()

    Returns whatever classify() returns.
    """
    # only numeric columns are fed to the model; missing values count as 0
    Xdf=df_feat.select_dtypes('number').fillna(0)
    # look the labels up through the featureset's own index so that X and y
    # stay aligned row for row even if the frame is a subset or a reordering
    # of shared_index
    y=np.array(df_meta.loc[Xdf.index,target_col].tolist())
    return classify(Xdf,y,return_probs=return_probs)

In [None]:
#all_predictions,all_probs,all_coeffs=classify_feat_df(name2df['poesy'])
#all_coeffs

In [None]:
# Initiate results and probabilities dataframe: both are indexed by
# shared_index and start with one column, the true class from the metadata
df_results=pd.DataFrame({target_col:df_meta[target_col]},index=shared_index)
df_probs=pd.DataFrame({target_col:df_meta[target_col]},index=shared_index)

In [None]:
# Classify all feature DFs and get all data
ld_feats=[]
feats_sofar=set()
for name,df in name2df.items():
    # a single pre-formatted argument prints identically on Python 2 and 3
    print('>> featureset: %s ...' % name)
    #if 'wordfreqs' in name: continue

    # classify
    preds,probs,featd=classify_feat_df(df)

    # save data
    df_results[name]=preds
    df_probs[name]=probs

    # save data about feature coefficients; combined ('+') featuresets are
    # skipped, and a feature is recorded only for the first model it shows up in
    if '+' in name: continue
    for feat,coef in featd.items():
        if feat in feats_sofar: continue
        feats_sofar.add(feat)
        ld_feats.append({'feat':feat,'model':name,'coeff':coef})
# explicit columns keep the column set and order fixed, also when ld_feats is empty
df_feats=pd.DataFrame(ld_feats,columns=['coeff','feat','model'])
df_feats['abs(coeff)']=df_feats['coeff'].abs()

In [None]:
df_results

In [None]:
df_probs

In [None]:
df_feats

## Adding rule-based models

In [None]:
def poesy_rule_based_classifier_for_sonnets(row):
    """
    Strict rule-based sonnet detector.

    row: mapping or dataframe row with the keys 'num_lines',
        'meter_length_avg_line' and 'rhyme_scheme_name'

    Returns 'Sonnets' only if 'num_lines' equals 14, 'meter_length_avg_line'
    is neither below 8 nor above 12, and the text of 'rhyme_scheme_name'
    contains 'sonnet' (case-insensitive); otherwise returns 'NotSonnets'.
    """
    if row['num_lines']!=14: return 'NotSonnets'
    #if row['meter_type_scheme']!='iambic': return 'NotSonnets'
    #if row['beat_scheme_repr']!='Pentameter': return 'NotSonnets'
    avg_length=row['meter_length_avg_line']
    if avg_length<8 or avg_length>12: return 'NotSonnets'
    # u'%s' % (value,) turns any value into text on both Python 2 and 3;
    # the unicode() builtin exists only on Python 2
    scheme_name=(u'%s' % (row['rhyme_scheme_name'],)).lower()
    return 'Sonnets' if 'sonnet' in scheme_name else 'NotSonnets'

In [None]:
def loose_poesy_rule_based_classifier_for_sonnets(row,certainty_threshold=0.5,return_probs=False):
    """
    Lenient rule-based sonnet detector: six rules each cast one vote.

    row: mapping or dataframe row with the keys 'num_lines',
        'meter_type_scheme', 'beat_scheme_repr', 'meter_length_avg_line'
        and 'rhyme_scheme_name'
    certainty_threshold: the share of satisfied rules must be strictly
        greater than this for the row to be called a sonnet
    return_probs: return the share of satisfied rules (a float between 0
        and 1) instead of a class label

    Returns 'Sonnets' or 'NotSonnets', or the certainty if return_probs is True.
    """
    avg_length=row['meter_length_avg_line']
    # u'%s' % (value,) turns any value into text on both Python 2 and 3;
    # the unicode() builtin exists only on Python 2
    scheme_name=(u'%s' % (row['rhyme_scheme_name'],)).lower()
    counts_in_its_favor = [
        row['num_lines']==14,
        row['meter_type_scheme']=='iambic',
        row['beat_scheme_repr']=='Pentameter',
        avg_length>=8,
        avg_length<=12,
        'sonnet' in scheme_name,
    ]
    votes = sum(int(x) for x in counts_in_its_favor)
    certainty = votes/float(len(counts_in_its_favor))
    if return_probs: return certainty
    return 'Sonnets' if certainty>certainty_threshold else 'NotSonnets'

In [None]:
# Add rule based models' predictions to df_results
# (one label per poesy row, assigned to the columns by position)
strict_labels=[]
loose_labels=[]
for ind,row in name2df['poesy'].iterrows():
    strict_labels.append(poesy_rule_based_classifier_for_sonnets(row))
    loose_labels.append(loose_poesy_rule_based_classifier_for_sonnets(row))
df_results['rule-based']=strict_labels
df_results['loosely_rule-based']=loose_labels

In [None]:
# Add rule-based models' probabilities to df_probs
strict_probs=[]
loose_probs=[]
for ind,row in name2df['poesy'].iterrows():
    # the strict model is all-or-nothing: 1.0 for 'Sonnets', otherwise 0.0
    is_sonnet=poesy_rule_based_classifier_for_sonnets(row)=='Sonnets'
    strict_probs.append(1.0 if is_sonnet else 0.0)
    # loosely rule based: the certainty value itself
    loose_probs.append(loose_poesy_rule_based_classifier_for_sonnets(row,return_probs=True))
df_probs['rule-based_prob_sonnet']=strict_probs
df_probs['loosely_rule-based_prob_sonnet']=loose_probs

In [None]:
#df_results

## Compare precision/recall

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# One summary row per model column of df_results: precision, recall and
# F-score for the 'Sonnets' class, best F-score first
summaryld=[]
for col in df_results.columns:
    if col==target_col: continue
    # labels=['Sonnets'] limits every returned array to that single class
    metric_arrays=precision_recall_fscore_support(df_results[target_col], df_results[col], labels=['Sonnets'])
    sdx={}
    for metric,values in zip(('precision','recall','fscore','support'),metric_arrays):
        sdx[metric]=values[0]
    sdx['model']=col.replace('_',' ').title()
    summaryld.append(sdx)
summarydf=pd.DataFrame(summaryld)
summarydf=summarydf[['model','precision','recall','fscore']]
summarydf=summarydf.sort_values('fscore',ascending=False).set_index('model')

In [None]:
summarydf.round(3)

## Precision/Recall curves

### Gathering probabilities

In [None]:
#df_probs

### Calculate the P/R Curves

In [None]:
from sklearn.metrics import precision_recall_curve
def prcurve(y_true,y_scores):
    """
    Precision/recall curve with 'Sonnets' as the positive class.

    y_true: iterable of true class labels
    y_scores: scores for the positive class; must expose .values
        (e.g. a pandas Series)

    Returns the result of precision_recall_curve unchanged.
    """
    true_labels=list(y_true)
    score_array=y_scores.values
    return precision_recall_curve(true_labels, score_array, pos_label='Sonnets')

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 10]
#plt.figure(figsize=(600, ), dpi= 80, facecolor='w', edgecolor='k')

def plot_prcurve(y_true,y_scores,label,color='b'):
    """
    Draw one model's precision/recall curve as a step line on the current
    matplotlib figure.

    y_true: true class labels; 'Sonnets' is the positive class
    y_scores: scores for the positive class, in a form prcurve() accepts
    label: legend label; every '_prob_sonnet' in it is removed and the
        average precision (rounded to 2 decimals) is appended
    color: matplotlib colour of the line
    """
    from sklearn.metrics import average_precision_score

    precision, recall, _ = prcurve(y_true,y_scores)
    # average precision needs binary targets: 1 for 'Sonnets', 0 for anything else
    average_precision = average_precision_score([int(yx=='Sonnets') for yx in y_true], y_scores)
    label = label.replace('_prob_sonnet','') + ' (AP=%s)' % round(average_precision,2)
    plt.step(recall, precision, color=color, alpha=1.0,
             where='post',label=label)

In [None]:
#plot_prcurve(df_probs[target_col], df_probs['poesy_prob_sonnet'])
colors=['r','b','g','y','k','c']
for i,col in enumerate(df_probs):
    if col==target_col: continue
    # cycle through the palette so that having more columns than colours
    # cannot raise an IndexError
    plot_prcurve(df_probs[target_col], df_probs[col],color=colors[i % len(colors)],label=col)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# count the classes instead of assuming an even split (this also keeps the
# numbers in the title integers on Python 3)
n_sonnets=int((df_meta[target_col]=='Sonnets').sum())
n_other=len(df_meta[target_col])-n_sonnets
plt.title('Two-class Precision-Recall Curves\nfor {nmodel} models predicting sonnets (n={nc1}) vs. non-sonnets (n={nc2})'.format(
            nmodel=len(df_probs.columns)-1,nc1=n_sonnets,nc2=n_other
))
plt.legend()
plt.savefig('figure.png')
plt.show()

## Examining features

In [995]:
pd.set_option('display.max_rows',None)
df_feats.sort_values('abs(coeff)',ascending=False)

Unnamed: 0,coeff,feat,model,abs(coeff)
1030,0.091062,meter_length_avg_line,poesy,0.091062
1043,0.091062,meter_length_avg_parse,poesy,0.091062
1118,-0.083521,rhymes_l11-l12,poesy,0.083521
1057,-0.081263,rhyme_scheme_acc_Couplet,poesy,0.081263
1081,0.07708,rhymes_l05-l08,poesy,0.07708
1100,-0.075575,rhymes_l01-l02,poesy,0.075575
1022,-0.070453,rhymes_l05-l06,poesy,0.070453
1107,0.061501,rhyme_scheme_acc_Sonnet H,poesy,0.061501
1075,-0.061058,rhymes_l09-l10,poesy,0.061058
1150,-0.059888,rhymes_l07-l08,poesy,0.059888


In [994]:
pd.set_option('display.max_rows',5)