In [123]:
%matplotlib inline

import dfmaker
import modelmaker
import AnaFunc

reload(dfmaker)
reload(modelmaker)
reload(AnaFunc)

import time

from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import re

import matplotlib.pyplot as plt
import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn.utils import shuffle

import pickle

import warnings
warnings.filterwarnings('ignore')

# Functions

## Utilities to Make Variables

In [124]:
def snipstr(x,mystr):
    """Remove every whitespace-delimited token of ``x`` that contains ``mystr``.

    Args:
        x: The text to clean.
        mystr: The fragment to look for. It is matched literally.

    Returns:
        ``x`` with each matching token replaced by the empty string. The
        whitespace that surrounded a removed token is left in place.
    """
    # re.escape keeps characters such as "." or "+" in mystr from being
    # interpreted as regex syntax; the raw string keeps "\S" from being
    # parsed as a string escape.
    pattern = r"\S*%s\S*" % re.escape(mystr)
    return re.sub(pattern, "", x)

def remove_word(df,mylist):
    """Strip tokens containing any of the given fragments from the text columns.

    For every entry of ``mylist``, each value of the ``taglist`` and ``words``
    columns of ``df`` is passed through ``snipstr``. The columns are
    reassigned on ``df`` itself, and that same frame is returned.
    """
    for fragment in mylist:
        for column in ('taglist', 'words'):
            # Series.apply forwards ``args`` as extra positional arguments,
            # so each cell is processed as snipstr(cell, fragment).
            df[column] = df[column].apply(snipstr, args=(fragment,))
    return df

In [125]:
def MakeNorms(df,v1,v2):
    """Return the per-row norms of the two matrices built for ``df``.

    ``modelmaker.retrieve_vocab(v1, v2, df)`` supplies two row-iterable
    matrices (its third return value is ignored here). Each row is reduced
    with ``np.linalg.norm`` using its default settings.

    Returns:
        A pair of 1-D float arrays, one norm per row of the first and of the
        second matrix respectively.
    """
    first, second, _ = modelmaker.retrieve_vocab(v1, v2, df)

    first_norms = np.array([np.linalg.norm(row) for row in first], dtype=float)
    second_norms = np.array([np.linalg.norm(row) for row in second], dtype=float)

    return first_norms, second_norms

In [126]:
def MakeFeatures(df,v1,v2,downsample=False,random_state=20):
    """Build the two-column norm feature matrix (and labels) for ``df``.

    Args:
        df: Frame with an ``evtclass`` column holding 0/1 class labels.
        v1, v2: Forwarded unchanged to ``modelmaker.retrieve_vocab``.
        downsample: When true, class 0 is randomly reduced to at most the
            size of class 1 before the features are built.
        random_state: Seed used for both the class-0 sample and the
            subsequent shuffle, so a downsampled run is reproducible.

    Returns:
        ``(x, y)`` where ``x`` has one row per sample and two columns (the
        norm of the row in each matrix returned by ``retrieve_vocab``) and
        ``y`` is the third value returned by ``retrieve_vocab``.
    """
    if downsample:
        df_class1 = df[df["evtclass"]==1]
        df_class0 = df[df["evtclass"]==0]

        # Never ask for more rows than class 0 actually has; DataFrame.sample
        # raises in that case when sampling without replacement.
        n_keep = min(df_class1.shape[0], df_class0.shape[0])
        df_class0 = df_class0.sample(n_keep, random_state=random_state)
        df = pd.concat([df_class1,df_class0],ignore_index=True)
        df = shuffle(df,random_state=random_state)

    x1,x2,y = modelmaker.retrieve_vocab(v1,v2,df)

    # Each norm vector is sized by its own matrix, so a row-count mismatch
    # between x1 and x2 surfaces in hstack instead of being zero-padded.
    norm1 = np.array([np.linalg.norm(row) for row in x1], dtype=float)
    norm2 = np.array([np.linalg.norm(row) for row in x2], dtype=float)

    x = np.hstack([norm1.reshape(-1, 1), norm2.reshape(-1, 1)])

    return x,y

## Drawing and Analysis Functions

In [127]:
def DrawROCandThreshLog(y_test,result_prob,title):
    """Draw the ROC curve and the TPR/FPR-versus-threshold curves.

    Args:
        y_test: True labels; label 1 is treated as the positive class.
        result_prob: Score for each sample, in the same order as ``y_test``.
        title: Text appended to the title of both figures.

    Two figures are shown with ``plt.show()``; nothing is returned.
    """
    # NOTE: these assignments change matplotlib's global defaults, so they
    # also affect figures drawn after this function returns.
    plt.rcParams['figure.figsize'] = (6.0, 6.0)
    plt.rcParams['font.size'] =  10

    fpr, tpr, thresh = metrics.roc_curve(y_test, result_prob, pos_label=1)

    # roc_curve already returns fpr in non-decreasing order, so auc needs no
    # reordering (the old ``reorder`` flag is deprecated in scikit-learn).
    roc_auc = metrics.auc(fpr, tpr)

    # Figure 1: ROC curve with the chance diagonal for reference.
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title("ROC Curve: %s" % title)
    plt.legend(loc="lower right")
    plt.show()

    # Figure 2: both rates as a function of the cut threshold.
    plt.figure()
    plt.plot(thresh, tpr, label='True Positive Rate')
    plt.plot(thresh, fpr, label='False Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Threshold')
    plt.ylabel('Rate')
    plt.title('Cut Threshold versus Selection Rate: %s' % title)
    plt.legend(loc="upper right")
    plt.show()

In [128]:
def ReturnROC(y_test,result_prob,tval,unmarked=False):
    """Return the ROC AUC and the first operating point below a cut value.

    Args:
        y_test: True labels; label 1 is treated as the positive class.
        result_prob: Score for each sample, in the same order as ``y_test``.
        tval: Cut value on the score.
        unmarked: Accepted for interface compatibility; currently unused.

    Returns:
        ``(roc_auc, threshold, tpr, fpr)`` taken at the first ROC threshold
        that is strictly below ``tval``. If no threshold is below ``tval``,
        the last ROC point is returned instead of ``None`` so that callers
        can always unpack four values.
    """
    fpr, tpr, thresh = metrics.roc_curve(y_test, result_prob, pos_label=1)

    # roc_curve already returns fpr in non-decreasing order, so auc needs no
    # reordering (the old ``reorder`` flag is deprecated in scikit-learn).
    roc_auc = metrics.auc(fpr, tpr)

    # Thresholds are scanned in the order roc_curve reports them (documented
    # as decreasing), so the first hit is the point just below the cut.
    idx = next((i for i, t in enumerate(thresh) if t < tval), len(thresh) - 1)

    return roc_auc, thresh[idx], tpr[idx], fpr[idx]

## Training and Testing Functions

In [129]:
def TrainLog(df_train,v1,v2):
    """Fit a logistic regression (C=1) on the norm features of ``df_train``.

    The features come from ``MakeFeatures`` with downsampling switched on.
    The fitted estimator is returned.
    """
    features, labels = MakeFeatures(df_train, v1, v2, downsample=True)

    classifier = linear_model.LogisticRegression(C=1)
    classifier.fit(features, labels)

    return classifier

In [130]:
def TestLog(logreg,df_test,v1,v2):
    """Score ``df_test`` with a fitted classifier.

    Features are built by ``MakeFeatures`` without downsampling.

    Returns:
        ``(results, Y_test)``: column 1 of ``logreg.predict_proba`` for every
        test row, and the labels reshaped into a single column.
    """
    features, labels = MakeFeatures(df_test, v1, v2, downsample=False)

    positive_prob = logreg.predict_proba(features)[:, 1]

    n_rows = len(labels)
    label_column = labels.reshape(n_rows, 1)

    return positive_prob, label_column
    

In [131]:
def TrainTestAna(df_train,df_test,v1,v2,tval,unmarked):
    """Train on ``df_train``, score ``df_test`` and summarise the ROC.

    Chains ``TrainLog``, ``TestLog`` and ``ReturnROC`` and passes the
    four values produced by ``ReturnROC`` straight back to the caller.
    """
    classifier = TrainLog(df_train, v1, v2)
    scores, labels = TestLog(classifier, df_test, v1, v2)
    # DrawROCandThreshLog(labels, scores, "") can be called here to inspect the curves.
    auc_value, cut, true_pos_rate, false_pos_rate = ReturnROC(labels, scores, tval, unmarked)
    return auc_value, cut, true_pos_rate, false_pos_rate

## High Level Analysis Macros

In [132]:
def RegisterToFile(tag,nolist,v1,v2,model,myconfig,cutval,pddf,
                   path="/Users/ruthtoner/CodingMacros/ProjectInsight/Fanguard/files"):
    """Train the final prefilter for ``tag``, pickle it and record it in ``pddf``.

    Args:
        tag: Tag the filter is built for; also used in the output file names.
        nolist: Forwarded to ``dfmaker.get_nolist_dfs``.
        v1, v2: The two vectorizers handed to ``modelmaker.train_vocab``.
        model: Currently unused; ``TrainLog`` builds its own classifier.
        myconfig: Forwarded to ``dfmaker.get_nolist_dfs``.
        cutval: Value stored in the ``cut_pre`` column of the new row.
        pddf: Bookkeeping frame that receives one new row.
        path: Directory the three pickle files are written to.

    Returns:
        A new frame: ``pddf`` plus one row holding the tag, the three pickle
        file names and ``cutval``.
    """
    #Get DF with Tag + NonTag
    df_train = dfmaker.get_nolist_dfs(tag,nolist,myconfig,True)
    print(df_train['evtclass'].value_counts())

    #Train Vocab based only on Tag.
    # Use the vectorizers that were passed in rather than whatever globals
    # named vect_text / vect_tag happen to exist in the notebook.
    tagged_only = df_train[df_train["evtclass"]==1]
    _,_,_,v1,v2 = modelmaker.train_vocab(v1,v2,tagged_only,downsample=False)

    #Train a logistic regression
    logreg = TrainLog(df_train,v1,v2)

    #Some file information:
    fmname = "pfilter_%s_model.pkl" % tag
    v1name = "pfilter_%s_v1.pkl" % tag
    v2name = "pfilter_%s_v2.pkl" % tag

    data = {"name":tag,"mp":fmname,
            "v1p":v1name, "v2p":v2name,
            "cut_pre":cutval}

    #Add a new row to mysql frame.
    # NOTE: DataFrame.append no longer exists in pandas >= 2.0; it is kept
    # here because it preserves the existing column order of pddf.
    pddf = pddf.append(data,ignore_index=True)

    #Dump the files:
    for fname, obj in ((fmname, logreg), (v1name, v1), (v2name, v2)):
        with open('%s/%s' % (path,fname), 'wb') as fid:
            pickle.dump(obj, fid)

    return pddf

In [133]:
def TestPrefilter(tag,nolist,myconfig,vect_text,vect_tag,tval,unmarked=False,wlist=None):
    """Evaluate the prefilter for ``tag`` once per entry of ``nolist``.

    For each entry, a train/test split is requested from
    ``dfmaker.GenerateTestTrainFront`` (called with the 1-based position of
    the entry), the vocabulary is trained on the ``evtclass == 1`` training
    rows, and ``TrainTestAna`` reports AUC, threshold, TPR and FPR. Per-split
    results and their averages are printed; nothing is returned.

    Args:
        tag: Tag the filter is built for.
        nolist: Tags to test against, one split per entry.
        myconfig: Forwarded to ``dfmaker.get_nolist_dfs``.
        vect_text, vect_tag: Vectorizers handed to ``modelmaker.train_vocab``.
        tval: Cut value forwarded to ``TrainTestAna``.
        unmarked: When true, tokens containing any entry of ``wlist`` are
            removed from the test rows before scoring.
        wlist: Fragments to remove when ``unmarked`` is true (default: none).
    """
    # None instead of a shared mutable [] default.
    if wlist is None:
        wlist = []

    df_total = dfmaker.get_nolist_dfs(tag,nolist,myconfig)

    n_splits = len(nolist)
    lroc=np.zeros(n_splits)
    lthr=np.zeros(n_splits)
    ltpr=np.zeros(n_splits)
    lfpr=np.zeros(n_splits)

    for i,noltag in enumerate(nolist):
        print("%d : Testing %s against others, for Filter=%s" % (i+1,noltag,tag))

        df_train,df_test = dfmaker.GenerateTestTrainFront(df_total,i+1)

        if unmarked:
            # remove_word reassigns columns on the frame it is given; work on
            # a copy so the frame returned by the splitter is left untouched.
            df_test = remove_word(df_test.copy(),wlist)

        _,_,_,v1,v2 = modelmaker.train_vocab(vect_text,vect_tag,
                                             df_train[df_train["evtclass"]==1],downsample=False)
        roc,tbest,tpr,fpr = TrainTestAna(df_train,df_test,v1,v2,tval,unmarked)
        lroc[i]=roc
        ltpr[i]=tpr
        lfpr[i]=fpr
        lthr[i]=tbest

        print("ROC=%f, T=%f, TPR=%f, FPR=%f" % (roc,tbest,tpr,fpr))

    print("At cut > %f, average ROC=%f, TPR=%f, FPR=%f, best T=%f" % (
        tval,lroc.mean(),ltpr.mean(),lfpr.mean(),lthr.mean()))

# Testing the PreFilters and Making Official Versions

## Models, Vocab, Config

In [134]:
# Configuration file path handed to the dfmaker helpers.
myconfig = "/Users/ruthtoner/CodingMacros/ProjectInsight/myconfigs.cfg"

# Vectorizer for the text: word counts, vocabulary capped at 50 entries.
vect_text = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=50,
    max_df=1.0,
)

# Vectorizer for the tags: same settings, but binary (presence/absence) counts.
vect_tag = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=50,
    max_df=1.0,
    binary=True,
)

# Passed to RegisterToFile as its `model` argument.
model_logreg = linear_model.LogisticRegression(C=1)

# Bookkeeping frame: one row per registered filter.
df_mysql = pd.DataFrame(columns=['name', 'mp', 'v1p', 'v2p', 'cut_pre'])

## Star Wars

In [135]:
# Tags of the other franchises, used as `nolist` for the "sw" filter.
nolist = ['dai', 'utale', 'lis', 'slock', 'got', 'han', 'aou', 'madmax']
print("Marked:")
TestPrefilter("sw", nolist, myconfig, vect_text, vect_tag, tval=0.5)

Marked:
1 : Testing dai against others, for Filter=sw
ROC=0.989479, T=0.431870, TPR=1.000000, FPR=0.060864
2 : Testing utale against others, for Filter=sw
ROC=0.986486, T=0.351470, TPR=1.000000, FPR=0.090743
3 : Testing lis against others, for Filter=sw
ROC=0.985752, T=0.495089, TPR=1.000000, FPR=0.084400
4 : Testing slock against others, for Filter=sw
ROC=0.991929, T=0.466434, TPR=1.000000, FPR=0.046570
5 : Testing got against others, for Filter=sw
ROC=0.993621, T=0.338739, TPR=1.000000, FPR=0.041411
6 : Testing han against others, for Filter=sw
ROC=0.992629, T=0.320342, TPR=1.000000, FPR=0.046668
7 : Testing aou against others, for Filter=sw
ROC=0.985266, T=0.493933, TPR=1.000000, FPR=0.071465
8 : Testing madmax against others, for Filter=sw
ROC=0.978470, T=0.419773, TPR=1.000000, FPR=0.112081
At cut > 0.500000, average ROC=0.987954, TPR=1.000000, FPR=0.069275, best T=0.414706


In [136]:
# Same test, but tokens containing 'star' or 'war' are removed from the
# test rows before scoring.
print("Unmarked:")
TestPrefilter("sw", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['star', 'war'])

Unmarked:
1 : Testing dai against others, for Filter=sw
ROC=0.822262, T=0.496080, TPR=0.597405, FPR=0.060280
2 : Testing utale against others, for Filter=sw
ROC=0.800174, T=0.477701, TPR=0.597284, FPR=0.089524
3 : Testing lis against others, for Filter=sw
ROC=0.797683, T=0.499808, TPR=0.597647, FPR=0.083669
4 : Testing slock against others, for Filter=sw
ROC=0.841892, T=0.389955, TPR=0.597223, FPR=0.045146
5 : Testing got against others, for Filter=sw
ROC=0.852409, T=0.477548, TPR=0.597647, FPR=0.039746
6 : Testing han against others, for Filter=sw
ROC=0.850696, T=0.497567, TPR=0.597829, FPR=0.046171
7 : Testing aou against others, for Filter=sw
ROC=0.801652, T=0.476869, TPR=0.598011, FPR=0.068536
8 : Testing madmax against others, for Filter=sw
ROC=0.782980, T=0.483601, TPR=0.597466, FPR=0.101342
At cut > 0.500000, average ROC=0.818718, TPR=0.597564, FPR=0.066802, best T=0.474891


In [116]:
# Train and pickle the final "sw" filter; 0.7 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("sw", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.7, pddf=df_mysql)

0    219412
1     41231
Name: evtclass, dtype: int64


## Dragon Age

In [137]:
# Tags of the other franchises, used as `nolist` for the "dai" filter.
nolist = ['sw', 'utale', 'lis', 'slock', 'got', 'han', 'aou', 'madmax']
print("Marked:")
TestPrefilter("dai", nolist, myconfig, vect_text, vect_tag, tval=0.5)

Marked:
1 : Testing sw against others, for Filter=dai
ROC=0.969500, T=0.498460, TPR=0.856345, FPR=0.077061
2 : Testing utale against others, for Filter=dai
ROC=0.956763, T=0.497676, TPR=0.856345, FPR=0.114925
3 : Testing lis against others, for Filter=dai
ROC=0.960260, T=0.497678, TPR=0.856345, FPR=0.105461
4 : Testing slock against others, for Filter=dai
ROC=0.980135, T=0.489899, TPR=0.856431, FPR=0.049368
5 : Testing got against others, for Filter=dai
ROC=0.911657, T=0.485648, TPR=0.856345, FPR=0.183141
6 : Testing han against others, for Filter=dai
ROC=0.978294, T=0.488484, TPR=0.856345, FPR=0.056740
7 : Testing aou against others, for Filter=dai
ROC=0.876445, T=0.497306, TPR=0.856860, FPR=0.283664
8 : Testing madmax against others, for Filter=dai
ROC=0.965748, T=0.431988, TPR=0.856345, FPR=0.089262
At cut > 0.500000, average ROC=0.949850, TPR=0.856420, FPR=0.119953, best T=0.485892


In [138]:
# Same test, but tokens containing 'dai' are removed from the test rows
# before scoring.
print("Unmarked:")
TestPrefilter("dai", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['dai'])

Unmarked:
1 : Testing sw against others, for Filter=dai
ROC=0.889209, T=0.497341, TPR=0.741489, FPR=0.077037
2 : Testing utale against others, for Filter=dai
ROC=0.868702, T=0.491760, TPR=0.741403, FPR=0.115077
3 : Testing lis against others, for Filter=dai
ROC=0.875915, T=0.480862, TPR=0.741403, FPR=0.104939
4 : Testing slock against others, for Filter=dai
ROC=0.909811, T=0.490664, TPR=0.741403, FPR=0.049114
5 : Testing got against others, for Filter=dai
ROC=0.780301, T=0.484461, TPR=0.741403, FPR=0.183141
6 : Testing han against others, for Filter=dai
ROC=0.910597, T=0.495157, TPR=0.741403, FPR=0.056740
7 : Testing aou against others, for Filter=dai
ROC=0.748248, T=0.499837, TPR=0.741747, FPR=0.284543
8 : Testing madmax against others, for Filter=dai
ROC=0.881292, T=0.485116, TPR=0.741575, FPR=0.089262
At cut > 0.500000, average ROC=0.858009, TPR=0.741478, FPR=0.119981, best T=0.490650


In [119]:
# Train and pickle the final "dai" filter; 0.24 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("dai", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.24, pddf=df_mysql)

0    231519
1     29079
Name: evtclass, dtype: int64


## Age of Ultron

In [139]:
# Tags of the other franchises, used as `nolist` for the "aou" filter.
nolist = ['sw', 'utale', 'lis', 'slock', 'got', 'han', 'dai', 'madmax']
print("Marked:")
TestPrefilter("aou", nolist, myconfig, vect_text, vect_tag, tval=0.5)

Marked:
1 : Testing sw against others, for Filter=aou
ROC=0.995693, T=0.498882, TPR=1.000000, FPR=0.047198
2 : Testing utale against others, for Filter=aou
ROC=0.996160, T=0.497167, TPR=1.000000, FPR=0.055765
3 : Testing lis against others, for Filter=aou
ROC=0.997123, T=0.494898, TPR=1.000000, FPR=0.036791
4 : Testing slock against others, for Filter=aou
ROC=0.998222, T=0.487643, TPR=1.000000, FPR=0.019610
5 : Testing got against others, for Filter=aou
ROC=0.997940, T=0.469114, TPR=1.000000, FPR=0.029318
6 : Testing han against others, for Filter=aou
ROC=0.998238, T=0.394382, TPR=1.000000, FPR=0.023188
7 : Testing dai against others, for Filter=aou
ROC=0.989430, T=0.491595, TPR=1.000000, FPR=0.149720
8 : Testing madmax against others, for Filter=aou
ROC=0.992592, T=0.389128, TPR=1.000000, FPR=0.075168
At cut > 0.500000, average ROC=0.995675, TPR=1.000000, FPR=0.054595, best T=0.465351


In [None]:
# Same test with the title words removed from the test rows.
# NOTE(review): snipstr drops every token that merely contains a fragment,
# so 'of' and 'age' also remove words such as "often" or "page".
print("Unmarked:")
TestPrefilter("aou", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['age', 'of', 'ultron'])

Unmarked:
1 : Testing sw against others, for Filter=aou
ROC=0.902741, T=0.480090, TPR=0.633169, FPR=0.046324
2 : Testing utale against others, for Filter=aou
ROC=0.895200, T=0.495530, TPR=0.632802, FPR=0.055656
3 : Testing lis against others, for Filter=aou
ROC=0.914743, T=0.498141, TPR=0.633169, FPR=0.036112
4 : Testing slock against others, for Filter=aou
ROC=0.933463, T=0.495556, TPR=0.633718, FPR=0.019686
5 : Testing got against others, for Filter=aou
ROC=0.921458, T=0.426630, TPR=0.632619, FPR=0.029045
6 : Testing han against others, for Filter=aou
ROC=0.934962, T=0.430155, TPR=0.633352, FPR=0.022836
7 : Testing dai against others, for Filter=aou
ROC=0.913255, T=0.494919, TPR=0.633901, FPR=0.038135
8 : Testing madmax against others, for Filter=aou
ROC=0.880849, T=0.498341, TPR=0.633718, FPR=0.069799
At cut > 0.500000, average ROC=0.912084, TPR=0.633306, FPR=0.039699, best T=0.477420


In [None]:
# Train and pickle the final "aou" filter; 0.2 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("aou", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.2, pddf=df_mysql)

## Mad Max

In [None]:
# Tags of the other franchises, used as `nolist` for the "madmax" filter.
nolist = ['sw', 'utale', 'lis', 'slock', 'got', 'han', 'dai', 'aou']
print("Marked:")
TestPrefilter("madmax", nolist, myconfig, vect_text, vect_tag, tval=0.5)
# Second pass: tokens containing 'mad' or 'max' are removed from the test rows.
print("Unmarked:")
TestPrefilter("madmax", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['mad', 'max'])

Marked:
1 : Testing sw against others, for Filter=madmax
ROC=0.944746, T=0.499243, TPR=1.000000, FPR=0.335996

In [None]:
# Train and pickle the final "madmax" filter; 0.2 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("madmax", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.2, pddf=df_mysql)

## Undertale

In [None]:
# Tags of the other franchises, used as `nolist` for the "utale" filter.
nolist = ['sw', 'madmax', 'lis', 'slock', 'got', 'han', 'dai', 'aou']
print("Marked:")
TestPrefilter("utale", nolist, myconfig, vect_text, vect_tag, tval=0.5)
# Second pass: tokens containing 'undertale' are removed from the test rows.
print("Unmarked:")
TestPrefilter("utale", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['undertale'])

In [None]:
# Train and pickle the final "utale" filter; 0.2 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("utale", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.2, pddf=df_mysql)

## Sherlock

In [None]:
# Tags of the other franchises, used as `nolist` for the "slock" filter.
nolist = ['sw', 'madmax', 'lis', 'utale', 'got', 'han', 'dai', 'aou']
print("Marked:")
TestPrefilter("slock", nolist, myconfig, vect_text, vect_tag, tval=0.5)
# Second pass: tokens containing 'sherlock' are removed from the test rows.
print("Unmarked:")
TestPrefilter("slock", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['sherlock'])

In [None]:
# Train and pickle the final "slock" filter; 0.2 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("slock", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.2, pddf=df_mysql)

## Life is Strange

In [None]:
# Tags of the other franchises, used as `nolist` for the "lis" filter.
nolist = ['sw', 'madmax', 'slock', 'utale', 'got', 'han', 'dai', 'aou']
print("Marked:")
TestPrefilter("lis", nolist, myconfig, vect_text, vect_tag, tval=0.5)
# Second pass with the title words removed from the test rows.
# NOTE(review): snipstr drops every token that merely contains a fragment,
# so 'is' also removes words such as "this" or "list".
print("Unmarked:")
TestPrefilter("lis", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['life', 'is', 'strange'])

In [None]:
# Train and pickle the final "lis" filter; 0.2 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("lis", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.2, pddf=df_mysql)

## Game of Thrones

In [None]:
# Tags of the other franchises, used as `nolist` for the "got" filter.
nolist = ['sw', 'madmax', 'slock', 'utale', 'lis', 'han', 'dai', 'aou']
print("Marked:")
TestPrefilter("got", nolist, myconfig, vect_text, vect_tag, tval=0.5)
# Second pass with the title words removed from the test rows.
# NOTE(review): snipstr drops every token that merely contains a fragment,
# so 'of' also removes words such as "often" or "off".
print("Unmarked:")
TestPrefilter("got", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['game', 'of', 'thrones'])

In [None]:
# Train and pickle the final "got" filter; 0.2 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("got", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.2, pddf=df_mysql)

## Hannibal

In [None]:
# Tags of the other franchises, used as `nolist` for the "han" filter.
nolist = ['sw', 'madmax', 'slock', 'utale', 'lis', 'got', 'dai', 'aou']
print("Marked:")
TestPrefilter("han", nolist, myconfig, vect_text, vect_tag, tval=0.5)
# Second pass: tokens containing 'hannibal' are removed from the test rows.
print("Unmarked:")
TestPrefilter("han", nolist, myconfig, vect_text, vect_tag, tval=0.5,
              unmarked=True, wlist=['hannibal'])

In [None]:
# Train and pickle the final "han" filter; 0.2 is recorded as its 'cut_pre'.
# Re-running this cell appends another row to df_mysql.
df_mysql = RegisterToFile("han", nolist, vect_text, vect_tag, model_logreg,
                          myconfig, cutval=0.2, pddf=df_mysql)