In [7]:
import ClassiferHelperAPI as CH
import importlib
import numpy as np
import pandas as pd
# Re-import the helper module so that edits made to it after the kernel
# started are picked up without restarting the notebook.
importlib.reload(CH)
from ast import literal_eval
import plotly.plotly as py
import htmltag as HT
import cufflinks as cf # this is necessary to link pandas to plotly
# NOTE(review): presumably switches cufflinks to the hosted (online) Plot.ly
# mode, which is why the plotting cell further down counts against the
# account's daily plot quota - confirm against the cufflinks docs.
cf.go_online()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

### Training the classifier and computing the performance metrics

In [17]:
# Switch: when True, INDIVIDUAL_NAME and CONTRIBUTOR are added to the
# feature list handed to the classifiers; when False they are left out.
hasSparse = False

# The master data file is the same for both settings of the switch.
data = CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")

# Build the feature list; 'tags' always comes last, and the two extra
# attributes (if enabled) sit between VIEW_POINT and tags.
ftrList = ['SPECIES','SEX','AGE','QUALITY','VIEW_POINT']
if hasSparse:
    ftrList += ['INDIVIDUAL_NAME','CONTRIBUTOR']
ftrList.append('tags')

# Attribute header derived from the data for the chosen features.
allAttribs = CH.genAttribsHead(data, ftrList)

In [18]:
# Classifier families to evaluate.
methods = ['logistic','svm','dtree','random_forests']

# One trained classifier object per (method, split value) pair, in
# method-major order: every split for 'logistic' first, then 'svm', etc.
classifiers = []
for methodName in methods:
    for splitValue in np.arange(0.1,1,0.1):
        # NOTE(review): 80 and True are forwarded positionally; their meaning
        # is defined by CH.buildBinClassifier, which is not visible here.
        trained = CH.buildBinClassifier(data, allAttribs, splitValue, 80, methodName, hasSparse, True)
        trained.runClf()
        classifiers.append(trained)



In [19]:
# Each classifier's string form is parsed back into a Python object with
# literal_eval and wrapped in dict(); one dict of metrics per classifier.
printableClfs = [dict(literal_eval(str(clf))) for clf in classifiers]

# (key in the parsed dict, column label used in the report), in output order.
metricLabels = [
    ('methodName', 'Classifier'),
    ('splitPercent', 'Train-Test Split'),
    ('accScore', 'Accuracy'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
    ('f1Score', 'F1 score'),
    ('auc', 'AUC'),
    ('sqerr', 'Squared Error'),
]

df = pd.DataFrame(printableClfs)
# Keep only the reported metrics, in the order above, then relabel them.
df = df[[key for key, label in metricLabels]]
df.columns = [label for key, label in metricLabels]
df.to_csv("../ClassifierResults/extrmClfMetrics_NoSparse.csv", index=False)

In [21]:
# CAUTION: every iplot call below publishes a figure to Plot.ly and counts
# against the account's daily quota (limited to 50 plots per day).
# The `filename` identifies the published plot, so change it deliberately.
metricRows = ['Accuracy','Precision','Recall','F1 score','AUC','Squared Error']

iFrameBlock = []
for i in np.arange(0.1,1,0.1):
    # Select this split's rows with a tolerance rather than `==`: the stored
    # split values were round-tripped through str()/literal_eval, so an exact
    # float match against np.arange's output can silently select nothing.
    # Neighbouring splits are 0.1 apart, so the default tolerance cannot
    # pull in rows from a different split.
    splitMask = np.isclose(df['Train-Test Split'].astype(float), i)
    df1 = df[splitMask]
    # Classifiers become the bar groups, metrics the x-axis categories.
    df1 = df1.set_index('Classifier')[metricRows].transpose()
    fig = df1.iplot(kind='bar',filename=str('Train-Test_Split_Ratio_NoSparse %f' %i),title=str('Train-Test Split Ratio: %f' %i))
    iFrameBlock.append(fig.embed_code)

# Collect the embed snippets of all published plots into one HTML report.
with open("../ClassifierResults/performanceComparisonsNoSparse.html","w") as perf:
    perf.write(HT.h1("Performance Comparisons of Classifiers without Sparse Attributes."))
    for row in iFrameBlock:
        perf.write(HT.HTML(row))

### Calculating weights of features in the classifiers

In [None]:
# One dict per classifier that exposes per-feature weights:
#   feature name -> weight, plus 'Method' and 'Split_Percent' bookkeeping keys.
clfWeights = []
for clf in classifiers:
    featureNames = list(clf.test_x.columns)
    fittedName = clf.methodName

    if fittedName == 'logistic':
        # First row of the fitted model's coefficient matrix.
        featureWgts = list(clf.clfObj.coef_[0])
    elif fittedName in ('dtree', 'random_forests'):
        featureWgts = list(clf.clfObj.feature_importances_)
    else:
        # Any other method (e.g. 'svm') contributes no weight row.
        continue

    # Pair names with weights by position; a shorter weight list still
    # raises IndexError, exactly as positional indexing would.
    weightRow = {name: featureWgts[pos] for pos, name in enumerate(featureNames)}
    weightRow['Method'] = fittedName
    weightRow['Split_Percent'] = clf.splitPercent

    clfWeights.append(weightRow)

In [None]:
# Turn the list of per-classifier weight dicts into a table: one row per
# dict, one column per key (feature names plus 'Method' and 'Split_Percent').
clfDf = pd.DataFrame(clfWeights)
# Bare expression so the notebook renders the frame as this cell's output.
clfDf

In [None]:
# Write one CSV of feature weights per classifier family. Each file is the
# transposed slice of clfDf for that method: rows are features (plus the
# retained 'Split_Percent' row), columns are the split values.
#
# This replaces three copy-pasted blocks that called
# `drop('Method', 1, inplace=True)` on a filtered slice. The positional axis
# argument is rejected by pandas >= 2.0, and the in-place drop on a slice
# triggers SettingWithCopyWarning. File names are kept exactly as before
# (including the 'Logisitic' spelling) so existing consumers keep working.
weightExports = [
    ('logistic', "../ClassifierResults/LogisiticWeights.csv"),
    ('dtree', "../ClassifierResults/DecisionTreeWeights.csv"),
    ('random_forests', "../ClassifierResults/RandomForestsWeights.csv"),
]

for exportMethod, csvPath in weightExports:
    indDF = (
        clfDf[clfDf['Method'] == exportMethod]
        # drop=False keeps 'Split_Percent' as a column too, matching the
        # original `indDF.index = indDF['Split_Percent']` assignment.
        .set_index('Split_Percent', drop=False)
        .drop('Method', axis=1)
    )
    indDF.transpose().to_csv(csvPath)

In [None]:
# Display `fig`, which the plotting loop rebinds on every iteration, so this
# shows the figure from its final iteration (requires that cell to have run).
fig