### Notebook name: ImageShareabilityClassifiers.ipynb
#### Author: Sreejith Menon (smenon8@uic.edu)

### General Description:
Multiple features are extracted per image.    
The features fall broadly into the following groups:
* Biological features like age, species, sex
* Ecological features like yaw, view_point
* Image EXIF/Quality data: unixtime, latitude, longitude, quality
* Tags generated by Microsoft Image tagging API
* Image Contributor - Sparse attribute
* Individual animals (NID)

Based on these features, multiple classification algorithms are implemented and their metrics are evaluated. The aim of the classification algorithms is to predict, given the features, whether a certain image will be shared or not shared on a social media platform.    
The ClassifierHelperAPI has *off-the-shelf* implementations from the `scikit-learn` library and uses a Classifier Object to store the metrics of each classifier.    
The performance metrics evaluated are:
* Accuracy - Number of correct predictions in the test data
* Precision 
* Recall
* F1 score
* Absolute Error
* AUC
* Squared Error - Not displayed currently
* Zero One Hinge Loss - Not displayed currently

In [60]:
import ClassiferHelperAPI as CH
import importlib
import numpy as np
import pandas as pd
importlib.reload(CH)
from ast import literal_eval
import plotly.plotly as py
import htmltag as HT
import cufflinks as cf # this is necessary to link pandas to plotly
cf.go_online()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from collections import Counter
import csv
import plotly.graph_objs as go

### Building data for the Classifier
* Discretizing non-binary data using the bag-of-words model
* Building and running the classifier for all train-test splits, from 10% up to 90%
* Computing the performance metrics for each classifier.

In [None]:
# Generate `allAttribs` in "sparse" mode and load the master data; both calls
# read the same results CSV.
allAttribs = CH.genAllAttribs(
    "../FinalResults/ImgShrRnkListWithTags.csv",
    "sparse",
    "../data/infoGains.csv",
)
data = CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")

In [None]:
# Build one classifier object per (method, split) pair and run it.
# The helper emits custom warnings and sets scores to 0 when a classifier
# produces no valid predictions.

# Full list of method names; it is immediately replaced by the override below.
methods = ['logistic','svm','dtree','random_forests','ada_boost']
# Override: only AdaBoost is actually run by this cell.
# NOTE(review): the weight-table cells further down label nine split columns
# ('10%'..'90%') and filter on 'logistic'/'dtree'/'random_forests'; with this
# override and the single split below they presumably get no rows -- confirm
# before running them.
methods = ['ada_boost']

classifiers = []
for method in methods:
    # np.arange(0.4,0.5,0.1) yields the single value 0.4, so the split argument
    # passed to the helper is 1-0.4.
    for heldOut in np.arange(0.4,0.5,0.1):
        builtClf = CH.buildBinClassifier(data,allAttribs,1-heldOut,80,method)
        builtClf.runClf()
        classifiers.append(builtClf)

In [None]:
# Collect every classifier's scores in a pandas data-frame (optionally a CSV).
# Each classifier's string form is parsed back into a dict of its scores.
printableClfs = [dict(literal_eval(str(clf))) for clf in classifiers]

scoreKeys = ['methodName','splitPercent','accScore','precision','recall','f1Score','auc','sqerr']
df = pd.DataFrame(printableClfs)[scoreKeys]
# Human-readable headers, in the same order as scoreKeys.
df.columns = ['Classifier','Train-Test Split','Accuracy','Precision','Recall','F1 score','AUC','Squared Error']
# df.to_csv("../ClassifierResults/extrmClfMetrics_abv_mean.csv",index=False)

In [None]:
# Every iplot call consumes one of the limited Plot.ly plots (50 per day).
# The filename carries the split value, so adjust it when a new chart is wanted.
iFrameBlock = []
metricCols = ['Accuracy','Precision','Recall','F1 score','AUC','Squared Error']
for i in np.arange(0.4,0.5,0.1):
    # Rows of the score table whose split equals 1-i.
    splitRows = df[df['Train-Test Split'] == 1-i]
    # After the transpose: one row per metric, one column per classifier.
    barData = splitRows.set_index(splitRows['Classifier'])[metricCols].transpose()
    barData.iplot(kind='bar',
                  filename='Train-Test_Split_Ratio_abv_mean %f' % i,
                  title='Train-Test Split Ratio: %f' % i)
    # iFrameBlock.append(fig.embed_code)

# with open("../ClassifierResults/performanceComparisonsparse.html","w") as perf:
#     perf.write(HT.h1("Performance Comparisons of Classifiers with non_sparse Attributes."))
#     for row in iFrameBlock:
#         perf.write(HT.HTML(row))

### Calculating weights of features in the classifiers

In [None]:
# One dict per classifier mapping feature name -> weight, plus the method name
# and split. Only logistic regression (coefficients) and the tree-based models
# (feature importances) are handled; every other method is skipped.
clfWeights = []
for clf in classifiers:
    featureNames = list(clf.test_x.columns)
    method = clf.methodName
    if method == 'logistic':
        weights = list(clf.clfObj.coef_[0])
    elif method in ('dtree', 'random_forests'):
        weights = list(clf.clfObj.feature_importances_)
    else:
        continue

    row = {feature: weights[pos] for pos, feature in enumerate(featureNames)}
    row['Method'] = clf.methodName
    row['Split_Percent'] = clf.splitPercent

    clfWeights.append(row)

In [None]:
# One row per classifier in clfWeights; columns are the union of the dict keys
# (feature names plus 'Method' and 'Split_Percent').
clfDf = pd.DataFrame(clfWeights)

In [None]:
# Write one weights CSV per method: rows are features, columns are splits.
#
# Changes from the previous copy-pasted version:
#   * `drop('Method', 1, inplace=True)` passed `axis` positionally, which newer
#     pandas releases reject, and mutated a filtered slice in place. The drop
#     now uses the `axis=1` keyword and returns a new frame.
#   * The three identical stanzas are folded into one loop over
#     (method name, output file).
weightFiles = [
    ('logistic', "../ClassifierResults/LogisiticWeights.csv"),
    ('dtree', "../ClassifierResults/DecisionTreeWeights.csv"),
    ('random_forests', "../ClassifierResults/RandomForestsWeights.csv"),
]
for weightMethod, weightPath in weightFiles:
    indDF = clfDf[clfDf['Method'] == weightMethod].drop('Method', axis=1)
    # Index by split; the 'Split_Percent' column itself is kept, as before.
    indDF.index = indDF['Split_Percent']
    indDF.transpose().to_csv(weightPath)

In [None]:
# Split the weight table by method and strip the two bookkeeping columns so
# only feature-weight columns remain.
bookkeepingCols = ['Method', 'Split_Percent']
logisticDf = clfDf[clfDf['Method'] == 'logistic'].drop(bookkeepingCols, axis=1)
dtreeDf = clfDf[clfDf['Method'] == 'dtree'].drop(bookkeepingCols, axis=1)
randomForestDf = clfDf[clfDf['Method'] == 'random_forests'].drop(bookkeepingCols, axis=1)

In [None]:
# Top-15 logistic-regression features per split, written side by side to a CSV.
#
# Fixes relative to the previous version:
#   * the per-split temporary was called `df`, silently overwriting the score
#     table `df` that the plotting cell reads; it now has its own name;
#   * the rank index was hard-coded to 1..15 and raised when fewer than 15
#     features existed; it is now sized from the rows actually selected;
#   * the concat takes the list directly instead of enumerating nine elements.
logisticDf = logisticDf.transpose()
logisticDf.reset_index(inplace=True)
# Labels assume exactly nine split columns in this order -- NOTE(review): confirm
# the order matches the order in which the classifiers were built.
logisticDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_logistic = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    logisticDf.sort_values(by=prcnt,inplace=True,ascending=False)
    topLogistic = logisticDf[['Feature',prcnt]].head(15)
    # Rank 1..k so the per-split tables line up row by row in the concat.
    topLogistic.index = np.arange(1,len(topLogistic)+1)

    dfs_logistic.append(topLogistic)

concatdf_logisitc = pd.concat(dfs_logistic,axis=1)
concatdf_logisitc.to_csv("../ClassifierResults/Top15_Weights_Logisitic.csv")

In [None]:
# Top-15 decision-tree features per split, written side by side to a CSV.
#
# Fixes relative to the previous version:
#   * the per-split temporary was called `df`, silently overwriting the score
#     table `df` that the plotting cell reads; it now has its own name;
#   * the rank index was hard-coded to 1..15 and raised when fewer than 15
#     features existed; it is now sized from the rows actually selected;
#   * the concat takes the list directly instead of enumerating nine elements.
dtreeDf = dtreeDf.transpose()
dtreeDf.reset_index(inplace=True)
# Labels assume exactly nine split columns in this order -- NOTE(review): confirm
# the order matches the order in which the classifiers were built.
dtreeDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_tree = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    dtreeDf.sort_values(by=prcnt,inplace=True,ascending=False)
    topTree = dtreeDf[['Feature',prcnt]].head(15)
    # Rank 1..k so the per-split tables line up row by row in the concat.
    topTree.index = np.arange(1,len(topTree)+1)

    dfs_tree.append(topTree)

concatdf_dtree = pd.concat(dfs_tree,axis=1)
concatdf_dtree.to_csv("../ClassifierResults/Top15_Weights_Dtree.csv")

In [None]:
# Top-15 random-forest features per split, written side by side to a CSV.
#
# Fixes relative to the previous version:
#   * the per-split temporary was called `df`, silently overwriting the score
#     table `df` that the plotting cell reads; it now has its own name;
#   * the rank index was hard-coded to 1..15 and raised when fewer than 15
#     features existed; it is now sized from the rows actually selected;
#   * the concat takes the list directly instead of enumerating nine elements.
randomForestDf = randomForestDf.transpose()
randomForestDf.reset_index(inplace=True)
# Labels assume exactly nine split columns in this order -- NOTE(review): confirm
# the order matches the order in which the classifiers were built.
randomForestDf.columns = ['Feature','10%','20%','30%','40%','50%','60%','70%','80%','90%']
dfs_rndf = []
for i in range(10,100,10):
    prcnt = str(i)+'%'
    randomForestDf.sort_values(by=prcnt,inplace=True,ascending=False)
    topForest = randomForestDf[['Feature',prcnt]].head(15)
    # Rank 1..k so the per-split tables line up row by row in the concat.
    topForest.index = np.arange(1,len(topForest)+1)

    dfs_rndf.append(topForest)

concatdf_rndf = pd.concat(dfs_rndf,axis=1)
concatdf_rndf.to_csv("../ClassifierResults/Top15_Weights_Rndf.csv")

In [None]:
# How often each feature appears in the logistic top-15 lists across the nine
# splits, most frequent first.
attribs = []
for idx in range(9):
    attribs.extend(dfs_logistic[idx]['Feature'])
(pd.DataFrame(Counter(attribs), index=['Frequency'])
   .transpose()
   .sort_values(by=['Frequency'], ascending=False))

In [None]:
# How often each feature appears in the decision-tree top-15 lists across the
# nine splits, most frequent first.
attribs = []
for idx in range(9):
    attribs.extend(dfs_tree[idx]['Feature'])
(pd.DataFrame(Counter(attribs), index=['Frequency'])
   .transpose()
   .sort_values(by=['Frequency'], ascending=False))

In [None]:
# How often each feature appears in the random-forest top-15 lists across the
# nine splits, most frequent first.
attribs = []
for idx in range(9):
    attribs.extend(dfs_rndf[idx]['Feature'])
(pd.DataFrame(Counter(attribs), index=['Frequency'])
   .transpose()
   .sort_values(by=['Frequency'], ascending=False))

In [None]:
# Feature frequency pooled over all three methods' top-15 lists (logistic,
# then decision tree, then random forest), most frequent first.
attribs = []
for topLists in (dfs_logistic, dfs_tree, dfs_rndf):
    for idx in range(9):
        attribs.extend(topLists[idx]['Feature'])
(pd.DataFrame(Counter(attribs), index=['Frequency'])
   .transpose()
   .sort_values(by=['Frequency'], ascending=False))

In [None]:
# Scatter of the 15 largest logistic weights in the '10%' column.
logisticDf.sort_values(by='10%',inplace=True,ascending=False)
fig = dict(
    data=[
        dict(
            x=logisticDf['Feature'].head(15),
            y=logisticDf['10%'].head(15),
            mode='markers',
            name='10%',
        ),
    ],
)
iplot(fig)

In [None]:
# NOTE(review): `obj1` is not assigned in any cell visible in this notebook, so
# this presumably relies on a stale kernel variable -- confirm or replace it.
obj1.precision

In [None]:
# Display the `preds` attribute of the first classifier in `classifiers`.
classifiers[0].preds

In [5]:
# Reload the master data and declare, per method name, the keyword arguments
# handed to the classifier builder.
data = CH.getMasterData("../FinalResults/ImgShrRnkListWithTags.csv")
methods = ['dummy','bayesian','logistic','svm','dtree','random_forests','ada_boost']
kwargsDict = dict(
    dummy=dict(strategy='most_frequent'),
    bayesian=dict(fit_prior=True),
    logistic=dict(penalty='l2'),
    svm=dict(kernel='rbf', probability=True),
    dtree=dict(criterion='entropy'),
    random_forests=dict(n_estimators=10),
    ada_boost=dict(n_estimators=50),
)

In [7]:
# Single decision-tree run in 'non_sparse' attribute mode, using the dtree
# keyword arguments from kwargsDict.
allAttribs = CH.genAllAttribs(
    "../FinalResults/ImgShrRnkListWithTags.csv",
    'non_sparse',
    "../data/infoGainsExpt2.csv",
)
clfObj = CH.buildBinClassifier(
    data,
    allAttribs,
    1 - 0.5,
    80,
    'dtree',
    kwargsDict['dtree'],
)
clfObj.runClf()

0

In [10]:
# Display (precision, recall, method name) of the classifier just run.
(clfObj.precision, clfObj.recall, clfObj.methodName)

(0.66666666666666663, 0.7068965517241379, 'dtree')

In [9]:
# Plot the ROC curve stored on the classifier (tpr against fpr).
fpr, tpr, _ = clfObj.roccurve
# Keyed by fpr, so repeated fpr values keep only their last tpr.
# NOTE(review): that presumably drops the vertical segments of the curve --
# confirm whether this is intended.
rocCurve = {falsePos: tpr[pos] for pos, falsePos in enumerate(fpr)}

pd.DataFrame(rocCurve, index=['tpr']).transpose().iplot()

In [121]:
# Inputs for the regression experiment: the results CSV, the attribute mode and
# the information-gain file passed to genAllAttribs.
train_data_fl = "../FinalResults/ImgShrRnkListWithTags.csv"
# NOTE(review): earlier cells pass 'sparse' / 'non_sparse'; presumably 'non_zero'
# is another supported mode -- verify against ClassiferHelperAPI.
attribType = 'non_zero'
infoGainFl = "../data/infoGainsExpt2.csv"
allAttribs = CH.genAllAttribs(train_data_fl,attribType,infoGainFl)

# Master data loaded from the same results CSV.
train_data= CH.getMasterData(train_data_fl)

In [122]:
# Build the per-image feature dictionary in 'regression' data mode.
# The two commented-out calls below are earlier variants (a classifier build and
# a feature-restricted dictionary) kept for reference.
# CH.buildBinClassifier(train_data,allAttribs,0.0,80,clf,clfArgs.get(clf,None))
# readyDict = CH.createDataFlDict(train_data,allAttribs,80,dataMode='regression',ftrs=['SPECIES','SEX','AGE','QUALITY','VIEW_POINT'])
readyDict = CH.createDataFlDict(train_data,allAttribs,80,dataMode='regression')

In [123]:
# Turn readyDict into a frame with one row per key, expose the key as a numeric
# 'GID' column so it can be merged on later.
df = pd.DataFrame(readyDict).transpose()
dfCol = df.columns
df.reset_index(inplace=True)
# The former index becomes the first column; name it 'GID' and keep the rest.
df.columns = ['GID'] + dfCol.tolist()
df['GID'] = df['GID'].apply(pd.to_numeric)

In [124]:
# Target values: the 'Proportion' column with the CSV's first column (used as
# the index, then restored as a regular column by reset_index) alongside it.
#
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv
# with index_col=0 is the replacement. from_csv also defaulted to
# parse_dates=True, which is deliberately not carried over: the index is merged
# against the numeric 'GID' column afterwards, so it has to stay numeric.
dfResults = pd.read_csv(train_data_fl, index_col=0)['Proportion'].reset_index()

In [125]:
# Join features and targets on 'GID', then discard the key so only feature
# columns and 'Proportion' remain.
#
# The previous `drop(['GID'], 1, inplace=True)` passed `axis` positionally, which
# newer pandas releases reject; the keyword form works on every version and
# avoids the in-place mutation.
regressionData = pd.merge(df, dfResults, on='GID').drop(['GID'], axis=1)

In [126]:
# Preview the first rows of the merged regression table.
regressionData.head()

Unnamed: 0,Female,IBEIS_PZ_0001,IBEIS_PZ_0003,IBEIS_PZ_0005,IBEIS_PZ_0007,IBEIS_PZ_0010,IBEIS_PZ_0011,IBEIS_PZ_0012,IBEIS_PZ_0013,IBEIS_PZ_0014,...,tall,tree,unknown,walking,water,way,wild,zebra,zebra_plains,Proportion
0,1.0,,,,,,0.0,,0.0,,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,20.0
1,0.0,,,,,,0.0,,0.0,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,80.0
2,0.0,,,,,,0.0,,0.0,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0
3,0.0,,,,,,0.0,,0.0,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
4,0.0,,,,,,0.0,,0.0,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,10.0


In [99]:
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# has lived in sklearn.model_selection since 0.18. Fall back to the old module
# only on installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [92]:
# Ordinary least-squares regressor with an intercept term.
rgrObj = LinearRegression(fit_intercept=True)

In [94]:
# List the available columns (features plus the 'Proportion' target).
regressionData.columns

Index(['Female', 'Male', 'UNKNOWN NAME', 'UNKNOWN SEX', 'adult', 'animal',
       'antelope', 'arthropod', 'baby', 'back', 'backleft', 'backright',
       'big cat', 'brown', 'brush', 'bushes', 'cactus', 'conifer', 'crossing',
       'deer', 'desert', 'dirt', 'drink', 'drinking', 'dry', 'eating',
       'elephant', 'excellent', 'field', 'forest', 'front', 'frontleft',
       'frontright', 'giraffe', 'giraffe_masai', 'good', 'grass', 'grassy',
       'grazing', 'green', 'ground', 'group', 'hay', 'herd', 'hill', 'hyena',
       'infant', 'junk', 'juveniles - one year old', 'juveniles- two year old',
       'lake', 'laying', 'leaf', 'left', 'llama', 'lone', 'looking', 'lush',
       'mammal', 'mother', 'mountain', 'ok', 'open', 'outdoor', 'path',
       'plain', 'plant', 'pond', 'poor', 'right', 'river', 'road', 'running',
       'sheep', 'sky', 'standing', 'tall', 'tree', 'unknown', 'walking',
       'water', 'way', 'wild', 'zebra', 'zebra_plains', 'Proportion'],
      dtype='object')

In [95]:
# Regression target and the explicit list of feature columns used as inputs.
y = regressionData['Proportion']
featureCols = [
    'Female', 'Male', 'UNKNOWN NAME', 'UNKNOWN SEX', 'adult', 'animal',
    'antelope', 'arthropod', 'baby', 'back', 'backleft', 'backright',
    'big cat', 'brown', 'brush', 'bushes', 'cactus', 'conifer', 'crossing',
    'deer', 'desert', 'dirt', 'drink', 'drinking', 'dry', 'eating',
    'elephant', 'excellent', 'field', 'forest', 'front', 'frontleft',
    'frontright', 'giraffe', 'giraffe_masai', 'good', 'grass', 'grassy',
    'grazing', 'green', 'ground', 'group', 'hay', 'herd', 'hill', 'hyena',
    'infant', 'junk', 'juveniles - one year old', 'juveniles- two year old',
    'lake', 'laying', 'leaf', 'left', 'llama', 'lone', 'looking', 'lush',
    'mammal', 'mother', 'mountain', 'ok', 'open', 'outdoor', 'path',
    'plain', 'plant', 'pond', 'poor', 'right', 'river', 'road', 'running',
    'sheep', 'sky', 'standing', 'tall', 'tree', 'unknown', 'walking',
    'water', 'way', 'wild', 'zebra', 'zebra_plains',
]
x = regressionData[featureCols]

In [100]:
# Hold out 40% of the rows for testing; random_state=0 fixes the shuffle seed.
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.4,random_state=0)

In [103]:
# Fit the linear regression on the training portion.
rgrObj.fit(train_x,train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [107]:
# Predict the target for the held-out rows.
preds = rgrObj.predict(test_x)

In [108]:
# Plain list of the true targets so they can be indexed by position next to preds.
testy = list(test_y)

In [115]:
# Signed residuals (actual - predicted), keeping only those within +/-100.
errors = [
    actual - predicted
    for actual, predicted in zip(testy, preds)
    if abs(actual - predicted) <= 100
]

In [118]:
# Mean of the signed residuals (positive and negative errors cancel out; this is
# not the mean absolute error). Raises ZeroDivisionError if `errors` is empty.
sum(errors)/len(errors)

1.2549046579889751

In [119]:
# Signed residuals (actual - predicted) whose magnitude exceeds 100.
# NOTE(review): residuals of very large magnitude here presumably point to a
# numerically unstable fit (e.g. collinear indicator columns) -- confirm.
outliers = [
    actual - predicted
    for actual, predicted in zip(testy, preds)
    if abs(actual - predicted) > 100
]

In [120]:
# Display the residuals that were excluded from the mean above.
outliers

[-144222989624324.97,
 -1214150026129.988,
 4091148871.587543,
 253665574945685.31,
 880660374527255.38,
 -501903394479.68677,
 6400542402051.9121,
 -103563922197765.53]