In [24]:
# --- standard library ---
import importlib
import json

# --- third party ---
import numpy as np
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from plotly import tools
import plotly.plotly as py
import cufflinks as cf

# --- project helper ---
# Reload the helper module BEFORE binding names from it: importlib.reload()
# does not update names that were previously pulled in with
# `from module import ...`, so the from-import has to come after the reload
# for edits to ClassiferHelperAPI to reach trainTestClf / trainTestRgrs.
import ClassiferHelperAPI
importlib.reload(ClassiferHelperAPI)
from ClassiferHelperAPI import trainTestClf, trainTestRgrs

cf.go_online()

In [22]:
# Per-method option dicts for the classifiers. The whole mapping is handed to
# trainTestClf through `methArgs`; how the helper consumes it is not visible
# from this notebook.
clfArgs = {
    'dummy': {'strategy': 'most_frequent'},
    'bayesian': {'fit_prior': True},
    'logistic': {'penalty': 'l2'},
    'svm': {'kernel': 'rbf', 'probability': True},
    'dtree': {'criterion': 'entropy'},
    'random_forests': {'n_estimators': 10},
    'ada_boost': {'n_estimators': 50},
}

# Every regressor currently gets the same single option; each method still
# receives its own separate dict object.
# NOTE(review): 'svr' and 'dtree_regressor' presumably map to estimators that
# take no `fit_intercept` argument - confirm the helper ignores it for them.
regrArgs = {
    name: {'fit_intercept': True}
    for name in ('linear', 'ridge', 'lasso', 'elastic_net', 'svr', 'dtree_regressor')
}

# Feature files passed to the train/test helpers (relative to the notebook).
train_fl = "../data/BeautyFtrVector_GZC_Expt2.csv"
test_fl = "../data/Flickr_Scrapes_Ftrs.csv"

In [50]:
# Load the JSON lookup (presumably Flickr file name -> image URL, judging by
# the file name - verify against the data). The encoding is given explicitly
# because JSON is UTF-8 and open() would otherwise fall back to the
# platform's locale encoding.
with open("../data/Flickr_FL_URL_map.json", "r", encoding="utf-8") as fl_url_map_fl:
    fl_url_map = json.load(fl_url_map_fl)

# Method names iterated by the regression and classification cells.
rgrTypes = ['linear', 'ridge', 'lasso', 'elastic_net', 'svr', 'dtree_regressor']
clfTypes = ['bayesian', 'logistic', 'svm', 'dtree', 'random_forests', 'ada_boost']

# Attribute name passed to both train/test helpers.
attrib = 'beauty'

## Results using Regression
* Build a dictionary of files and the respective predictions
* Build a report with all features, predicted share rate and the actual image
* Plot: Cluster of share rates

In [71]:
# Stage 1: run every regressor and keep the returned object together with
# its predictions, keyed by method name.
results = {}
for meth in rgrTypes:
    methObj, predResults = trainTestRgrs(
        train_fl, test_fl, meth, attrib,
        infoGainFl=None,
        methArgs=regrArgs,
    )
    results[meth] = {'obj': methObj, 'pred_results': predResults}

# Stage 2: one scatter plot per regressor, predicted value against a
# 1-based running index over the predictions.
for meth in rgrTypes:
    pred_results = results[meth]['pred_results']
    y = [value for value in pred_results.values()]
    x = list(range(1, 1 + len(y)))

    layout = go.Layout(
        title="Share rate distributions using %s" % meth,
        showlegend=False,
        xaxis={
            'title': 'Images (n)',
            'ticklen': 5,
            'zeroline': True,
            'gridwidth': 2,
        },
        yaxis={
            'title': 'Predicted Share rates',
            'ticklen': 5,
            'gridwidth': 2,
        },
    )

    trace1 = go.Scatter(x=x, y=y, mode='markers')
    data = [trace1]
    fig = {'data': data, 'layout': layout}

    # Each method is plotted under its own filename; the embed snippet of the
    # resulting plot is printed so it can be pasted elsewhere.
    figMain = py.iplot(
        fig,
        filename='Visual for distribution of predicted share rates using %s' % meth,
    )
    print(figMain.embed_code)

Number of outliers identified: 604
1097 1097
Number of outliers identified: 615
1086 1086
Number of outliers identified: 0
1701 1701
Number of outliers identified: 0
1701 1701
Number of outliers identified: 0
1701 1701
Number of outliers identified: 0
1701 1701
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/615.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/617.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/619.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/621.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/623.embed" h

## Results using Classifiers

In [73]:
# Classifier outputs live under their own name. Storing them in `results`
# would overwrite the regression results, which the "Input Distribution"
# cell further down still reads via results['linear'] - on a top-to-bottom
# run that lookup would raise KeyError.
clf_results = {}
for meth in clfTypes:
    methObj, predResults = trainTestClf(
        train_fl, test_fl, meth, attrib,
        infoGainFl=None,
        methArgs=clfArgs,
    )
    clf_results[meth] = {'obj': methObj, 'pred_results': predResults}

# One scatter plot per classifier: the `predProbabs` values of the returned
# object against a 1-based running index.
for meth in clfTypes:
    pred_results = clf_results[meth]['obj'].predProbabs
    y = list(pred_results)
    x = list(range(1, len(y) + 1))

    layout = go.Layout(
        title="Prediction probability distributions using %s" % meth,
        showlegend=False,
        xaxis={
            'title': 'Images (n)',
            'ticklen': 5,
            'zeroline': True,
            'gridwidth': 2,
        },
        yaxis={
            'title': 'Predicted Share/No-Share probabilities',
            'ticklen': 5,
            'gridwidth': 2,
        },
    )

    trace1 = go.Scatter(x=x, y=y, mode='markers')
    data = [trace1]
    fig = {'data': data, 'layout': layout}

    # Each method is plotted under its own filename; the embed snippet of the
    # resulting plot is printed so it can be pasted elsewhere.
    figmain = py.iplot(
        fig,
        filename='Visual for distribution of predicted share no-share probabilities using %s' % meth,
    )
    print(figmain.embed_code)

<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/628.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/630.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/632.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/634.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/636.embed" height="525px" width="100%"></iframe>
<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/638.embed" height="525px" width="100%"></iframe>


## Input Distribution

In [83]:
# Plot the training targets (`train_y`) of the object stored for the 'linear'
# method. This reads results['linear'], so `results` must hold the regression
# results when this cell runs.
pred_results = results['linear']['obj'].train_y
y = list(pred_results)
x = list(range(1, 1 + len(y)))  # 1-based running index for the x axis

layout = go.Layout(
    title="Training data distribution",
    showlegend=False,
    xaxis={
        'title': 'Images (n)',
        'ticklen': 5,
        'zeroline': True,
        'gridwidth': 2,
    },
    yaxis={
        'title': 'Share proportion',
        'ticklen': 5,
        'gridwidth': 2,
    },
)

trace1 = go.Scatter(x=x, y=y, mode='markers')

data = [trace1]
fig = {'data': data, 'layout': layout}
# `fig` is rebound from the figure dict to the object returned by py.iplot;
# its embed snippet is the cell's displayed value.
fig = py.iplot(fig, filename="Expt2 Training data distributions")
fig.embed_code

'<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~smenon8/640.embed" height="525px" width="100%"></iframe>'

In [79]:
# Repeats the regression training loop from the "Results using Regression"
# section (without the plotting) and rebinds `results` to the regression
# outputs, keyed by method name.
results = {}
for meth in rgrTypes:
    methObj, predResults = trainTestRgrs(
        train_fl, test_fl, meth, attrib,
        infoGainFl=None,
        methArgs=regrArgs,
    )
    results[meth] = {'obj': methObj, 'pred_results': predResults}

Number of outliers identified: 604
1097 1097
Number of outliers identified: 615
1086 1086
Number of outliers identified: 0
1701 1701
Number of outliers identified: 0
1701 1701
Number of outliers identified: 0
1701 1701
Number of outliers identified: 0
1701 1701
