-
Notifications
You must be signed in to change notification settings - Fork 0
/
diagnostics.py
234 lines (214 loc) · 8.81 KB
/
diagnostics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import pickle
import pylab as pl
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from lib import helper
from tabulate import tabulate
from progress.bar import ChargingBar
# Runtime configuration, read line by line from stdin (one value per line).
KEYWORD1 = input()   # label of the first class (presumably a candidate/party name -- see report text below)
KEYWORD2 = input()   # label of the second class
SINCE = input()      # start of the analysed period -- TODO confirm expected format
UNTIL = input()      # end of the analysed period -- TODO confirm expected format
DBName = input()     # dataset name; also used to build input/output paths
COUNTRY = input()
# BUG FIX: bool(input()) is True for ANY non-empty string -- bool('False')
# and bool('0') are both True.  Parse the flag text explicitly instead.
CACHE = input().strip().lower() in ('1', 'true', 'yes', 'y')
PORT = int(input())
figpath = 'app/output/' + DBName
# -----------------------------------------------------------------------------
# Load the previously fitted model bundle for this dataset.
# NOTE(security): pickle.load executes arbitrary code if the file is
# untrusted -- only ever load pickles this application wrote itself.
with open('data/pickle/' + DBName + '/model.pkl', 'rb') as f:
    model = pickle.load(f)
# -----------------------------------------------------------------------------
formula = model['formula']   # formula string containing '~' (used to count predictors below)
y = model['y']               # observed response; indexed by its 'text' column later on
X = model['X']               # design matrix used for the fit
result = model['result']     # fitted result object (exposes llr_pvalue, predict, resid_* -- statsmodels-style)
print('Diagnostics...')
if result.llr_pvalue < 0.05:
    # The overall likelihood-ratio test is significant, so the model is worth
    # diagnosing: compute residuals, leverage and influence measures, save the
    # four diagnostic plots, then append a diagnostics + accuracy section to
    # the per-dataset markdown report.
    ypred = result.predict(X)

    # p = number of distinct predictor terms on the RHS of the formula
    # (terms separated by '+', interactions by ':' or '*').
    p = len({term.strip()
             for plus_part in formula.split('~')[1].split('+')
             for colon_part in plus_part.split(':')
             for term in colon_part.split('*')})
    n = result.nobs
    y = np.array(y['text'])
    X = np.asarray(X)
    # Hat (projection) matrix H = X (X'X)^-1 X'; diag(H) gives the leverages.
    H = X @ np.linalg.inv(X.T @ X) @ X.T
    h = np.diag(H)
    rpear = result.resid_pearson
    rdev = result.resid_dev
    e = y - ypred
    # Cook's-distance-style influence measure from Pearson residuals/leverage.
    D = (rpear ** 2) * h / (2 * (1 - h))
    # Conventional high-leverage cut-off: 2 * (p + 1) / n.
    cut_off = round(2 * (p + 1) / n, 5)

    bar = ChargingBar('Processing', max=4)
    bar.start()

    def _index_plot(values, ylabel, title, filename, plotter, hline=None):
        """Save one index plot of `values` to figpath/filename and tick the bar."""
        pl.figure()
        pl.grid(True)
        plotter(np.arange(len(values)), values)
        if hline is not None:
            pl.axhline(y=hline, color='red')
        pl.xlabel('Index')
        pl.ylabel(ylabel)
        pl.title(title)
        pl.savefig(figpath + '/' + filename)
        pl.close()
        bar.next()

    _index_plot(rpear, 'Pearson\'s Residual',
                'Index plot of pearson\'s residual', 'Fig-1.13.png', pl.scatter)
    _index_plot(rdev, 'Deviance Residual',
                'Index plot of deviance residual', 'Fig-1.14.png', pl.scatter)
    _index_plot(h, 'Hat values',
                'Plot for identifying cases of high leverage', 'Fig-1.15.png',
                pl.plot, hline=cut_off)
    _index_plot(D, 'Cook\'s Distance',
                'Plot for identifying cases of high influence', 'Fig-1.16.png',
                pl.plot)
    bar.finish()

    print('Re-opening the report markdown file ...')
    reportpath = 'output/' + DBName

    high_leverage = np.where(h > cut_off)[0].tolist()
    # The D > 10 influence threshold is inherited from the original code; it
    # is ad hoc -- TODO confirm against the modelling methodology.
    high_influence = np.where(D > 10)[0].tolist()
    diagnostics = pd.DataFrame(
        {'y': y, 'ypred': ypred, 'e': e, 'h': h,
         'rpear': rpear, 'rdev': rdev, 'D': D},
        columns=['y', 'ypred', 'e', 'h', 'rpear', 'rdev', 'D'])
    # First five rows as a list of per-row dicts for tabulate.  (Replaces the
    # DataFrame.ix indexer, which was removed in pandas 1.0.)
    sample = diagnostics.head().to_dict('records')

    # Accuracy figures from the confusion matrix, computed once and reused.
    # Predictions above 0.5 are taken as class 1 (KEYWORD2).
    ypred_nominal = [1 if yp > 0.5 else 0 for yp in ypred]
    mat = confusion_matrix(y, ypred_nominal)
    row_tot = mat.sum(axis=1)   # actual-class totals
    col_tot = mat.sum(axis=0)   # predicted-class totals
    total = mat.sum()
    r1 = round(mat[0, 0] / row_tot[0] * 100, 2)   # % correct for class 0 (KEYWORD1)
    r2 = round(mat[1, 1] / row_tot[1] * 100, 2)   # % correct for class 1 (KEYWORD2)
    ov = round((mat[0, 0] + mat[1, 1]) / total * 100, 2)   # overall accuracy %

    print('Appending into Report file...')
    with open(figpath + '/' + DBName + '_report3.md', 'w') as report:
        report.write('''
####5. Diagnostics
Dependent variable (y), predicted values of y(ypred), error terms (e), hat values (h),
pearson's residual values (rpear), deviance residual values (rdev) and, cook's distance
values (D) are clubed together for closer investigation. Below is a small portion of
the data frame generated by clubbing these values:
<center><sub>Table 6. For Diagnostics summary for sample dataset</sub></center>
<center>
''')
        report.write(tabulate(sample, headers='keys', tablefmt='html'))
        # BUG FIX below: the deviance-residual image is saved as Fig-1.14.png,
        # but the report previously linked Fig-1.10.png (a broken image).
        report.write('''
</center>
<center>![Index plot of pearson\'s residual](http://localhost:8080/''' + reportpath + '''/Fig-1.13.png "Index plot of pearson\'s residual")</center>
<center><sub>Fig 1.13 Index plot of pearson\'s residual</sub></center>
<center>![Index plot of deviance residual](http://localhost:8080/''' + reportpath + '''/Fig-1.14.png "Index plot of deviance residual")</center>
<center><sub>Fig 1.14 Index plot of deviance residual</sub></center>
Since, the pearson's residual and deviance residual is randomly scattered, having bands of rectangular cloud.
So, the systematic component of the model is correct.
The following graphs show the cases of high leverage and cases of high influence.
<center>![Plot for identifying cases of high leverage](http://localhost:8080/''' + reportpath + '''/Fig-1.15.png "Plot for identifying cases of high leverage")</center>
<center><sub>Fig 1.15 Plot for identifying cases of high leverage</sub></center>
Fig 1.15 above shows the cases of high leverage. The values above the cut off ''' + str(round(cut_off, 2)) + ''' point marked in red are the cases of high leverage.
In this case, we have ''' + str(len(high_leverage)) + ''' cases of high leverage.
''')
        if len(high_leverage) <= 5:
            report.write('''
They are indexed as ''' + ', '.join(str(a) for a in high_leverage) + ''' in the dataset.
''')
        report.write('''
<center>![Plot for identifying cases of high influence](http://localhost:8080/''' + reportpath + '''/Fig-1.16.png "Plot for identifying cases of high influence")</center>
<center><sub>Fig 1.16 Plot for identifying cases of high influence</sub></center>
The Fig 1.16 shows the cases of high influence. In this case, we have ''' + str(len(high_influence)) + ''' cases of high influence
''')
        if len(high_influence) <= 5:
            report.write('''
and they are indexed as ''' + ', '.join(str(a) for a in high_influence) + ''' in the dataset
''')
        report.write('''.
####6. Accuracy
The accuracy of the model is obtained by considering the predicted values above 0.5 as ''' + KEYWORD2 + ''' while,
values below or equal to 0.5 as ''' + KEYWORD1 + '''. A confusion matrix is plotted in order to obtain the overall accuracy of the model.
The following table shows the contigency/confusion matrix:
<br/><br/><br/>
''')
        # BUG FIX: this literal previously opened with four quotes (''''),
        # which wrote a stray apostrophe into the report.
        report.write('''
<center><sub>Table 7. Confusion Matrix for the Model</sub></center>
<center>
<table>
<tr>
<th rowspan='3' style='text-align:center'> Actual</th>
<th colspan='3' style='text-align:center'>Predicted</th>
<th>Total</th></tr>
<tr>
<td> ''' + KEYWORD1 + ''' </td>
<td> ''' + str(mat[0, 0]) + ''' </td>
<td> ''' + str(mat[0, 1]) + ''' </td>
<td> ''' + str(row_tot[0]) + ''' </td>
</tr>
<tr>
<td> ''' + KEYWORD2 + ''' </td>
<td> ''' + str(mat[1, 0]) + ''' </td>
<td> ''' + str(mat[1, 1]) + ''' </td>
<td> ''' + str(row_tot[1]) + ''' </td>
</tr>
<tr>
<th colspan='2' style='text-align:right; margin-right:5px;'>Total</th>
<td> ''' + str(col_tot[0]) + ''' </td>
<td> ''' + str(col_tot[1]) + ''' </td>
<td> ''' + str(total) + ''' </td>
</tr>
</table>
</center>
So, the model is accurately able to identify ''' + str(r1) + '''%
of the results for ''' + KEYWORD1 + ''' and ''' + str(r2) + '''%
for ''' + KEYWORD2 + '''. Also, the overall accuracy of the model is ''' + str(ov) + '''%.
So, we can say that the model performs ''')
        # Qualitative verdicts: <50% bad, 50-70% fair, >=70% good.
        if r1 < 50:
            report.write('pretty badly ')
        elif r1 < 70:
            report.write('fairly ')
        else:
            report.write('pretty good ')
        report.write('''in predicting the accuracy of ''' + KEYWORD1 + ''' and ''')
        if r2 < 50:
            report.write('also does pretty badly ' if r1 < 50 else 'performs bad ')
        elif r2 < 70:
            report.write('also does fairly well ' if 50 <= r1 < 70 else 'performs fairly well ')
        else:
            report.write('also does perform good ' if r1 >= 70 else 'performs good ')
        report.write('while predicting the accuracy of ' + KEYWORD2 + '.')
        if r1 > 70 and r2 > 70:
            report.write('''
Consequently, it can be said that ''' + str(round(ov * mat[0, 0] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD1 + ''' and, ''' + str(round(ov * mat[1, 1] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD2 + '''.
''')
        elif r1 > 70:
            report.write('''
Consequently, it can be said that ''' + str(round(ov * mat[0, 0] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD1 + '''.
''')
        elif r2 > 70:
            report.write('''
Consequently, it can be said that ''' + str(round(ov * mat[1, 1] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD2 + '''.
''')