-
Notifications
You must be signed in to change notification settings - Fork 0
/
diagnostics.py
234 lines (214 loc) · 8.81 KB
/
diagnostics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import pickle
import pylab as pl
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from lib import helper
from tabulate import tabulate
from progress.bar import ChargingBar
# Runtime configuration, read line by line from stdin (one value per line).
KEYWORD1 = input()   # label of the first class (presumably a candidate/party name -- see report text below)
KEYWORD2 = input()   # label of the second class
SINCE = input()      # start of the analysed period -- TODO confirm expected format
UNTIL = input()      # end of the analysed period -- TODO confirm expected format
DBName = input()     # dataset name; also used to build input/output paths
COUNTRY = input()
# BUG FIX: bool(input()) is True for ANY non-empty string -- bool('False')
# and bool('0') are both True.  Parse the flag text explicitly instead.
CACHE = input().strip().lower() in ('1', 'true', 'yes', 'y')
PORT = int(input())
figpath = 'app/output/' + DBName
# -----------------------------------------------------------------------------
# Load the previously fitted model bundle for this dataset.
# NOTE(security): pickle.load executes arbitrary code if the file is
# untrusted -- only ever load pickles this application wrote itself.
with open('data/pickle/' + DBName + '/model.pkl', 'rb') as f:
    model = pickle.load(f)
# -----------------------------------------------------------------------------
formula = model['formula']   # formula string containing '~' (used to count predictors below)
y = model['y']               # observed response; indexed by its 'text' column later on
X = model['X']               # design matrix used for the fit
result = model['result']     # fitted result object (exposes llr_pvalue, predict, resid_* -- statsmodels-style)
print('Diagnostics...')
if result.llr_pvalue < 0.05:
    # The overall likelihood-ratio test is significant, so the model is worth
    # diagnosing: compute residuals, leverage and influence measures, save the
    # four diagnostic plots, then append a diagnostics + accuracy section to
    # the per-dataset markdown report.
    ypred = result.predict(X)

    # p = number of distinct predictor terms on the RHS of the formula
    # (terms separated by '+', interactions by ':' or '*').
    p = len({term.strip()
             for plus_part in formula.split('~')[1].split('+')
             for colon_part in plus_part.split(':')
             for term in colon_part.split('*')})
    n = result.nobs
    y = np.array(y['text'])
    X = np.asarray(X)
    # Hat (projection) matrix H = X (X'X)^-1 X'; diag(H) gives the leverages.
    H = X @ np.linalg.inv(X.T @ X) @ X.T
    h = np.diag(H)
    rpear = result.resid_pearson
    rdev = result.resid_dev
    e = y - ypred
    # Cook's-distance-style influence measure from Pearson residuals/leverage.
    D = (rpear ** 2) * h / (2 * (1 - h))
    # Conventional high-leverage cut-off: 2 * (p + 1) / n.
    cut_off = round(2 * (p + 1) / n, 5)

    bar = ChargingBar('Processing', max=4)
    bar.start()

    def _index_plot(values, ylabel, title, filename, plotter, hline=None):
        """Save one index plot of `values` to figpath/filename and tick the bar."""
        pl.figure()
        pl.grid(True)
        plotter(np.arange(len(values)), values)
        if hline is not None:
            pl.axhline(y=hline, color='red')
        pl.xlabel('Index')
        pl.ylabel(ylabel)
        pl.title(title)
        pl.savefig(figpath + '/' + filename)
        pl.close()
        bar.next()

    _index_plot(rpear, 'Pearson\'s Residual',
                'Index plot of pearson\'s residual', 'Fig-1.13.png', pl.scatter)
    _index_plot(rdev, 'Deviance Residual',
                'Index plot of deviance residual', 'Fig-1.14.png', pl.scatter)
    _index_plot(h, 'Hat values',
                'Plot for identifying cases of high leverage', 'Fig-1.15.png',
                pl.plot, hline=cut_off)
    _index_plot(D, 'Cook\'s Distance',
                'Plot for identifying cases of high influence', 'Fig-1.16.png',
                pl.plot)
    bar.finish()

    print('Re-opening the report markdown file ...')
    reportpath = 'output/' + DBName

    high_leverage = np.where(h > cut_off)[0].tolist()
    # The D > 10 influence threshold is inherited from the original code; it
    # is ad hoc -- TODO confirm against the modelling methodology.
    high_influence = np.where(D > 10)[0].tolist()
    diagnostics = pd.DataFrame(
        {'y': y, 'ypred': ypred, 'e': e, 'h': h,
         'rpear': rpear, 'rdev': rdev, 'D': D},
        columns=['y', 'ypred', 'e', 'h', 'rpear', 'rdev', 'D'])
    # First five rows as a list of per-row dicts for tabulate.  (Replaces the
    # DataFrame.ix indexer, which was removed in pandas 1.0.)
    sample = diagnostics.head().to_dict('records')

    # Accuracy figures from the confusion matrix, computed once and reused.
    # Predictions above 0.5 are taken as class 1 (KEYWORD2).
    ypred_nominal = [1 if yp > 0.5 else 0 for yp in ypred]
    mat = confusion_matrix(y, ypred_nominal)
    row_tot = mat.sum(axis=1)   # actual-class totals
    col_tot = mat.sum(axis=0)   # predicted-class totals
    total = mat.sum()
    r1 = round(mat[0, 0] / row_tot[0] * 100, 2)   # % correct for class 0 (KEYWORD1)
    r2 = round(mat[1, 1] / row_tot[1] * 100, 2)   # % correct for class 1 (KEYWORD2)
    ov = round((mat[0, 0] + mat[1, 1]) / total * 100, 2)   # overall accuracy %

    print('Appending into Report file...')
    with open(figpath + '/' + DBName + '_report3.md', 'w') as report:
        report.write('''
####5. Diagnostics
Dependent variable (y), predicted values of y(ypred), error terms (e), hat values (h),
pearson's residual values (rpear), deviance residual values (rdev) and, cook's distance
values (D) are clubed together for closer investigation. Below is a small portion of
the data frame generated by clubbing these values:
<center><sub>Table 6. For Diagnostics summary for sample dataset</sub></center>
<center>
''')
        report.write(tabulate(sample, headers='keys', tablefmt='html'))
        # BUG FIX below: the deviance-residual image is saved as Fig-1.14.png,
        # but the report previously linked Fig-1.10.png (a broken image).
        report.write('''
</center>
<center>![Index plot of pearson\'s residual](http://localhost:8080/''' + reportpath + '''/Fig-1.13.png "Index plot of pearson\'s residual")</center>
<center><sub>Fig 1.13 Index plot of pearson\'s residual</sub></center>
<center>![Index plot of deviance residual](http://localhost:8080/''' + reportpath + '''/Fig-1.14.png "Index plot of deviance residual")</center>
<center><sub>Fig 1.14 Index plot of deviance residual</sub></center>
Since, the pearson's residual and deviance residual is randomly scattered, having bands of rectangular cloud.
So, the systematic component of the model is correct.
The following graphs show the cases of high leverage and cases of high influence.
<center>![Plot for identifying cases of high leverage](http://localhost:8080/''' + reportpath + '''/Fig-1.15.png "Plot for identifying cases of high leverage")</center>
<center><sub>Fig 1.15 Plot for identifying cases of high leverage</sub></center>
Fig 1.15 above shows the cases of high leverage. The values above the cut off ''' + str(round(cut_off, 2)) + ''' point marked in red are the cases of high leverage.
In this case, we have ''' + str(len(high_leverage)) + ''' cases of high leverage.
''')
        if len(high_leverage) <= 5:
            report.write('''
They are indexed as ''' + ', '.join(str(a) for a in high_leverage) + ''' in the dataset.
''')
        report.write('''
<center>![Plot for identifying cases of high influence](http://localhost:8080/''' + reportpath + '''/Fig-1.16.png "Plot for identifying cases of high influence")</center>
<center><sub>Fig 1.16 Plot for identifying cases of high influence</sub></center>
The Fig 1.16 shows the cases of high influence. In this case, we have ''' + str(len(high_influence)) + ''' cases of high influence
''')
        if len(high_influence) <= 5:
            report.write('''
and they are indexed as ''' + ', '.join(str(a) for a in high_influence) + ''' in the dataset
''')
        report.write('''.
####6. Accuracy
The accuracy of the model is obtained by considering the predicted values above 0.5 as ''' + KEYWORD2 + ''' while,
values below or equal to 0.5 as ''' + KEYWORD1 + '''. A confusion matrix is plotted in order to obtain the overall accuracy of the model.
The following table shows the contigency/confusion matrix:
<br/><br/><br/>
''')
        # BUG FIX: this literal previously opened with four quotes (''''),
        # which wrote a stray apostrophe into the report.
        report.write('''
<center><sub>Table 7. Confusion Matrix for the Model</sub></center>
<center>
<table>
<tr>
<th rowspan='3' style='text-align:center'> Actual</th>
<th colspan='3' style='text-align:center'>Predicted</th>
<th>Total</th></tr>
<tr>
<td> ''' + KEYWORD1 + ''' </td>
<td> ''' + str(mat[0, 0]) + ''' </td>
<td> ''' + str(mat[0, 1]) + ''' </td>
<td> ''' + str(row_tot[0]) + ''' </td>
</tr>
<tr>
<td> ''' + KEYWORD2 + ''' </td>
<td> ''' + str(mat[1, 0]) + ''' </td>
<td> ''' + str(mat[1, 1]) + ''' </td>
<td> ''' + str(row_tot[1]) + ''' </td>
</tr>
<tr>
<th colspan='2' style='text-align:right; margin-right:5px;'>Total</th>
<td> ''' + str(col_tot[0]) + ''' </td>
<td> ''' + str(col_tot[1]) + ''' </td>
<td> ''' + str(total) + ''' </td>
</tr>
</table>
</center>
So, the model is accurately able to identify ''' + str(r1) + '''%
of the results for ''' + KEYWORD1 + ''' and ''' + str(r2) + '''%
for ''' + KEYWORD2 + '''. Also, the overall accuracy of the model is ''' + str(ov) + '''%.
So, we can say that the model performs ''')
        # Qualitative verdicts: <50% bad, 50-70% fair, >=70% good.
        if r1 < 50:
            report.write('pretty badly ')
        elif r1 < 70:
            report.write('fairly ')
        else:
            report.write('pretty good ')
        report.write('''in predicting the accuracy of ''' + KEYWORD1 + ''' and ''')
        if r2 < 50:
            report.write('also does pretty badly ' if r1 < 50 else 'performs bad ')
        elif r2 < 70:
            report.write('also does fairly well ' if 50 <= r1 < 70 else 'performs fairly well ')
        else:
            report.write('also does perform good ' if r1 >= 70 else 'performs good ')
        report.write('while predicting the accuracy of ' + KEYWORD2 + '.')
        if r1 > 70 and r2 > 70:
            report.write('''
Consequently, it can be said that ''' + str(round(ov * mat[0, 0] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD1 + ''' and, ''' + str(round(ov * mat[1, 1] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD2 + '''.
''')
        elif r1 > 70:
            report.write('''
Consequently, it can be said that ''' + str(round(ov * mat[0, 0] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD1 + '''.
''')
        elif r2 > 70:
            report.write('''
Consequently, it can be said that ''' + str(round(ov * mat[1, 1] / total, 2)) + '''%
of the people are sure to vote for ''' + KEYWORD2 + '''.
''')