# 02_EDA_MisingValue_Treatment_VariableImportance_Testing_Feature_Engg_code_pandasprofiling.py
import random
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import pylab
import seaborn as sns
from pandas.plotting import scatter_matrix #from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, auc,roc_curve
import pandas_profiling
from scipy import stats
import operator #used below when sorting xgboost feature importances
X_train = pd.read_csv("/media/sohom/X_train.csv")
y_train = pd.read_csv("/media/sohom/y_train.csv")
X_test = pd.read_csv("/media/sohom/X_validation.csv")
#DATA# X_Train.csv
#Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20,N21,N22,N23,N24,N25,N26,N27,N28,N29,N30,N31,N32,N33,N34,N35
#Candidate_5926,1,0,11,31,0,FALSE,0,TRUE,23.75,,2.5,,,2.595,10,0,0,2,,,14,,0,,,,,,,,27.816,1750,,,,,,,,,58,113.39,12
#Candidate_48134,1,4,2,66,2,FALSE,1,TRUE,11.05,22,3.7,16,12,3.795,19,4,72,0,9,0,5,0,0,0,1944,0.06,25856,17,0.88,1,40,10833.33333,,,,,,,,,160,262.1,17
#DATA# y_train.csv
#Unique_ID,Dependent_Variable
#Candidate_5926,1
#Candidate_48134,0
#DATA# X_validation.csv
#Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20,N21,N22,N23,N24,N25,N26,N27,N28,N29,N30,N31,N32,N33,N34,N35
#Candidate_17537,1,4,6,41,2,FALSE,4,TRUE,19,29,2.9,10,8,2.995,15,5,77,0,4,0,1,0,0,0,2550,1.05,39,11,0.9,2,60,3033.333333,1,8,8,0,0,2551,2126.51,40,51.02,93.51,12
#Candidate_21230,1,7,22,1,1,FALSE,4,TRUE,30.58,2,3.4,0,0,3.495,13,0,0,0,1,0,1,0,0,0,0,0,0,9,0.88,0,12,2916.666667,,,,,,,,,80,171.08,18
train = X_train
train['Dependent_Variable'] = y_train['Dependent_Variable']
test = X_test
train_test = pd.concat([train, test], sort=False) #DataFrame.append was removed in pandas 2.0
train_test.describe()
train_test.dtypes
#####Variable Importances
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
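#A minimal sketch of the two selectors referenced above (SelectKBest, RFE); names like _X/_y and k=10
#are illustrative, and rows with nulls are simply dropped here for demonstration.
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
_complete = train.dropna()
_X = _complete.drop(columns=['Unique_ID', 'Dependent_Variable'])
_y = _complete['Dependent_Variable']
kbest = SelectKBest(score_func=f_classif, k=10).fit(_X, _y)
print(_X.columns[kbest.get_support()])  #top-10 features by ANOVA F-score
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=10).fit(_X, _y)
print(_X.columns[rfe.support_])         #features kept by recursive feature elimination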
##EDA
##Reference: https://www.datacamp.com/community/tutorials/exploratory-data-analysis-python
##Reference: https://towardsdatascience.com/visualizing-your-exploratory-data-analysis-d2d6c2e3b30e
##Reference: http://www.stat.cmu.edu/~hseltman/309/Book/chapter4.pdf
#EDA Objectives:
#1) detection of mistakes in data - Missing value Analysis and Treatment
#2) checking of assumptions - Making hypothesis and checking them
#3) determining relationships among the explanatory variables, and
#4) assessing the direction and rough size of relationships between explanatory and outcome variables.
#5) preliminary selection of appropriate models
###Number/Percentage of missing value in train and test
pd.isnull(train).sum()
pd.isnull(test).sum()
######Missing value treatment; Checking distribution / plots before and after missing value treatment
#Method-1: Deletion
#Method-2: Mean/ Mode/ Median Imputation
#Method-3: Prediction Model
#Method-4: KNN Imputation
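#Minimal sketches for Method-1 (deletion) and Method-3 (prediction model); Method-2 and Method-4 are
#coded below. The IterativeImputer call assumes scikit-learn >= 0.21 (newer than the Imputer API below).
train_listwise = train.dropna()  #Method-1: listwise deletion (drop rows with any missing value)
train_thresh = train.dropna(thresh=int(0.8 * train.shape[1]))  #or keep rows that are at least 80% complete
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
iter_imp = IterativeImputer(random_state=0)  #Method-3: model each feature with missing values from the other features
train_num_imputed = iter_imp.fit_transform(train.drop(columns=['Unique_ID']))  #numeric/boolean columns only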
from sklearn.impute import SimpleImputer #sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
#strategy: "mean" or "median" or "most_frequent"
train['N30_missing_imputed'] = imp.fit_transform(train['N30'].values.reshape(-1,1)).ravel() #ravel() flattens the (n,1) output back to a 1-D column
imp.fit_transform(train.iloc[:,1:]) #Removing first column as it is a text variable
#Reference: https://pypi.python.org/pypi/fancyimpute/0.0.4
#pip3 install fancyimpute
#ONLY NUMERIC VALUES
from fancyimpute import NuclearNormMinimization, KNN, MICE #in newer fancyimpute releases MICE is renamed IterativeImputer and .complete() becomes .fit_transform()
solver = NuclearNormMinimization(min_value=0.0, max_value=1.0, error_tolerance=0.0005)
X_filled = solver.complete(train['N30'].values.reshape(-1,1))
X_filled = solver.complete(train)
X_filled_knn = KNN(k=3).complete(train)
#https://github.com/hammerlab/fancyimpute/blob/master/fancyimpute/mice.py
X_filled_mice = MICE().complete(train.to_numpy()) #DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy()
X_filled_mice_df = pd.DataFrame(X_filled_mice)
X_filled_mice_df.columns = train.columns
X_filled_mice_df.index = train.index
#Other methods: SimpleFill, SoftImpute, IterativeSVD, MICE, MatrixFactorization, NuclearNormMinimization, KNN, BiScaler
#SimpleFill: uses mean or median; SoftImpute: Matrix completion;
###Smote
#Only numeric/boolean, non-null values as input to SMOTE :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train_new, y_train_new = sm.fit_resample(train.dropna().iloc[:,1:44], train.dropna()['Dependent_Variable']) #fit_sample was renamed fit_resample in imbalanced-learn
#####Check if sample is representing the population: Central Limit Theorem, https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
#Hypothesis testing, degrees of freedom, t-statistics, Student's t-test etc.
#References: http://www.scipy-lectures.org/packages/statistics/index.html#pairplot-scatter-matrices
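#A minimal sketch of the Kolmogorov-Smirnov check referenced above: compare the distribution of one
#feature (N35 chosen arbitrarily) in the training set against the test set; a small p-value suggests
#the two samples were drawn from different distributions.
ks_stat, ks_pvalue = stats.ks_2samp(train['N35'].dropna(), test['N35'].dropna())
print(ks_stat, ks_pvalue)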
#scipy.stats.ttest_1samp() tests if the population mean of data is likely to be equal to a given value (technically if observations are drawn from a Gaussian distributions of given population mean). It returns the T statistic, and the p-value
##1-sample ttest
#The lines using `data` (VIQ, Gender, FSIQ, PIQ) are illustrative snippets from the scipy-lectures
#brain-size example referenced above; `data` is not loaded in this script.
stats.ttest_1samp(data['VIQ'], 0)
stats.ttest_1samp(train['N32'].dropna(), 0)
#2-sample ttest
female_viq = data[data['Gender'] == 'Female']['VIQ']
male_viq = data[data['Gender'] == 'Male']['VIQ']
stats.ttest_ind(female_viq, male_viq)
#paired ttest
stats.ttest_ind(data['FSIQ'], data['PIQ'])
stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)
#Skewness and kurtosis
#Skewness is a measure of asymmetry. Kurtosis is a more subtle measure of peakedness compared to a Gaussian distribution.
from scipy.stats import kurtosis, skew
kurtosis(train['N35'].dropna()) #use dropna(); notnull() returns a boolean mask, not the values
skew(train['N35'].dropna())
#Cross-tabulation: generally used for categorical variables (df, col_1, col_2, col_3 below are placeholders)
pd.crosstab(df.col_1, [df.col_2, df.col_3], rownames=['col_1'], colnames=['col_2', 'col_3'])
#pip3 install pandas-profiling #Remember: only the X variables of the training set are given as input to pandas profiling
#pandas_profiling.ProfileReport(X_train) #For IPython Notebook
profile = pandas_profiling.ProfileReport(X_train)
rejected_variables = profile.get_rejected_variables(threshold=0.9)
profile.to_file(outputfile="pandas_profiling_output_train.html")
##Univariate Plot: Variable-1 vs its distribution plot AND min, max, mode, standard_deviation, quantiles
train_test['N35'].std()
train_test['N35'].quantile([.25, .5, .75])
bins = np.arange(train_test['N35'].min(), train_test['N35'].max(), 5)
#plt.hist(train_test[train_test['N35'].isnull()==False]['N35'], bins = bins, alpha = 1)
plt.hist(train_test[train_test['N35'].notnull()]['N35'], bins = bins, alpha = 1)
plt.show()
#Distributional Plots
import seaborn as sns
sns.set(color_codes=True)
sns.set_palette(sns.color_palette("muted"))
sns.distplot(train_test['N35'].dropna()) #distplot is deprecated in seaborn >= 0.11; use displot()/histplot() there
##Bivariate Plot: Variable-1 vs Target variable
#Scatter Plot
sns.lmplot(x='N35', y='Dependent_Variable', fit_reg=False, data=train, height=8) #`size` was renamed `height` in newer seaborn
plt.show() #sns.plt is not available in current seaborn versions
##Multivariate Plot: Variable-1 vs Variable-2 :: Explore relationships between variables
#Heat map to see correlation
f, ax = plt.subplots(figsize=(121,121))
sns.heatmap(train.corr(),annot=True)
plt.show()
train.plot(x = 'N35', y = ['N34', 'N33'], subplots=True)
plt.show()
#For continuous variables N33, N34, N35; give only numeric, non-null values as input :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
sns.pairplot(train.dropna(), vars=['N33', 'N34', 'N35'], kind='reg')
plt.show()
#For continuous variables N33, N34, N35 and categorical variable C6; give only numeric, non-null values as input :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
sns.pairplot(train.dropna(), vars=['N33', 'N34', 'N35'], kind='reg', hue='C6') #module is imported as sns, not seaborn
plt.show()
scatter_matrix(train[features], diagonal='hist') #`features` (the model feature list) must be defined before this call; it is built in the xgboost section below
plt.show()
##Clustering groups similar observations into differentiated groupings; by collapsing the data into a few representative points, patterns of behaviour can be identified more easily
##K-means : plot
#Remember: give only numeric, non-null values as input :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
from sklearn.cluster import KMeans
y_pred = KMeans(n_clusters=2, random_state=40).fit_predict(train.iloc[:,10:43].dropna())
plt.scatter(train.iloc[:,10:43].dropna().iloc[:, 0], train.iloc[:,10:43].dropna().iloc[:, 1], c=y_pred)
plt.show()
##Hierarchical_Clustering : Dendogram plot of features
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(train.iloc[:,10:43].dropna())
linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
fig, ax = plt.subplots(figsize=(100, 100),dpi=100) # set size
#ax = dendrogram(linkage_matrix, orientation="right", labels=features);
ax = dendrogram(linkage_matrix, orientation="right")
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off (matplotlib >= 3 expects booleans, not 'off')
    top=False,         # ticks along the top edge are off
    labelbottom=False) # labels along the bottom edge are off
plt.tight_layout() #show plot with tight layout
plt.savefig('ward_clusters_roles.png', dpi=100) #save before plt.show(), otherwise an empty figure is written
plt.show()
##Dimensionality Reduction :
#PCA
#Correlation identification with matplotlib
# Import `PCA` from `sklearn.decomposition`
from sklearn.decomposition import PCA
# Build the model
pca = PCA(n_components=2)
# Reduce the data, output is ndarray; #Only numeric and non_null values as input to PCA :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
reduced_data = pca.fit_transform(X_train.loc[:,X_train.columns != 'Unique_ID'].dropna())
# Inspect shape of the `reduced_data`
reduced_data.shape
# print out the reduced data
print(reduced_data)
plt.scatter(reduced_data[:,0], reduced_data[:,1],cmap = 'viridis')
plt.xlabel("dimension1")
plt.ylabel("dimension2")
plt.show()
#Dimensionality Reduction T-SNE
from sklearn.manifold import TSNE
model_tsne = TSNE(n_components=2, random_state=0)
#Only numeric and non_null values as input to TSNE model :: BETTER TRY THIS AFTER MISSING VALUE IMPUTATION AND ENCODING
model_2d = model_tsne.fit_transform(X_train.loc[:,X_train.columns != 'Unique_ID'].dropna())
plt.scatter(model_2d[:,0],model_2d[:,1])
plt.legend(loc='lower left',fontsize=8)
plt.xlabel("dimension1")
plt.ylabel("dimension2")
plt.show(block=True)
########Box plot
#Source: https://www.kaggle.com/xchmiao/eda-with-python
#Reference: https://seaborn.pydata.org/generated/seaborn.boxplot.html
sns.boxplot(data=train, x = 'N34')
plt.show()
#Draw a vertical boxplot of a continuous variable y (e.g. total_bill), grouped by a categorical variable x (e.g. day);
#`tips` here is the seaborn example dataset: tips = sns.load_dataset("tips")
plt.figure(figsize=(10, 5)) #create the figure before plotting so plt.show() displays the boxplot
sns.boxplot(x="day", y="total_bill", data=tips)
plt.show()
###REMOVE OUTLIERS BASED ON THIS BOX-PLOT
#Suppose the lower limit decided from the boxplot is 30 and the upper limit is 80
train = train[(train.N34 > 30) & (train.N34 < 80)] #parenthesise each comparison; & binds tighter than > in Python
mean = np.mean(train.N34, axis=0)
sd = np.std(train.N34, axis=0)
train = train[(train.N34 > mean - 2* sd) & (train.N34 < mean + 2* sd)]
'''
########Normalize: max, min scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_test.loc[:,train_test.columns != 'Unique_ID'].dropna())
X_train_new = scaler.transform(X_train.loc[:,X_train.columns != 'Unique_ID'].dropna())
X_test_new = scaler.transform(X_test.loc[:,X_test.columns != 'Unique_ID'].dropna())
#Number of columns in train_test and X_test should match
########Normalize: standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_test.loc[:,train_test.columns != 'Unique_ID'].dropna())
X_train_new = scaler.transform(X_train.loc[:,X_train.columns != 'Unique_ID'].dropna())
X_test_new = scaler.transform(X_test.loc[:,X_test.columns != 'Unique_ID'].dropna())
#Number of columns in train_test and X_test should match
'''
#######Bin continuous variables in groups: use cut() to cut the values for a column in bins
# Define your own bins
mybins = range(int(train["N35"].min()), int(train["N35"].max()), 10)
# Cut the data with the help of the bins
train['N35_bucket'] = pd.cut(train['N35'], bins=mybins)
# Count the number of values per bucket
train['N35_bucket'].value_counts()
#Encode train['N35_bucket'] for further use
train['N35_bucket_encoded_for_use'] = train['N35_bucket'].factorize()[0]
######Feature importances using RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
names = list(set(train.columns) - set(['Unique_ID', 'Dependent_Variable']))
# Build the model
rfc = RandomForestClassifier() #parameters given as input
# Fit the model (exclude the ID column and the target from the feature matrix)
XX = train.dropna()[names]
YY = train.dropna()['Dependent_Variable']
rfc.fit(XX, YY)
# Print the results
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rfc.feature_importances_), names), reverse=True))
# Pearson correlation
train_test.corr()
# Kendall Tau correlation
train_test.corr('kendall')
# Spearman Rank correlation
train_test.corr('spearman')
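#Sketch: flag columns whose absolute Pearson correlation with another column exceeds 0.9 (the same
#cutoff used with pandas_profiling above); the 0.9 threshold is illustrative, not tuned.
corr_abs = train_test.select_dtypes(include=[np.number]).corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))
high_corr_candidates = [col for col in upper.columns if (upper[col] > 0.9).any()]
print(high_corr_candidates) #candidates for removal; keep one variable from each highly correlated pair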
##########################################################################################################################################
###############################################Data Cleaning##############################################################################
##########################################################################################################################################
##########################################################################################################################################
#####################################Check if train and test set are showing same kind of characteristics###############################
##########################################################################################################################################
#Check their summaries - describe(), plots. This is to be done in the same way as checking whether a "sample is a proper representation of the population" (a minimal comparison sketch follows below)
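#A minimal sketch for the check described above: put the train and test summary statistics side by side
#(mean/std per column); large gaps suggest the two sets do not share the same characteristics.
train_stats = X_train.describe().T[['mean', 'std']].add_prefix('train_')
test_stats = X_test.describe().T[['mean', 'std']].add_prefix('test_')
print(train_stats.join(test_stats))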
##########################################################################################################################################
###############################################Feature Engineering########################################################################
##########################################################################################################################################
####One Hot Encoding
#Source: https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False) #in scikit-learn >= 1.2 this argument is sparse_output=False
def one_hot_encode_fns(col_to_encode,train):
integer_encoded = label_encoder.fit_transform(train[col_to_encode].values).reshape(train.shape[0], 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
onehot_encoded_df = pd.DataFrame(onehot_encoded)
onehot_encoded_df.index = train.index
onehot_encoded_df.columns = [col_to_encode + "one_hot" + str(i) for i in range(onehot_encoded_df.shape[1])]
return onehot_encoded_df
col_to_encode = "C6"
#train[col_to_encode] has values like True, False, True
onehot_df = one_hot_encode_fns(col_to_encode,train)
train = pd.concat([train, onehot_df], axis = 1)
del train[col_to_encode]
####Target Encoding
#Source: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
#Theory: http://www.saedsayad.com/encoding.htm
def add_noise(series, noise_level):
return series * (1 + noise_level * np.random.randn(len(series)))
def target_encode(trn_series=None, tst_series=None, target=None, min_samples_leaf=1, smoothing=1, noise_level=0):
assert len(trn_series) == len(target)
assert trn_series.name == tst_series.name
temp = pd.concat([trn_series, target], axis=1)
# Compute target mean
averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
# Compute smoothing
smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
# Apply average function to all target data
prior = target.mean()
# The bigger the count the less full_avg is taken into account
averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
averages.drop(["mean", "count"], axis=1, inplace=True)
# Apply averages to trn and tst series
ft_trn_series = pd.merge(
trn_series.to_frame(trn_series.name),
averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
on=trn_series.name,
how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
# pd.merge does not keep the index so restore it
ft_trn_series.index = trn_series.index
ft_tst_series = pd.merge(
tst_series.to_frame(tst_series.name),
averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
on=tst_series.name,
how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
# pd.merge does not keep the index so restore it
ft_tst_series.index = tst_series.index
return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
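#A minimal usage sketch for target_encode above; C3 is picked arbitrarily as the categorical column and
#the min_samples_leaf / smoothing / noise settings are illustrative, not tuned.
train['C3_target_enc'], test['C3_target_enc'] = target_encode(trn_series=train['C3'],
                                                              tst_series=test['C3'],
                                                              target=train['Dependent_Variable'],
                                                              min_samples_leaf=100,
                                                              smoothing=10,
                                                              noise_level=0.01)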
'''
#BETTER APPROACH: Distribute data among train and valid
import random
train_ids = random.sample(list(data.index),int(.8*len(data.index)))
train = data[data.index.isin(train_ids)]
valid = data[~data.index.isin(train_ids)]
'''
#Also check feature importances using xgboost
X_train_all = train_test.iloc[0:X_train.shape[0],:]
X_train_xgb = X_train_all.sample(frac=0.80, replace=False)
X_valid_xgb = pd.concat([X_train_all, X_train_xgb]).drop_duplicates(keep=False)
features=list(set(X_train.columns) - set(['Unique_ID']))
dtrain = xgb.DMatrix(X_train_xgb[features], X_train_xgb['Dependent_Variable'], missing=np.nan)
dvalid = xgb.DMatrix(X_valid_xgb[features], missing=np.nan)
dtest = xgb.DMatrix(X_test[features], missing = np.nan)
nrounds = 800
watchlist = [(dtrain, 'train')]
params = {"objective": "binary:logistic","booster": "gbtree", "nthread": 16, "silent": 1,
"eta": 0.08, "max_depth": 6, "subsample": 0.9, "colsample_bytree": 0.7,
"min_child_weight": 1,
"seed": 2016, "tree_method": "exact"}
bst = xgb.train(params, dtrain, num_boost_round=nrounds, evals=watchlist, verbose_eval=20)
valid_preds = bst.predict(dvalid)
fpr, tpr, thresholds = roc_curve(X_valid_xgb['Dependent_Variable'], valid_preds, pos_label=1)
auc_algo = auc(fpr, tpr)
print(auc_algo)
test_preds = bst.predict(dtest)
submit = pd.DataFrame({'Unique_ID': X_test['Unique_ID'], 'Class_1_Probability': test_preds})
i = 1 #run/submission counter used to name the output file; initialise it before reuse across runs
submit[['Unique_ID','Class_1_Probability']].to_csv("XGB"+str(i)+".csv", index=False)
'''
#ENSEMBLING CODE
#sub6.csv, sub18.csv and sub36.csv append
df_all=pd.read_csv("sub6.csv")
for i in [18,36]:
df_all=df_all.append(pd.read_csv("sub"+str(i)+".csv"))
ensembled_ans=df_all.groupby('roadId',as_index=False)['noOfLanes'].agg(lambda x: x.value_counts().index[0])
ensembled_ans.to_csv("sub39.csv",index=False)
'''
###########################################################################################################################
#Plot variable importance using SRK code
def create_feature_map(features):
outfile = open('xgb.fmap', 'w')
for i, feat in enumerate(features):
outfile.write('{0}\t{1}\tq\n'.format(i,feat))
outfile.close()
create_feature_map(features)
bst.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
imp_df.to_csv("imp_feat.txt", index=False)
# create a function for labeling #
def autolabel(rects):
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2., 1.02*height,
'%f' % float(height),
ha='center', va='bottom')
#imp_df = pd.read_csv('imp_feat.txt')
labels = np.array(imp_df.feature.values)
ind = np.arange(len(labels))
width = 0.9
fig, ax = plt.subplots(figsize=(12,6))
rects = ax.bar(ind, np.array(imp_df.fscore.values), width=width, color='y')
ax.set_xticks(ind+((width)/2.))
ax.set_xticklabels(labels, rotation='vertical')
ax.set_ylabel("Importance score")
ax.set_title("Variable importance")
autolabel(rects)
plt.savefig('dummy_feature_imp_diagram.png',dpi=1000)
plt.show()
###########################################################################################################################
#Find a classification threshold from the ROC curve (a minimal sketch follows below)
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html
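#A minimal sketch: pick the probability cut-off that maximises Youden's J statistic (tpr - fpr) on the
#validation ROC computed above; fpr/tpr/thresholds and valid_preds come from the xgboost section.
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Chosen threshold:", optimal_threshold)
valid_pred_labels = (valid_preds >= optimal_threshold).astype(int)
print("F1 at chosen threshold:", f1_score(X_valid_xgb['Dependent_Variable'], valid_pred_labels))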