In [None]:
import pandas as pd, numpy as np, seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the diabetes CSV into a DataFrame; no index column is specified,
# so rows get the default 0..n-1 index.
csv_path = "../input/diabetes-data-set/diabetes.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

## Feature selection techniques: correlation heatmap, tree-based importances, and univariate (chi-squared) selection

In [None]:
# Pairwise correlation between the columns of `data`.
corrmat = data.corr()
top_corr_feat = corrmat.index
plt.figure(figsize=(7,7))
# Plot the matrix computed above. Selecting the same columns and calling
# .corr() a second time would only reproduce `corrmat`.
g = sns.heatmap(corrmat, annot=True,
                cmap='gist_rainbow')

In [None]:
# Tree-based feature importance from a random forest.

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Predictors are every column except the label. `feat` and `target` are
# reused by the later feature-selection cells.
feat = data.drop("Outcome",axis=1)
target = data["Outcome"]

feature_names = np.array(feat.columns)
# Fixed seed so the importance ranking is identical on every run.
# NOTE: the name `RFC` is rebound to the RandomForestClassifier class by a
# later import cell in this notebook.
RFC = RandomForestClassifier(random_state=45).fit(feat,target)
# Impurity-based importances are already non-negative, so no abs() is needed.
importance = RFC.feature_importances_
sns.barplot(x=importance, y=feature_names)
plt.title("Feature importances")
plt.show()

In [None]:
# Feature importance from an extra-trees ensemble.
from sklearn.ensemble import ExtraTreesClassifier
# Fixed seed: extra-trees picks split thresholds at random, so an unseeded
# model yields a different importance vector on every run.
model = ExtraTreesClassifier(random_state=45)
model.fit(feat,target)
model.feature_importances_

In [None]:
# Horizontal bar chart of the fitted model's importances, one bar per feature.
feat_importance = pd.Series(data=model.feature_importances_, index=feat.columns)
feat_importance.plot.barh()
plt.show()

In [None]:
# Univariate selection: score each feature against the target with the
# chi-squared test and keep the five highest-scoring ones.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split performed further down, so held-out rows influence which
# features are chosen. Confirm this is acceptable.

from sklearn.feature_selection import SelectKBest, chi2

bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(feat, target)

# One row per feature, pairing its name with its chi-squared score.
feat_scores = pd.DataFrame({'Feature': feat.columns, 'Score': fit.scores_})
feat_scores.nlargest(5, 'Score')  # top 5 features

In [None]:
# Keep the five best-scoring features as a small table for later lookup.
report = feat_scores.nlargest(n=5, columns='Score')

In [None]:
# Names of the selected features, taken from the top-five table.
optimum_features = report.loc[:, 'Feature']

In [None]:
# Reduced frame: the selected feature columns followed by the label column.
new_data = data[[*optimum_features, "Outcome"]]

In [None]:
# Preview the first five rows of the reduced frame.
new_data.head(n=5)

In [None]:
from sklearn.decomposition import PCA

# Project the selected features onto a single principal component.
pca = PCA(n_components=1)
X_pca = pca.fit_transform(new_data.drop(columns='Outcome'))

# Pair the component with the label, aligned on the shared row index.
PCA_df = pd.DataFrame(X_pca, columns=['PC1']).assign(Outcome=new_data['Outcome'])
PCA_df.head()

In [None]:
# Scatter the label against the first principal component and overlay a
# linear fit.
sns.regplot(data=PCA_df, x='PC1', y='Outcome',
            color='red', marker='+', fit_reg=True)
plt.show()

In [None]:
# Hold out 10% of the rows as a test set, stratified on the label.
from sklearn.model_selection import train_test_split

X = new_data.drop(columns="Outcome").to_numpy()
y = new_data["Outcome"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=45,
    shuffle=True,
    stratify=y,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC,ExtraTreesClassifier as XTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Import only the two metric helpers used here. A wildcard import would pull
# every public sklearn.metrics name into the notebook namespace and hide
# where names come from.
from sklearn.metrics import make_scorer, matthews_corrcoef

# Matthews correlation coefficient wrapped as a scorer object so it can be
# passed as `scoring=` to cross_val_score.
mcc = make_scorer(matthews_corrcoef)

def evaluate_model(cv, estimator=None):
    """Return the mean cross-validated MCC of a classifier on the training split.

    Parameters
    ----------
    cv : cross-validation splitter or int
        Forwarded unchanged to ``cross_val_score``.
    estimator : scikit-learn classifier, optional
        Unfitted model to score. Defaults to a random forest with a fixed
        seed, so repeated calls with the same ``cv`` give the same score.

    Returns
    -------
    float
        Mean of the per-fold Matthews correlation coefficients.
    """
    if estimator is None:
        estimator = RFC(random_state=45)
    # Reads the notebook-level X_train / y_train arrays and the `mcc` scorer.
    scores = cross_val_score(estimator, X_train, y_train,
                             scoring=mcc,
                             cv=cv, n_jobs=-1)
    return scores.mean()

In [None]:
# Compare fold counts 5..10 using the current `evaluate_model`.

folds = range(5, 11)

# Mean cross-validated score for each fold count, in the order of `folds`.
means = []

for n_splits in folds:
    # Shuffled K-fold splitter with a fixed seed for this fold count.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    mean_score = evaluate_model(splitter)
    print('> folds=%d, rfc mean score = %.3f ' % (n_splits, mean_score))
    means.append(mean_score)

In [None]:
# Fit the random forest on the training split and keep a pickled snapshot.
import pickle

# Fixed seed so the fitted forest, and therefore its test-set results, are
# the same on every run.
model = RFC(random_state=45)
model.fit(X_train,y_train)
# Serialized to in-memory bytes only; nothing is written to disk here.
model1 = pickle.dumps(model)

In [None]:
#evaluate extratreesclassif

def evaluate_model(cv, estimator=None):
    """Return the mean cross-validated MCC of a classifier on the training split.

    This definition replaces the earlier random-forest ``evaluate_model``:
    once this cell has run, ``evaluate_model(cv)`` scores extra-trees.

    Parameters
    ----------
    cv : cross-validation splitter or int
        Forwarded unchanged to ``cross_val_score``.
    estimator : scikit-learn classifier, optional
        Unfitted model to score. Defaults to an extra-trees classifier with
        a fixed seed, so repeated calls with the same ``cv`` give the same
        score.

    Returns
    -------
    float
        Mean of the per-fold Matthews correlation coefficients.
    """
    if estimator is None:
        estimator = XTC(random_state=45)
    # Reads the notebook-level X_train / y_train arrays and the `mcc` scorer.
    scores = cross_val_score(estimator, X_train, y_train,
                             scoring=mcc,
                             cv=cv, n_jobs=-1)
    return scores.mean()


In [None]:
# Compare fold counts 5..10 using the current `evaluate_model`.
folds = range(5, 11)

# Mean cross-validated score for each fold count, in the order of `folds`.
means = []

for n_splits in folds:
    # Shuffled K-fold splitter with a fixed seed for this fold count.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    mean_score = evaluate_model(splitter)
    print('> folds=%d, xtc mean score = %.3f ' % (n_splits, mean_score))
    means.append(mean_score)

In [None]:
# Fit extra-trees on the training split and keep a pickled snapshot.
# Fixed seed so the fitted ensemble, and therefore its test-set results,
# are the same on every run.
model = XTC(random_state=45)
model.fit(X_train,y_train)
# Serialized to in-memory bytes only; `pickle` is imported in an earlier cell.
model2 = pickle.dumps(model)

In [None]:
# Rebuild both fitted classifiers from their in-memory pickles.
# These bytes were produced by this notebook; never unpickle untrusted data.
rfc, xtc = (pickle.loads(blob) for blob in (model1, model2))

In [None]:
# Stacked ensemble: random forest and extra-trees are the base estimators,
# logistic regression is the final estimator.

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression as LR

base_model = [('random_forest', rfc), ('xtra_trees', xtc)]
end_model = LR()
final_model = StackingClassifier(estimators=base_model,
                                 final_estimator=end_model,
                                 cv=10)

# 10-fold cross-validated MCC of the whole stack on the training split.
scores = cross_val_score(final_model, X_train, y_train,
                         scoring=mcc, cv=10, n_jobs=-1)
scores.mean()

In [None]:
# Train the stacked model on the training split (fit returns the estimator
# itself), then snapshot the fitted model as in-memory bytes.
model3 = pickle.dumps(final_model.fit(X_train, y_train))

In [None]:
# Predict the held-out test set with each of the three fitted models.
rfc_pred, xtc_pred, final_model_pred = (
    clf.predict(X_test) for clf in (rfc, xtc, final_model)
)

In [None]:
# classification_report is imported under its own name: aliasing it to
# `report` would overwrite the `report` DataFrame (top-five feature table)
# built earlier in the notebook. The `cm` alias is kept because the
# confusion-matrix cell below relies on it.
from sklearn.metrics import classification_report, confusion_matrix as cm

# Heading text paired with the predictions it describes, in print order.
report_inputs = (
    ("report on random forest classifier : \n", rfc_pred),
    ("report on extra trees classifier : \n", xtc_pred),
    ("report on stacked classifier : \n", final_model_pred),
)
for position, (heading, y_hat) in enumerate(report_inputs):
    if position:
        # Blank separator between consecutive reports.
        print('\n')
    print(heading, classification_report(y_pred=y_hat, y_true=y_test))

In [None]:
# Confusion matrix per model on the test set, with class order fixed to [0, 1].
matrix_inputs = (
    ("matrix of random forest classifier : \n", rfc_pred),
    ("matrix of extra trees classifier : \n", xtc_pred),
    ("matrix of stacked classifier : \n", final_model_pred),
)
for position, (heading, y_hat) in enumerate(matrix_inputs):
    if position:
        # Blank separator between consecutive matrices.
        print('\n')
    print(heading, cm(y_pred=y_hat, y_true=y_test, labels=[0,1]))