In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Lift pandas' display truncation so every column and every row is rendered.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
# Read the Pima Indians diabetes CSV from the Kaggle input mount and preview
# the first five rows.
demo = pd.read_csv(
    filepath_or_buffer="/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
)
demo.head(5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise scatter plots of all columns, coloured by Outcome, with
# kernel-density estimates on the diagonal.
sns.pairplot(data=demo, hue="Outcome", diag_kind="kde")
plt.show()

In [None]:
# Feature selection: rank the predictors by gradient-boosting importance.
import warnings
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: this filter is process-wide and stays active for every later cell.
warnings.filterwarnings('ignore')

X = demo.drop(columns='Outcome')
# Use a 1-D Series as the target. A one-column DataFrame (demo[['Outcome']])
# is a column vector that scikit-learn has to flatten with a
# DataConversionWarning, which the filter above would silently hide.
y = demo['Outcome']

# Fixed seed so the importances are identical on every Restart & Run All.
gb = GradientBoostingClassifier(random_state=20)
gb.fit(X, y)

# Label each importance with its feature name and show the strongest first.
pd.Series(gb.feature_importances_, index=X.columns).sort_values(ascending=False)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# ANOVA F-test p-value of every feature against the outcome.
# `k` must be passed by keyword: it is keyword-only in scikit-learn >= 1.0,
# where SelectKBest(f_classif, 8) raises a TypeError.
skb = SelectKBest(f_classif, k=8)
# Only the fitted statistics are needed, so fit() replaces fit_transform().
# np.ravel accepts both a Series and a one-column DataFrame target.
skb.fit(X, np.ravel(y))

# Label each p-value with the feature it belongs to.
pd.Series(skb.pvalues_, index=X.columns)

In [None]:
import scipy.stats as stats

# Split the glucose readings by outcome and compare the two groups with an
# independent two-sample t-test.
plasdiabetic = demo.loc[demo['Outcome'] == 1, 'Glucose']
plasnondiabetic = demo.loc[demo['Outcome'] == 0, 'Glucose']
_glucose_ttest = stats.ttest_ind(plasdiabetic, plasnondiabetic)
print(_glucose_ttest)

In [None]:
# Column dtypes and non-null counts of the frame as loaded.
demo.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
demo.describe()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline

# Hold out 20% of the rows for testing; X and y come from the feature-selection
# cell above. The seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=20)

#With Pipeline
# `steps` is documented as a *list* of (name, estimator) tuples, so pass a list
# rather than a tuple of tuples.
pipe = Pipeline([
    ("pt", PowerTransformer()),
    ("lr", LogisticRegression()),
])

# np.ravel gives the estimator a 1-D target whether y is a Series or a
# one-column DataFrame, instead of relying on a suppressed conversion warning.
pipe.fit(X_train, np.ravel(y_train))
print("Testing Accuracy")
print(pipe.score(X_test, np.ravel(y_test)))
print("Training Accuracy")
print(pipe.score(X_train, np.ravel(y_train)))

In [None]:
# Coefficients of the fitted logistic-regression step ('lr') of the pipeline.
pipe.named_steps['lr'].coef_

In [None]:
# Reload the raw data, then mark zeros as missing in these five columns.
demo = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Assign the result back instead of calling replace(..., inplace=True) on
# demo[col]: that chained form operates on a temporary object under pandas
# Copy-on-Write and would leave `demo` unchanged.
demo[zero_as_missing] = demo[zero_as_missing].replace(0, np.nan)

In [None]:
# Re-check the non-null counts now that zeros have been replaced with NaN.
demo.info()

In [None]:
# Missing values with IterativeImputer, applied step by step (no Pipeline)
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

X = demo.drop(columns='Outcome')
y = demo['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=20
)

# Each transformer is fitted on the training split only and then applied,
# already fitted, to the test split.
it = IterativeImputer()
itXtrain = pd.DataFrame(it.fit_transform(X_train))
itXtest = pd.DataFrame(it.transform(X_test))

pt = PowerTransformer()
powerxtrain = pt.fit_transform(itXtrain)
powerxtest = pt.transform(itXtest)

sc = StandardScaler()
scaledxtrain = sc.fit_transform(powerxtrain)
scaledxtest = sc.transform(powerxtest)

lr = LogisticRegression()
lr.fit(scaledxtrain, y_train)

# Report held-out accuracy first, then training accuracy.
for _heading, _features, _target in (
    ("What is the Testing Accuracy", scaledxtest, y_test),
    ("What is the Training Accuracy", scaledxtrain, y_train),
):
    print(_heading)
    print(lr.score(_features, _target))

In [None]:
# Remove the Insulin column from the working frame.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
demo = demo.drop(columns='Insulin', errors='ignore')
demo.head()

In [None]:
#With Pipeline
from sklearn.pipeline import Pipeline

X=demo.drop('Outcome',axis=1)
y=demo['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=20)

# Impute -> power-transform -> standardise -> classify.
# `steps` is documented as a *list* of (name, estimator) tuples, so pass a list
# rather than a tuple of tuples.
pipe = Pipeline([
    ("it", IterativeImputer()),
    ("pt", PowerTransformer()),
    ("sc", StandardScaler()),
    ("lr", LogisticRegression()),
])
pipe.fit(X_train,y_train)
print("Testing Accuracy")
print(pipe.score(X_test,y_test))
print("Training Accuracy")
print(pipe.score(X_train,y_train))

In [None]:
#Including RFE
#With Pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Same preprocessing as before, followed by recursive feature elimination that
# keeps 3 features (dropping one per iteration) ahead of the final classifier.
# `steps` is documented as a *list* of (name, estimator) tuples, so pass a list
# rather than a tuple of tuples.
pipe = Pipeline([
    ("it", IterativeImputer()),
    ("pt", PowerTransformer()),
    ("sc", StandardScaler()),
    ("fs", RFE(estimator=LogisticRegression(), n_features_to_select=3, step=1)),
    ("lr", LogisticRegression()),
])
pipe.fit(X_train,y_train)
print("Testing Accuracy")
print(pipe.score(X_test,y_test))
print("Training Accuracy")
print(pipe.score(X_train,y_train))

In [None]:
# Coefficients of the final logistic-regression step ('lr') of the current pipeline.
pipe.named_steps['lr'].coef_

In [None]:
#Including SelectKBest
#With Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Same preprocessing as before, followed by univariate selection of the 3
# features with the best ANOVA F-score ahead of the final classifier.
# `steps` is documented as a *list* of (name, estimator) tuples, so pass a list
# rather than a tuple of tuples.
pipe = Pipeline([
    ("it", IterativeImputer()),
    ("pt", PowerTransformer()),
    ("sc", StandardScaler()),
    ("skb", SelectKBest(f_classif, k=3)),
    ("lr", LogisticRegression()),
])
pipe.fit(X_train,y_train)
print("Testing Accuracy")
print(pipe.score(X_test,y_test))
print("Training Accuracy")
print(pipe.score(X_train,y_train))

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    recall_score,
    precision_score,
    f1_score,
)

# Predict the held-out rows, then print the confusion matrix followed by the
# per-class classification report.
predicted = pipe.predict(X_test)
for _summary in (confusion_matrix, classification_report):
    print(_summary(y_test, predicted))

In [None]:
# Per-class recall, precision and F1 (average=None returns one value per class).
for _metric in (recall_score, precision_score, f1_score):
    print(_metric(y_test, predicted, average=None))

In [None]:
# F1 score at index 1 of the per-class array; presumably the diabetic class
# (Outcome == 1) — verify against the label order in y_test.
f1_score(y_test,predicted,average=None)[1]

In [None]:
# Evaluating the pipeline with 10-fold cross-validation on the training split
from sklearn.model_selection import cross_val_score

scoreslr = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)
print(scoreslr)

In [None]:
import numpy as np

# Mean and sample standard deviation (ddof=1) of the fold accuracies.
print("Average Accuracy of my model")
print(scoreslr.mean())
print("SD of accuracy of the model")
print(scoreslr.std(ddof=1))

In [None]:
# 95% Confidence Interval of Accuracy (Student's t on the fold accuracies)
import scipy.stats
# `import scipy.stats` only binds the name `scipy`; bind `stats` explicitly so
# this cell does not depend on an alias created in a much earlier cell.
from scipy import stats

# Derive the sample size from the scores instead of hard-coding the fold count.
n = len(scoreslr)
xbar = np.mean(scoreslr)
s = np.std(scoreslr, ddof=1)  # sample standard deviation
se = s / np.sqrt(n)           # standard error of the mean
stats.t.interval(0.95, df=n - 1, loc=xbar, scale=se)

In [None]:
#Learning Curve Demo
# Reload the CSV so the learning curves are computed on the data as stored in
# the file. The `print(__doc__)` that used to follow was boilerplate from a
# standalone script; in a notebook it only prints the interpreter's module
# docstring, so it has been removed.
demo = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic panels for an estimator's learning behaviour.

    Panel 0 shows training and cross-validation score against the number of
    training examples, panel 1 shows fit time against the number of training
    examples, and panel 2 shows cross-validation score against fit time.
    Each curve is the mean over the CV splits with a shaded +/- one standard
    deviation band.

    Parameters
    ----------
    estimator, X, y, cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve`` (called with
        ``return_times=True``).
    title : str
        Title of the first panel.
    axes : indexable of three matplotlib Axes, optional
        Axes to draw on; when ``None`` a new 1x3 figure of size (20, 5) is
        created.
    ylim : tuple, optional
        Unpacked into ``set_ylim`` on the first panel when given.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    def _shade(ax, xs, centre, spread, **style):
        # Translucent band of +/- one standard deviation around the mean curve.
        ax.fill_between(xs, centre - spread, centre + spread, alpha=0.1, **style)

    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    curve = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                           train_sizes=train_sizes,
                           return_times=True)
    train_sizes, train_scores, test_scores, fit_times, _ = curve

    # Aggregate over the CV splits (axis 1) for every training-set size.
    train_scores_mean, train_scores_std = (
        np.mean(train_scores, axis=1), np.std(train_scores, axis=1))
    test_scores_mean, test_scores_std = (
        np.mean(test_scores, axis=1), np.std(test_scores, axis=1))
    fit_times_mean, fit_times_std = (
        np.mean(fit_times, axis=1), np.std(fit_times, axis=1))

    # Panel 0: learning curve
    score_ax.grid()
    _shade(score_ax, train_sizes, train_scores_mean, train_scores_std, color="r")
    _shade(score_ax, train_sizes, test_scores_mean, test_scores_std, color="g")
    score_ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
                  label="Training score")
    score_ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
                  label="Cross-validation score")
    score_ax.legend(loc="best")

    # Panel 1: number of samples vs fit time
    time_ax = axes[1]
    time_ax.grid()
    time_ax.plot(train_sizes, fit_times_mean, 'o-')
    _shade(time_ax, train_sizes, fit_times_mean, fit_times_std)
    time_ax.set_xlabel("Training examples")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    # Panel 2: fit time vs cross-validation score
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(fit_times_mean, test_scores_mean, 'o-')
    _shade(perf_ax, fit_times_mean, test_scores_mean, test_scores_std)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt


# One column of three panels per model: GradientBoosting on the left,
# LogisticRegression on the right.
fig, axes = plt.subplots(3, 2, figsize=(10, 15))

X=demo.drop('Outcome',axis=1)
y=demo['Outcome']

# 50 random splits, each holding out 20% of the rows for validation, give
# smooth mean train/validation curves. The seed is fixed, so both models are
# evaluated on exactly the same splits and one splitter can be shared.
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)

title = "Learning Curves (GradientBoosting)"
# Seeded so the curve is reproducible across runs.
estimator = GradientBoostingClassifier(random_state=0)
plot_learning_curve(estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01),
                    cv=cv, n_jobs=4)

title = "Learning Curves (LogisticRegression)"
# The features are fed in raw and unscaled here, so give the solver more than
# the default 100 iterations to let it converge.
estimator = LogisticRegression(max_iter=1000)
plot_learning_curve(estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01),
                    cv=cv, n_jobs=4)

plt.show()