# <p style="background-color:#7cd4fc; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 95px 15px;">Pima Indians Diabetes Database</p>

In [None]:
from pandas import read_csv, DataFrame
from numpy import linspace

import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn theme in one call: 10x6 default figure size and fonts scaled by 1.3.
sns.set(font_scale=1.3, rc={'figure.figsize': (10, 6)})
# Apply the 'fivethirtyeight' matplotlib style sheet on top of the theme.
plt.style.use('fivethirtyeight')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

from sklearn.metrics import accuracy_score, plot_confusion_matrix

import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and dtype warnings that may point
# at real problems; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# <p style="background-color:#7cd4fc; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 95px 15px;">Analysis</p>

In [None]:
# Load the diabetes CSV into the working DataFrame used by every later cell.
# NOTE(review): the relative '../input/...' path is presumably the Kaggle
# input layout; adjust it when running elsewhere.
pima = read_csv('../input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Preview the first rows to check the column names and value formats.
pima.head()

In [None]:
# Show column dtypes and non-null counts (used to check for null values).
pima.info()

In [None]:
# Summary statistics per column; a minimum of 0 flags suspicious readings.
pima.describe()

<div style="color:black; background-color:#b1fc9a; border-radius:10px; padding:20px;">
- The dataset has no missing (null) values.<br/>
- The columns Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin and BMI contain 0 values that may be incorrectly reported.
</div>

# <p style="background-color:#7cd4fc; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 95px 15px;">Feature Engineering</p>

## Missing Values

In [None]:
# Select the rows with a zero Glucose reading once, then report the total
# and the per-Outcome breakdown.
zero_glucose_rows = pima.loc[pima['Glucose'].eq(0)]
print('Values 0:', repr(len(zero_glucose_rows)))
print(zero_glucose_rows.groupby('Outcome')['Outcome'].count())

In [None]:
# Replace the 0 placeholders in Glucose with the mean of the real (non-zero)
# readings from the same Outcome class.
#
# Only the Glucose column is written. A whole-row replace(0, ...) would also
# overwrite legitimate zeros in other columns (e.g. Pregnancies == 0) and turn
# Outcome into floats. The class means exclude the zero placeholders so that
# they do not drag the fill value down.
# NOTE(review): choosing the fill value by Outcome leaks the label into the
# features; consider a label-free imputation fitted on the training split.

# Snapshot of the rows about to be imputed.
Glucose_0 = pima[pima['Glucose'] == 0]

# Per-class mean computed from measured values only.
glucose_measured = pima.loc[pima['Glucose'] != 0, ['Outcome', 'Glucose']]
glucose_class_mean = glucose_measured.groupby('Outcome')['Glucose'].mean()

# Cast first so float means can be stored without an implicit dtype upcast.
pima['Glucose'] = pima['Glucose'].astype(float)
pima.loc[Glucose_0.index, 'Glucose'] = Glucose_0['Outcome'].map(glucose_class_mean)

In [None]:
# Select the rows with a zero BloodPressure reading once, then report the
# total and the per-Outcome breakdown.
zero_blood_pressure_rows = pima.loc[pima['BloodPressure'].eq(0)]
print("Values 0:", repr(len(zero_blood_pressure_rows)))
print(zero_blood_pressure_rows.groupby('Outcome')['Outcome'].count())

In [None]:
# Replace the 0 placeholders in BloodPressure with the mean of the real
# (non-zero) readings from the same Outcome class.
#
# Only the BloodPressure column is written. A whole-row replace(0, ...) would
# also overwrite legitimate zeros in other columns (e.g. Pregnancies == 0).
# The class means exclude the zero placeholders so they do not bias the fill.
# NOTE(review): choosing the fill value by Outcome leaks the label into the
# features; consider a label-free imputation fitted on the training split.

# Snapshot of the rows about to be imputed.
BloodPressure_0 = pima[pima['BloodPressure'] == 0]

# Per-class mean computed from measured values only.
blood_pressure_measured = pima.loc[pima['BloodPressure'] != 0, ['Outcome', 'BloodPressure']]
blood_pressure_class_mean = blood_pressure_measured.groupby('Outcome')['BloodPressure'].mean()

# Cast first so float means can be stored without an implicit dtype upcast.
pima['BloodPressure'] = pima['BloodPressure'].astype(float)
pima.loc[BloodPressure_0.index, 'BloodPressure'] = (
    BloodPressure_0['Outcome'].map(blood_pressure_class_mean)
)

In [None]:
# Select the rows with a zero SkinThickness reading once, then report the
# total and the per-Outcome breakdown.
zero_skin_rows = pima.loc[pima['SkinThickness'].eq(0)]
print("Values 0: " + repr(len(zero_skin_rows)))
print(zero_skin_rows.groupby('Outcome')['Outcome'].count())

In [None]:
# Replace the 0 placeholders in SkinThickness with the mean of the real
# (non-zero) readings from the same Outcome class.
#
# Only the SkinThickness column is written. A whole-row replace(0, ...) would
# also overwrite legitimate zeros in other columns (e.g. Pregnancies == 0).
# The class means exclude the zero placeholders so they do not bias the fill.
# NOTE(review): choosing the fill value by Outcome leaks the label into the
# features; consider a label-free imputation fitted on the training split.

# Snapshot of the rows about to be imputed.
SkinThickness_0 = pima[pima['SkinThickness'] == 0]

# Per-class mean computed from measured values only.
skin_measured = pima.loc[pima['SkinThickness'] != 0, ['Outcome', 'SkinThickness']]
skin_class_mean = skin_measured.groupby('Outcome')['SkinThickness'].mean()

# Cast first so float means can be stored without an implicit dtype upcast.
pima['SkinThickness'] = pima['SkinThickness'].astype(float)
pima.loc[SkinThickness_0.index, 'SkinThickness'] = (
    SkinThickness_0['Outcome'].map(skin_class_mean)
)

In [None]:
# Rows with SkinThickness above 60 are reported as abnormal cases: the count,
# the offending values, and the per-Outcome breakdown.
thick_skin_rows = pima.loc[pima['SkinThickness'].gt(60)]
print("Número de casos anormais na espessura das dobras cutâneas: " + repr(len(thick_skin_rows)))
print(thick_skin_rows['SkinThickness'])
print(thick_skin_rows.groupby('Outcome')['Outcome'].count())

In [None]:
# Overwrite the SkinThickness value at row position 579 with the column mean.
# NOTE(review): 579 is presumably the outlier surfaced by the "> 60" check in
# the preceding cell; confirm against that output.
#
# A single .iloc setter on the frame is used instead of the chained
# pima['SkinThickness'].iloc[579] = ..., which may write to a temporary copy
# and is a no-op under pandas copy-on-write.
pima['SkinThickness'] = pima['SkinThickness'].astype(float)  # mean is a float
pima.iloc[579, pima.columns.get_loc('SkinThickness')] = pima['SkinThickness'].mean()

In [None]:
# Select the rows with a zero Insulin reading once, then report the total
# and the per-Outcome breakdown.
zero_insulin_rows = pima.loc[pima['Insulin'].eq(0)]
print("Missing Values: " + repr(len(zero_insulin_rows)))
print(zero_insulin_rows.groupby('Outcome')['Outcome'].count())

In [None]:
# Replace the 0 placeholders in Insulin with the mean of the real (non-zero)
# readings from the same Outcome class.
#
# Only the Insulin column is written. A whole-row replace(0, ...) would also
# overwrite legitimate zeros in other columns (e.g. Pregnancies == 0).
# The class means exclude the zero placeholders so they do not bias the fill.
# NOTE(review): choosing the fill value by Outcome leaks the label into the
# features; consider a label-free imputation fitted on the training split.

# Snapshot of the rows about to be imputed.
Insulin_0 = pima[pima['Insulin'] == 0]

# Per-class mean computed from measured values only.
insulin_measured = pima.loc[pima['Insulin'] != 0, ['Outcome', 'Insulin']]
insulin_class_mean = insulin_measured.groupby('Outcome')['Insulin'].mean()

# Cast first so float means can be stored without an implicit dtype upcast.
pima['Insulin'] = pima['Insulin'].astype(float)
pima.loc[Insulin_0.index, 'Insulin'] = Insulin_0['Outcome'].map(insulin_class_mean)

In [None]:

# Select the rows with a zero BMI reading once, then report the total and
# the per-Outcome breakdown.
zero_bmi_rows = pima.loc[pima['BMI'].eq(0)]
print("Missing Values: " + repr(len(zero_bmi_rows)))
print(zero_bmi_rows.groupby('Outcome')['Outcome'].count())

In [None]:
# Replace the zero values in BMI with the mean of the real (non-zero)
# readings from the same Outcome class.
#
# Only the BMI column is written. A whole-row replace(0, ...) would also
# overwrite legitimate zeros in other columns (e.g. Pregnancies == 0).
# The class means exclude the zero placeholders so they do not bias the fill.
# NOTE(review): choosing the fill value by Outcome leaks the label into the
# features; consider a label-free imputation fitted on the training split.

# Snapshot of the rows about to be imputed.
BMI_0 = pima[pima['BMI'] == 0]

# Per-class mean computed from measured values only.
bmi_measured = pima.loc[pima['BMI'] != 0, ['Outcome', 'BMI']]
bmi_class_mean = bmi_measured.groupby('Outcome')['BMI'].mean()

# Cast first so float means can be stored without an implicit dtype upcast.
pima['BMI'] = pima['BMI'].astype(float)
pima.loc[BMI_0.index, 'BMI'] = BMI_0['Outcome'].map(bmi_class_mean)

<div style="color:black; background-color:#b1fc9a; border-radius:10px; padding:20px;">
The 0 values, which stand in for missing measurements, have been replaced by the average of the corresponding Outcome class.
</div>

# <p style="background-color:#7cd4fc; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 95px 15px;">Plots</p>

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlation_matrix = pima.corr()
plt.figure(figsize=(18, 10))
sns.heatmap(correlation_matrix, annot=True, cmap=plt.cm.Blues);

In [None]:
# Bar chart of how many rows fall in each Outcome class.
# The column is passed by keyword: on current seaborn the first positional
# argument is `data`, so a positional Series no longer means "x".
plt.figure(figsize=(10, 6))
sns.countplot(x='Outcome', data=pima);

<div style="color:black; background-color:#b1fc9a; border-radius:10px; padding:20px;">
There are more samples with Outcome 0 than with Outcome 1, so the dataset is imbalanced.
</div>

In [None]:
# Distribution plot for each of the first eight columns, laid out on a
# three-column grid.
columns = pima.columns[:8]
length = len(columns)

# The row count must be an integer: `length / 2` gives a float, which newer
# matplotlib rejects. Ceiling division gives just enough rows for all plots.
n_cols = 3
n_rows = -(-length // n_cols)

# Create the grid once. Calling plt.subplots(figsize=...) and then
# plt.subplot(...) leaves a stray full-figure axes behind the panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 20), squeeze=False)
fig.subplots_adjust(wspace=.4, hspace=.5)
grid_axes = axes.ravel()

for ax, column in zip(grid_axes, columns):
    sns.distplot(pima[column], ax=ax)
    ax.set_title(column)

# Hide the grid cells that have no column to show.
for ax in grid_axes[length:]:
    ax.set_visible(False)
plt.show()

In [None]:
# One box plot per column on a 3x3 grid, to expose outliers.
pima.plot.box(subplots=True, layout=(3, 3), figsize=(14, 10));

<div style="color:black; background-color:#b1fc9a; border-radius:10px; padding:20px;">
Looking at the box plots above, we can see several outliers.
</div>

# <p style="background-color:#7cd4fc; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 95px 15px;">Model</p>

In [None]:
# Build the feature matrix and target, hold out 20% of the rows for testing,
# then scale the features to [0, 1].
#
# The split happens BEFORE scaling and the scaler is fitted on the training
# rows only. Fitting it on all rows lets the test set's min/max leak into
# training. Test values may therefore fall slightly outside [0, 1].
# `x` stays the unscaled feature frame; the scaled data lives in
# x_train / x_test.
x = pima.drop(columns=['Outcome'])
y = pima['Outcome']

# A fixed seed makes the split (and every score below) reproducible;
# stratify keeps the class ratio equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, stratify=y, random_state=42
)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Compare several tree-based classifiers: 5-fold cross-validated accuracy on
# the training split, then accuracy on the held-out test split. The test
# accuracies are collected in `resultado`, which was previously created but
# never filled.
# NOTE(review): the estimators are created without a random_state, so scores
# vary from run to run.
colunas = ['Modelo','Acuracy']

models = [
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier()),
    ('BaggingClassifier', BaggingClassifier()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('ExtraTreeClassifier', ExtraTreeClassifier()),
    ("XGBClassifier", XGBClassifier()),
    ("LGBMClassifier", LGBMClassifier()),
]

linhas = []  # one (model name, test accuracy) record per model
for nome, model in models:
    scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
    print("Train CV Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), nome))
    model.fit(x_train, y_train)
    # accuracy_score expects (y_true, y_pred), in that order.
    test_accuracy = accuracy_score(y_test, model.predict(x_test))
    print("Test Accuracy: %0.2f " % (test_accuracy))
    print('_'*75)
    print('')
    linhas.append((nome, test_accuracy))

# Build the summary table once from the collected records, best model first.
resultado = (
    DataFrame(linhas, columns=colunas)
    .sort_values('Acuracy', ascending=False)
    .reset_index(drop=True)
)
resultado

In [None]:
# Fit a fresh gradient-boosting classifier on the training split and keep its
# test-set predictions.
model = GradientBoostingClassifier().fit(x_train, y_train)
pred = model.predict(x_test)

# Draw the test-set confusion matrix, then turn off the grid and add a title
# on the same axes.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
confusion_display = plot_confusion_matrix(model, x_test, y_test, cmap=plt.cm.Blues)
confusion_display.ax_.grid(False)
confusion_display.ax_.set_title('Result');