In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn-whitegrid")
import warnings
warnings.filterwarnings("ignore")

# Load and Check Data
Let's load our data set and take a quick first look. We will do a detailed review later.

In [None]:
data = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")
data.head(10)

In [None]:
data

In [None]:
data.columns.values.reshape(-1,1)

In [None]:
data.describe().T

# Variable Description 
1. Pregnancies: <br>
   Number of times pregnant
2. Glucose: <br>
   Plasma glucose concentration over 2 hours in an oral glucose tolerance test.
3. BloodPressure: <br>
   Diastolic blood pressure (mm Hg)
4. Skin Thickness: <br>
   Triceps skin fold thickness (mm)
5. Insulin: <br>
   2-Hour serum insulin (mu U/ml)
6. BMI: <br>
   This is a numerical value of your weight in relation to your height. A BMI between 18.5 and 25 kg/m² indicates a normal weight. A BMI of less than 18.5 kg/m² is considered underweight. A BMI between 25 kg/m² and 29.9 kg/m² is considered overweight.
7. DiabetesPedigreeFunction: <br>
   Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
8. Age: <br>
   Age (years)
9. Outcome: <br>
   Class variable (0 if non-diabetic, 1 if diabetic)  

In [None]:
data.info()

# Univariate Variable Analysis

In [None]:
def plot_hist(variable):
    """Show a 50-bin frequency histogram for one column of the global ``data`` frame.

    Parameters
    ----------
    variable : column label looked up in ``data``; it is also used as the x-axis label.
    """
    _, ax = plt.subplots(figsize=(9, 3))
    ax.hist(data[variable], bins=50)
    ax.set_xlabel(variable)
    ax.set_ylabel("Frequency")
    plt.show()
    
    
# Columns to visualise; "Outcome" is the class label, the rest are the features.
columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]

# One histogram figure per column.
for column_name in columns:
    plot_hist(column_name)

# Data Preparation & Outlier Detection <a id="2"></a>

In [None]:
data.isnull().sum()

![](https://media3.giphy.com/media/MVDPX3gaKFPuo/giphy.gif?cid=ecf05e47486a445c592a49fb8fca7f525cdd691411c8d66a&rid=giphy.gif)
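Yes! No column contains explicit NaN values, so we should celebrate this. Keep in mind, though, that zeros in columns such as Glucose, BloodPressure, SkinThickness, Insulin and BMI are not plausible measurements and may be placeholders for missing readings.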

In [None]:
# Outlier Detection: one box plot per column, each on its own figure.
for column_name in columns:
    _, ax = plt.subplots()
    sns.boxplot(x=data[column_name], ax=ax)
    plt.show()

In [None]:
# The frame contains no NaN (see the isnull() check), so filling NaN directly does
# nothing. A value of 0 is not a plausible reading for these measurements.
# NOTE(review): this assumes 0 is the data set's placeholder for "not measured" in
# these five columns - confirm against the data set documentation.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing] = data[zero_as_missing].replace(0, np.nan)

# Impute with the column mean (Glucose, BloodPressure) or median (the skewed columns).
# fillna with a dict returns a new frame, which avoids `inplace=True` on a column
# selection (chained assignment that pandas warns about and that does not update
# the parent frame under copy-on-write).
data = data.fillna({
    'Glucose': data['Glucose'].mean(),
    'BloodPressure': data['BloodPressure'].mean(),
    'SkinThickness': data['SkinThickness'].median(),
    'Insulin': data['Insulin'].median(),
    'BMI': data['BMI'].median(),
})

In [None]:
data.shape

In [None]:
import missingno as msno
p=msno.bar(data)

In [None]:
sns.countplot(x=data.Outcome,data=data)
plt.show()

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="BMI", data=data, kind="bar")
g.set_ylabels("BMI")
plt.show()

### Pregnancies

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="Pregnancies", data=data, kind="bar")
g.set_ylabels("Pregnancies")
plt.show()

### Glucose

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="Glucose", data=data, kind="bar")
g.set_ylabels("Glucose")
plt.show()

### BloodPressure

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="BloodPressure", data=data, kind="bar")
g.set_ylabels("BloodPressure")
plt.show()

### SkinThickness

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="SkinThickness", data=data, kind="bar")
g.set_ylabels("SkinThickness")
plt.show()

### Insulin

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="Insulin", data=data, kind="bar")
g.set_ylabels("Insulin")
plt.show()

### BMI

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="BMI", data=data, kind="bar")
g.set_ylabels("BMI")
plt.show()

### DiabetesPedigreeFunction

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="DiabetesPedigreeFunction", data=data, kind="bar")
g.set_ylabels("DiabetesPedigreeFunction")
plt.show()

### Age

In [None]:
# sns.factorplot was renamed to sns.catplot and later removed from seaborn;
# catplot(kind="bar") draws the same per-class bar of the y variable.
g = sns.catplot(x="Outcome", y="Age", data=data, kind="bar")
g.set_ylabels("Age")
plt.show()

In [None]:
# Heat map of the pairwise correlations between all columns of `data`.
f, ax = plt.subplots(figsize=(10, 8))
corr = data.corr()
sns.heatmap(
    corr,
    # All-False mask: no cell is hidden. The `np.bool` alias was removed from NumPy,
    # so the builtin `bool` is used as the dtype.
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(240, 10, as_cmap=True),
    square=True,
    ax=ax
    )
plt.show()

# Modelling

In [None]:
from sklearn import metrics
from sklearn import model_selection
from sklearn import neighbors
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# K-Nearest Neighbors
* Estimation is made on the similarities of the observations.
* In order to estimate "Y", the dependent variable of an observation whose independent variable values are given, the similarity (distance) between that observation and the other observations in the table is calculated.
<br><br>
- Determine the number of neighbors (K).
- Calculate distances from unknown point to all other points.
- Select the closest k-observation, with distances in order and according to the specified k number.
- For classification, the estimate is the most common class among the k neighbors; for regression, it is the average of their values.

In [None]:
# Target vector and feature frame.
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on
# pandas >= 2.0, which would scale every feature by the global extremes.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print("X_train: ", len(X_train))
print("X_test: ", len(X_test))
print("y_train: ", len(y_train))
print("y_test: ", len(y_test))

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train,y_train)
y_pred = knn_model.predict(X_test)
y_pred

In [None]:
print("mean of error squares: ", np.sqrt(mean_squared_error(y_test,y_pred)))
print("Score: ", knn_model.score(X_test,y_test)*100)

In [None]:
# find k value
# Fit one KNN classifier per k in 1..99 and record its accuracy on the test split.
# NOTE(review): choosing k from these test-split scores makes the reported accuracy optimistic.
k_values = range(1, 100)
score_list = [
    KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    for k in k_values
]

plt.plot(k_values, score_list)
plt.xlabel("k values")
plt.ylabel("accuracy")
plt.show()

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=43).fit(X_train,y_train)
y_pred = knn_model.predict(X_test)
print("mean of error squares: ", np.sqrt(mean_squared_error(y_test,y_pred)))
print("Accuracy: ", knn_model.score(X_test,y_test)*100)

In [None]:
# Another way: 5-fold cross-validated search over n_neighbors = 1..49 on the training split.
param_grid = {"n_neighbors": np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1, verbose=2)
knn_cv.fit(X_train, y_train)
# Predictions of the refitted best estimator on the test split.
y_pred = knn_cv.predict(X_test)

In [None]:
print("Accuracy: ", knn_cv.best_score_*100)

# Support Vector Classification
* It is a powerful and flexible modeling technique that can be used for both classification and regression. In its regression form it is a robust modeling technique.
* The aim is to find a line or curve, together with a margin around it, that covers as many points as possible with the smallest error.

In [None]:
# Rows of each class: Outcome == 0 and Outcome == 1.
zero = data[data["Outcome"] == 0]
one = data[data["Outcome"] == 1]
# scatter plot of BMI against DiabetesPedigreeFunction, one colour per class
for subset, colour, label in ((zero, "red", "zero"), (one, "green", "one")):
    plt.scatter(subset.BMI, subset.DiabetesPedigreeFunction, color=colour, label=label, alpha=0.3)
plt.legend()
plt.show()

In [None]:
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Outcome is a binary class label, so a support vector *classifier* is fitted; with
# the regressor (SVR) `.score` returns R^2, which is not an accuracy. `kernel` must
# be passed by keyword: current scikit-learn rejects positional constructor arguments.
# The variable keeps its original name because later cells refer to `svr_model`.
svr_model = SVC(kernel="linear").fit(X_train, y_train)
svr_model

In [None]:
y_pred = svr_model.predict(X_test)
print("mean of error squares: ", np.sqrt(mean_squared_error(y_test,y_pred)))
print("Accuracy: ", svr_model.score(X_test,y_test)*100)

In [None]:
# GRID SEARCH METHOD
# A classifier (SVC) is tuned because Outcome is a class label; GridSearchCV then
# scores candidates by accuracy instead of the regressor's R^2. `kernel` is passed by
# keyword because current scikit-learn rejects positional constructor arguments.
# Names are kept as `svr_*` because later cells refer to them.
svr_model = SVC(kernel="linear")
svr_params = {
    "C": [0.1, 0.5, 1, 3]  # penalty coefficient values
}
svr_cv_model = GridSearchCV(svr_model, svr_params, cv=5, verbose=2, n_jobs=-1).fit(X_train, y_train)

In [None]:
svr_cv_model.best_params_

In [None]:
print("Accuracy: ",svr_cv_model.best_score_*100)

# Artificial Neural Network
* It is one of the powerful machine learning algorithms that can be used for classification and regression problems that refer to the way the human brain processes information.
* The aim is to reach the coefficients that can make estimates with the smallest error.
![](https://www.researchgate.net/profile/Facundo_Bre/publication/321259051/figure/fig1/AS:614329250496529@1523478915726/Artificial-neural-network-architecture-ANN-i-h-1-h-2-h-n-o.png)


## Artificial Neural Network - Model & Predict

In [None]:
data.head()

In [None]:
# Fit a StandardScaler on the training split only, then apply the same
# transform to both the training and the test split.
# NOTE(review): X_train / X_test are whatever an earlier cell left in the kernel, and
# X_train_scaled / X_test_scaled are not referenced by any later cell visible in this
# notebook - confirm whether the models below were meant to use them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# The network weights are initialised randomly; seeding them makes the reported
# score reproducible across runs (same seed as the train/test split).
mlp_model = MLPClassifier(random_state=42).fit(X_train, y_train)
mlp_model

In [None]:
y_pred = mlp_model.predict(X_test)
print("mean of error squares: ", np.sqrt(mean_squared_error(y_test,y_pred)))
print("Accuracy: ", mlp_model.score(X_test,y_test)*100)

In [None]:
# another way: 10-fold cross-validated search over the L2 penalty (alpha) and the
# hidden-layer layout, run on the training split.
mlp_params = {
    "alpha": [0.1, 0.01, 0.02, 0.001, 0.0001],
    "hidden_layer_sizes": [(10, 20), (5, 5), (100, 100)],
}

mlp_cv_model = GridSearchCV(mlp_model, mlp_params, cv=10, verbose=2, n_jobs=-1)
mlp_cv_model.fit(X_train, y_train)

In [None]:
print("Accuracy: ", mlp_cv_model.best_score_*100)

# Classification Tree (CART) 
* The aim is to transform the complex structures in the dataset into simple decision structures.
* Heterogeneous data sets are divided into homogeneous subgroups according to a specified target variable.

In [None]:
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit a seeded decision tree and predict the test split.
card_model = DecisionTreeClassifier(random_state=42)
card_model.fit(X_train, y_train)
y_pred = card_model.predict(X_test)
# Root of the mean squared error between the 0/1 labels and the predictions.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
print("mean of error squares: ", np.sqrt(mean_squared_error(y_test,y_pred)))
print("Accuracy: ",card_model.score(X_test,y_test)*100)

In [None]:
# another way: 10-fold cross-validated search over tree depth and split size.
card_params = {
    "max_depth": [2, 3, 4, 5, 10, 20],
    # The original list contained 10 twice, so every depth was fitted twice for it.
    "min_samples_split": [2, 5, 10, 30, 50]
}
# Seeded like the standalone tree so the search result is reproducible.
card_model = DecisionTreeClassifier(random_state=42)
card_cv_model = GridSearchCV(card_model, card_params, cv=10, n_jobs=-1, verbose=2).fit(X_train, y_train)

In [None]:
print("Accuracy: ",card_cv_model.best_score_*100)

# Random Forests
* The basis is based on gathering and evaluating the estimates produced by more than one decision tree created by the bootstrap method.
- Lowers the root mean squared error (RMSE).
- Increases the correct classification rate.
- It reduces variance and is resistant to memorization.
- It is based on combining the predictions produced by multiple decision trees.
- Observations for trees are selected using the random sample selection method of bootstrap, and random subspace method.
- In each node of the decision tree, the best branching variable is chosen among the less number of variables randomly selected from all variables.
- 2/3 of the data set is used in tree building. Outside data is used for performance evaluation of trees and determination of variable significance.
- A random subset of the variables is considered at each node (split).

In [None]:
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Reported as a percentage, like every other model in this notebook.
print("Accuracy: ", rf_model.score(X_test, y_test) * 100)

In [None]:
# another way: 10-fold cross-validated search over the forest's main hyper-parameters.
rf_params = {
    "max_depth": [5, 8, 10],
    # Only 8 feature columns remain after dropping "Outcome", so a value of 10 is
    # out of range and every candidate using it fails to fit; 8 means "all features".
    "max_features": [2, 5, 8],
    "n_estimators": [200, 500, 1000, 2000],
    "min_samples_split": [2, 10, 80, 100]
}

rf_cv_model = GridSearchCV(rf_model, rf_params, cv=10, n_jobs=-1, verbose=2).fit(X_train, y_train)

In [None]:
print("Accuracy: ", rf_cv_model.best_score_*100)

# Gradient Boosting Machines (GBM)
* GBM is a generalized version of AdaBoost that can be easily adapted to classification and regression problems.
* A series of models in the form of a single predictive model is established on the residuals.
* Finding coefficients or decision rules to minimize the mean of error squares.
* Gradient Boosting creates a series of models in the form of a single predictive model.
* Each model in the series is fitted to the prediction residuals (errors) of the previous model in the series.
* GBM can use many basic learner types.
* COST functions and link functions can be modified.
* Boosting + Gradient Descent

### Introduction to Boosting Methods
* Boosting is based on the idea of weak learners coming together to form a strong learner.
* Bad estimation is the big value that results from squaring the difference between the real and the estimated values. Trees that make bad predictions are also weak estimators.

### AdaBoost (Adaptive Boosting)
* It is the algorithm that introduced the idea of weak classifiers coming together to form a strong classifier.

![](https://miro.medium.com/proxy/1*m2UHkzWWJ0kfQyL5tBFNsQ.png)

* Classification process has been made in the 1st picture
* + Symbols in the box represent blue classification, - symbols represent red classification. In this case, some symbols are in the wrong box.
* In the second picture, a classification was made again considering the situations in the first picture.
* In the third picture, a new classification was made by taking the second picture into consideration.
* The 4th image classification process has been provided successfully.

In [None]:
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Gradient boosting with default hyper-parameters, scored on the test split.
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)
y_pred = gbm_model.predict(X_test)
print("Accuracy: ", gbm_model.score(X_test, y_test) * 100)

In [None]:
# another way: 10-fold cross-validated search over the boosting hyper-parameters.
# GridSearchCV clones and refits the estimator for every candidate, so the template
# does not need to be fitted first.
gbm_model = GradientBoostingClassifier()
gbm_params = {
    # The original list repeated 0.001 and skipped 0.01.
    "learning_rate": [0.001, 0.01, 0.1],
    "max_depth": [3, 5, 8],
    "n_estimators": [100, 200, 500],
    # An integer value must be at least 2 (floats are fractions of the samples);
    # the integer 1 is rejected by scikit-learn, so those candidates never fitted.
    "min_samples_split": [2, 0.5, 0.8],
}

gbm_cv_tuned = GridSearchCV(gbm_model, gbm_params, cv=10, n_jobs=-1, verbose=2).fit(X_train, y_train)

In [None]:
print("Accuracy: ", gbm_cv_tuned.best_score_*100)

# XGBoost
* XGBoost is a scalable version of GBM, optimized to increase speed and prediction performance.
* It is scalable.
* It is fast.
* Prediction success is high.
* It proved its success in many kaggle competitions.

In [None]:
import xgboost
from xgboost import XGBClassifier

y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# XGBoost with default hyper-parameters, scored on the test split.
xgb = XGBClassifier().fit(X_train, y_train)
print("Accuracy: ", xgb.score(X_test, y_test) * 100)

In [None]:
# Unfitted template estimator; GridSearchCV clones it for every candidate.
xgb = XGBClassifier()

xgb_params = {
    # The original list repeated 0.01, so a third of the fits were duplicates.
    "learning_rate": [0.01, 0.1, 0.5],
    "max_depth": [2, 3, 4, 5, 8],
    "n_estimators": [100, 200, 500, 1000],
    "colsample_bytree": [0.4, 0.7, 1]
}

# 10-fold cross-validated search on the training split.
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv=10, n_jobs=-1, verbose=1).fit(X_train, y_train)

In [None]:
print("Accuracy: ", xgb_cv_model.best_score_*100)

# LightGBM
* Light GBM is another type of GBM developed to increase XGBoost's training time performance.
* More performance.
* Leaf-wise growth strategy instead of level-wise growth strategy.
* Depth-first search (DFS) instead of Breadth-first search (BFS).

In [None]:
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from lightgbm import LGBMClassifier

# LightGBM with default hyper-parameters, scored on the test split.
lgb_model = LGBMClassifier().fit(X_train, y_train)
print("Accuracy: ", lgb_model.score(X_test, y_test) * 100)

In [None]:
# 10-fold cross-validated search over learning rate, number of trees and tree depth.
lgbm_params = {
    "learning_rate": [0.01, 0.1, 0.5, 1],
    "n_estimators": [20, 40, 100, 200, 500, 1000],
    "max_depth": list(range(1, 11)),
}

lgbm_cv_model = GridSearchCV(lgb_model, lgbm_params, cv=10, n_jobs=-1, verbose=2)
lgbm_cv_model.fit(X_train, y_train)

In [None]:
print("Accuracy: ", lgbm_cv_model.best_score_*100)

# CatBoost (Category Boosting)
* Another fast, successful GBM derivative that can handle categorical variables automatically.
* Categorical variable support
* Fast and scalable GPU support
* More successful predictions
* Fast train and fast prediction
* Russia's first open source, successful ML study

In [None]:
y = data["Outcome"].values
x_data = data.drop(["Outcome"], axis=1)
# Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
# np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from catboost import CatBoostClassifier
# Seeded CatBoost with otherwise default hyper-parameters, scored on the test split.
catb_model = CatBoostClassifier(random_state=42).fit(X_train, y_train)
print("Accuracy: ", catb_model.score(X_test, y_test) * 100)

In [None]:
# another way: 5-fold cross-validated search over iterations, learning rate and depth.
catb_params = {
    "iterations": [200, 500, 100],
    "learning_rate": [0.01, 0.1],
    "depth": [3, 6, 8],
}
catb_model = CatBoostClassifier()
catb_cv_model = GridSearchCV(catb_model, catb_params, cv=5, n_jobs=-1, verbose=2)
catb_cv_model.fit(X_train, y_train)

In [None]:
print("Accuracy: ", catb_cv_model.best_score_*100)

# Automation of Machine Learning Tasks

In [None]:
def compML(df, alg):
    """Fit one estimator class with default settings and print its test-split score.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus an "Outcome" target column.
    alg : estimator class (not an instance); it is instantiated with no arguments.

    Prints ``<class name> :  <score * 100>`` where the score comes from the fitted
    estimator's own ``score`` method on the held-out 33% of the rows.
    """
    # Use the frame that was passed in; the original ignored `df` and always read
    # the notebook-global `data`.
    y = df["Outcome"].values
    x_data = df.drop(["Outcome"], axis=1)
    # Min-max scale each feature to [0, 1]. DataFrame.min()/.max() reduce column-wise;
    # np.min/np.max on a DataFrame collapse to one scalar for the whole frame on pandas >= 2.0.
    X = (x_data - x_data.min()) / (x_data.max() - x_data.min())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    model = alg().fit(X_train, y_train)
    model_name = alg.__name__
    print(model_name, ": ", model.score(X_test, y_test) * 100)

In [None]:
# Estimator classes to compare with their default settings.
models = [
    LGBMClassifier,
    XGBClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    DecisionTreeClassifier,
    MLPClassifier,
    KNeighborsClassifier,
    # SVC instead of SVR: a regressor's `score` is R^2, which is not comparable with
    # the accuracies printed for the classifiers above.
    SVC
]

for i in models:
    compML(data, i)

There are better results found with GridSearch above. I do not repeat because the procedures are long.