## Name: Jay Shah
## Date: 11-08-2021
### Pima Indians Diabetes Analysis 

In [None]:
import numpy as np
import pandas as pd
import xgboost
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier as rfc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import confusion_matrix

from mlxtend.plotting import plot_confusion_matrix

## 1. Reading the dataframe

In [None]:
# Location of the Pima Indians Diabetes CSV inside the Kaggle input directory.
diabetes_csv_path = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'

# Read the CSV into `df`; the bare expression below displays the frame.
df = pd.read_csv(diabetes_csv_path)
df

In [None]:
# `df.shape` is a (rows, columns) pair; unpack it once and report both values.
n_rows, n_cols = df.shape
print('Total number of rows are:', n_rows)
print('Total number of columns are:', n_cols)

In [None]:
# Show the column labels of the loaded dataframe.
df.columns

In [None]:
# Summary statistics for the dataframe's columns.
df.describe()

## 2. Checking if there are any NA values present or not 

In [None]:
# Count the NA entries in each column (one total per column).
df.isnull().sum()

## 3. Building a correlation matrix

In [None]:
# Pairwise correlation between the dataframe's columns, shown as a table.
df.corr()

In [None]:
# Compute the correlation matrix once and draw it as an annotated heatmap.
# `corr_features` holds the matrix's row labels, i.e. the correlated columns.
correlation_mat = df.corr()
corr_features = correlation_mat.index

fig, ax = plt.subplots(figsize=(20, 20))
g = sns.heatmap(correlation_mat, annot=True, cmap='RdYlGn', ax=ax)
plt.show()

## 4. Checking whether the dataset is balanced or not 

In [None]:
# Inspect the distinct values of the target column.
outcome = df['Outcome']
print('There are total', outcome.nunique(), 'unique values in the outcome column')
print('Unique values in outcome column are', outcome.unique())

In [None]:
# Count how many rows carry each of the two outcome labels.
outcome_col = df['Outcome']
n_false = outcome_col.eq(0).sum()
n_true = outcome_col.eq(1).sum()
print('Total number of 0(False count) are', n_false)
print('Total number of 1(True count) are', n_true)

In [None]:
# Bar chart of the number of rows per outcome label, drawn on an explicit axes.
colors = ['#04FFCD', '#FF04E6']
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=df, x='Outcome', palette=colors, ax=ax)
plt.show()

## 5. Checking some consistency in the dataset 

In [None]:
# Show the first 15 rows to inspect the raw feature values.
df.head(15)

> From the above output, it is clearly visible that some of the features have 0 as a value. The dataset is therefore not fully consistent, because these measurements cannot be 0; the zeros stand in for missing values. Below, I count how many such 0 entries are present in each feature column.

> To fill in these 0 values, I compute the mean of each feature that contains them and then replace its 0 entries with that mean below.

In [None]:
# For each feature column, report how many rows hold the value 0.
# Each entry pairs a column name with the message template used for it.
zero_count_reports = [
    ('Glucose', 'Number of rows missing Glucose: {0}'),
    ('BloodPressure', 'Number of rows missing Blood Pressure: {0}'),
    ('Insulin', 'Number of rows missing Insulin: {0}'),
    ('BMI', 'Number of rows missing BMI: {0}'),
    ('SkinThickness', 'Number of rows missing Skin Thickness: {0}'),
    ('Age', 'Number of rows missing Age: {0}'),
    ('DiabetesPedigreeFunction', 'Number of rows missing Diabetes Pedigree Function: {0}'),
]
for column_name, message in zero_count_reports:
    zero_rows = (df[column_name] == 0).sum()
    print(message.format(zero_rows))

In [None]:
# Replace the 0 entries of each listed feature with that feature's mean.
# The mean is taken over the column as it stands, zeros included.
#
# The result is assigned back to `df[col]` rather than calling
# `df[col].replace(..., inplace=True)`: the chained in-place form operates on a
# temporary Series and, under pandas copy-on-write, leaves `df` unchanged.
# Re-running this cell is harmless because no zeros remain after the first run.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'Insulin', 'BMI', 'SkinThickness']
for col in zero_as_missing_cols:
    x = df[col].mean()
    df[col] = df[col].replace(0, x)

In [None]:
# Show the first 10 rows after the zero-replacement cell above.
df.head(10)

## 6. Using Autoviz library for data visualization 

In [None]:
!pip install autoviz

In [None]:
!pip install xlrd

In [None]:
# Import AutoViz here, after the pip install cells above, and create the
# visualizer instance `AV` that the next cell calls.
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()

In [None]:
# Run AutoViz on the in-memory dataframe `df`.
# The filename argument is an empty string on purpose: per the AutoViz
# documentation, the dataframe passed as `dfte` is only used when no filename
# is given; with a non-empty filename AutoViz loads that file instead, so the
# charts would be built from the raw CSV rather than from `df`.
dft = AV.AutoViz('',
                 dfte=df,
                 header=0,
                 verbose=2,
                 lowess=False,
                 chart_format="svg",
                 max_rows_analyzed=1000,
                 max_cols_analyzed=10)

> Now, I will split the dataset into training and testing in the below block of code and then I will apply various machine learning algorithms for prediction.



## 7. Splitting the dataset for training & testing and standardizing the data

In [None]:
# Separate the feature columns from the `Outcome` target, then hold out 30% of
# the rows for testing (seeded so the split is repeatable).
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Display the training feature rows produced by the split.
X_train

> Now, we will standardize the data. Data standardization is the process of rescaling the attributes so that they have a mean of 0 and a variance of 1.

> The ultimate goal of standardization is to bring all the features to a common scale without distorting the differences in the ranges of their values.

> In sklearn.preprocessing.StandardScaler(), centering and scaling happens independently on each feature.

> The formula used for standardization is $z = (x - \mu)/\sigma$, where $\mu$ is the feature's mean and $\sigma$ is its standard deviation.

> fit_transform() is used on the training data so that we can scale the training data and also learn the scaling parameters of that data.

In [None]:
# Learn the scaling parameters from the training features only, then apply
# them to the same training features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

In [None]:
# Scale the test features with the scaler already fitted on the training data
# (transform only, no re-fitting on the test set).
X_test_scaled = scaler.transform(X_test)
X_test_scaled

## 8. Applying Random Forest Classifier Algorithm for prediction 

In [None]:
# Train a random forest classifier (seeded for repeatability) on the scaled
# training features. The labels are passed as a NumPy array via `to_numpy()`;
# `Series.ravel()` is deprecated in recent pandas releases.
rfc_model = rfc(random_state=10)
rfc_model.fit(X_train_scaled, y_train.to_numpy())

In [None]:
# Predict on the scaled test features and report accuracy against `y_test`.
y_predicted = rfc_model.predict(X_test_scaled)
rf_accuracy = metrics.accuracy_score(y_test, y_predicted)
print("Accuracy of Random Forest Model is = {0: .3f}".format(rf_accuracy))

In [None]:
# Keep the true test labels as a NumPy array for the metric cells below.
y_actual = y_test.to_numpy()
y_actual

> Now, we will calculate mean square error.

In [None]:
# Mean squared error between the true test labels and the current predictions.
# NOTE(review): if both arrays hold only 0/1 labels, this value equals the
# misclassification rate (1 - accuracy) — confirm this is the intended metric.
print("The mean squared error is:",mse(y_actual,y_predicted))

> Below, a classification report is printed and a confusion matrix is plotted.

In [None]:
# Per-class precision/recall/F1 summary for the current predictions.
target_names = ['class 0', 'class 1']
rf_report = classification_report(y_actual, y_predicted, target_names=target_names)
print(rf_report)

In [None]:
# Build the confusion matrix for the current predictions and plot it,
# labelling the returned axes directly.
conf_matrix = confusion_matrix(y_actual, y_predicted)
fig, ax = plot_confusion_matrix(conf_mat=conf_matrix, figsize=(8, 8), cmap=plt.cm.Greens)
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()

## 9. Hyper-parameter Optimization using RandomizedSearchCV with the XGBoost Classifier

In [None]:
# Candidate values for each hyper-parameter; the randomized search below
# samples combinations from these lists.
params = dict(
    learning_rate=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 13, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0, 0.1, 0.2, 0.3, 0.4, 0.42, 0.45],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
# Base XGBoost classifier and a randomized hyper-parameter search over `params`.
# A fixed random_state is set on both objects so that the sampled parameter
# combinations (and therefore the selected model) are the same on every re-run.
xgb_model = xgboost.XGBClassifier(eval_metric='logloss', random_state=42)
random_search = RandomizedSearchCV(xgb_model,
                                   param_distributions=params,
                                   n_iter=5,           # number of sampled parameter combinations
                                   scoring='roc_auc',  # candidates are ranked by ROC AUC
                                   n_jobs=1,
                                   cv=5,               # 5-fold cross-validation per candidate
                                   verbose=3,
                                   random_state=42
                                  )

> Below, RandomizedSearchCV is fitted on the scaled training data. No separate timer function is used here; the search's verbose output reports progress for each fit.

In [None]:
# Run the randomized search on the scaled training features. The labels are
# passed as a NumPy array via `to_numpy()`; `Series.ravel()` is deprecated in
# recent pandas releases.
random_search.fit(X_train_scaled, y_train.to_numpy())

In [None]:
# Take the best model found by the randomized search and print its settings.
# The estimator's `missing` marker is deliberately left at its default instead
# of being set to 1: a marker of 1 would make XGBoost treat every feature value
# equal to 1 as a missing value.
estimator = random_search.best_estimator_
print(estimator)

In [None]:
# Rebind `xgb_model` to the estimator selected by the search; the cells below
# use this name.
xgb_model = estimator

In [None]:
# Fit `xgb_model` on the scaled training features and the training labels.
xgb_model.fit(X_train_scaled,y_train)

In [None]:
# Predict on the scaled test features; this overwrites `y_predicted`, so the
# metric cells below now evaluate the XGBoost model.
y_predicted = xgb_model.predict(X_test_scaled)
y_predicted

In [None]:
# 10-fold cross-validation of `xgb_model` on the scaled training data; `score`
# holds one value per fold. The labels are passed as a NumPy array via
# `to_numpy()`; `Series.ravel()` is deprecated in recent pandas releases.
score = cross_val_score(xgb_model, X_train_scaled, y_train.to_numpy(), cv=10)
score

In [None]:
# Report the average of the per-fold cross-validation scores.
print("Score obtained after hyper parameter tuning in XgBoost is:",score.mean())

> The classification report is obtained below.

In [None]:
# Per-class precision/recall/F1 summary for the current predictions.
target_names = ['class 0', 'class 1']
xgb_report = classification_report(y_actual, y_predicted, target_names=target_names)
print(xgb_report)

> Now, we will calculate mean square error.

In [None]:
# Mean squared error between the true test labels and the current predictions.
# NOTE(review): if both arrays hold only 0/1 labels, this value equals the
# misclassification rate (1 - accuracy) — confirm this is the intended metric.
print("The mean squared error is:",mse(y_actual,y_predicted))

> Plotting the confusion matrix below:

In [None]:
# Build the confusion matrix for the current predictions and plot it,
# labelling the returned axes directly.
conf_matrix = confusion_matrix(y_actual, y_predicted)
fig, ax = plot_confusion_matrix(conf_mat=conf_matrix, figsize=(8, 8), cmap=plt.cm.Greens)
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()