In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# This notebook predicts whether a female patient is diabetic, based on the features provided.



There are a few general steps that are always performed in ML prediction tasks. Let's go through them step by step.




**Dataset Description**

1. Pregnancies: Number of times pregnant
2. Glucose: Plasma glucose concentration (mg/dl)
3. Blood Pressure: Diastolic blood pressure (mmHg)
4. Skin Thickness: A value used to estimate body fat. The normal triceps skinfold thickness in women is 23 mm. A higher thickness is associated with obesity, and the chance of diabetes increases.
5. Insulin: 2-hour serum insulin (mu U/ml)
6. BMI: Body Mass Index (weight in kg / (height in m)²)
7. Diabetes Pedigree Function: Provides information about the history of diabetes in relatives and the genetic relationship of those relatives to the patient. A higher pedigree function means the patient is more likely to have diabetes.
8. Age: Age (years)
9. Outcome: Class variable (0 or 1), where ‘0’ denotes that the patient does not have diabetes and ‘1’ denotes that the patient has diabetes.


# New facts based on the field details

A Skin Thickness above 23 mm is associated with a higher chance of diabetes.

With this information we can derive a new feature, "HighRisk":
if SkinThickness > 23 then HighRisk = 1, else HighRisk = 0.


In [None]:
#importing Libraries
import pandas as pd
import numpy as np

In [None]:
# Load the Pima Indians Diabetes dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

# Features are the first eight columns; the target is the last column.
# .copy() gives X its own data, so the cleaning steps that assign into X later
# do not act on a slice of df (which can emit SettingWithCopyWarning).
X = df.iloc[:, 0:8].copy()
y = df.iloc[:, -1]

# Preview only the first rows instead of rendering the whole frame.
# (The former mid-cell `df.head()` was never displayed: a notebook renders only
# the last expression of a cell.)
X.head()

# Data Analysis
1. The `describe` function gives a very clear picture of all the fields in your data.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the raw data.
df.describe()

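 1. There are no missing (null) values in the data.
 2. The minimum value is 0 for Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin and BMI. A zero is not physiologically possible for Glucose, BloodPressure, SkinThickness, Insulin or BMI, so those zeros are really missing measurements and will be replaced with NaN. (A zero in Pregnancies is a valid value, although the code below treats it as missing as well.)
 3. The scales (mean, std) vary widely across the fields, so we need to standardize the data to bring the features onto a comparable scale.
 Note: after standardization each feature has mean 0 and standard deviation 1, so most values fall roughly between -3 and 3.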

In [None]:
#libraries for plotting data
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = df.corr()

# Hide the upper triangle (diagonal included) so each pair of columns is shown once.
# The mask is built from the matrix *shape* rather than from its values, so a
# correlation that happens to be exactly 0 is still masked correctly.
matrix = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, annot=True, fmt='.1g', vmin=-1, vmax=1, center=0,
            cmap='YlOrRd', mask=matrix);


What do we learn from this heatmap?

1. Outcome:
    Outcome is most strongly correlated with Glucose and BMI.
2. Age and Pregnancies are positively correlated, i.e. the greater the age, the higher the number of pregnancies tends to be.
3. SkinThickness is strongly related to BMI and Insulin.

There is not a single feature that does not have a direct or indirect impact on the outcome.


**Data Preprocessing**

It involves:

Treating missing values
  1. Remove them if the values are not critical or do not form a major part of your data.
  2. Replace them with the mean or median value, depending on the business decision.

Dealing with outliers

Standardizing your data

In [None]:
# Count, for every feature column, how many entries are exactly zero.
feature = X.columns
is_zero = X[feature].eq(0)
dfzero = is_zero.sum()
dfzero

In [None]:
# Zeros in these columns are treated as missing measurements and become NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): 0 is a legitimate value for 'Pregnancies'; confirm whether it
# should really be treated as missing before relying on this column.
zero_as_missing = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
X[zero_as_missing] = X[zero_as_missing].replace(0, np.nan)

In [None]:
# Number of NaN entries per column after the zero -> NaN replacement.
X.isnull().sum()

**Fill NaN with the average of the data values.
NaN values can be treated with the mean, median or mode, depending on the type of data and the number of missing values.**

In [None]:
# Impute every missing value with the mean of its own column.
# The result is assigned back to X instead of calling
# X[col].fillna(..., inplace=True): that form is chained assignment on a
# temporary Series, emits a FutureWarning on recent pandas 2.x releases and
# leaves X unchanged under Copy-on-Write (the default from pandas 3.0).
cols_to_impute = ['Glucose', 'BMI', 'Pregnancies', 'BloodPressure', 'SkinThickness', 'Insulin']
# DataFrame.fillna with a Series fills each column using the value whose index
# label matches the column name, i.e. each column gets its own mean.
X[cols_to_impute] = X[cols_to_impute].fillna(X[cols_to_impute].mean())


In [None]:
X.describe() # re-check the summary: the zero minimums in the cleaned columns should be gone

In [None]:
X.isnull().sum() # every count should now be 0: no null values remain after imputation

Deriving AgeBracket based on the data pattern, which shows that
as Age increases, the Glucose level increases, leading to a higher risk of diabetes.

AgeBracket:

- Age 50 and above: patients are at high risk, so AgeBracket = 3

- Age from 30 up to (but not including) 50: patients are at medium risk, so AgeBracket = 2

- Age below 30: patients are at low risk, so AgeBracket = 1








In [None]:
# Scatter plot of Age (y-axis) against Glucose (x-axis), used to motivate the age brackets.
X1 = X['Glucose']
Y1 = X['Age']

fig, ax = plt.subplots()
ax.scatter(X1, Y1)
ax.set_xlabel('Glucose Level')
ax.set_ylabel('Age')
ax.set_title('Age Vs Glucose Chart')
plt.show()

In [None]:
# Derive two engineered columns from the cleaned measurements.

# HighRisk: 1 when SkinThickness is above 23, otherwise 0.
skin_above_threshold = X['SkinThickness'] > 23
X['HighRisk'] = np.where(skin_above_threshold, 1, 0)

# AgeBracket: 3 for Age >= 50, 2 for 30 <= Age < 50, 1 for Age < 30.
# Each rule pairs a boolean condition with the bracket it maps to.
age = X.Age
bracket_rules = [
    (age >= 50, 3),
    ((age >= 30) & (age < 50), 2),
    (age < 30, 1),
]
conditions, brackets = zip(*bracket_rules)
X['AgeBracket'] = np.select(list(conditions), list(brackets))


In [None]:
# Age and SkinThickness are dropped from the feature frame here.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
# (The former bare `X.head()` at the top of this cell was never displayed,
# since a notebook renders only the last expression of a cell.)
X = X.drop(columns=['Age', 'SkinThickness'], errors='ignore')

In [None]:
# Preview the first rows of the feature frame that will be split and modelled.
X.head()

SPLIT DATASET INTO TRAINING AND TEST DATASET

---



In [None]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
Xtrain.shape # (rows, columns) of the training split; the original run reported 614 rows and 8 columns


In [None]:
Xtest.shape # (rows, columns) of the test split; the original run reported 154 rows and 8 columns

In [None]:
# First rows of the training features, before scaling is applied.
Xtrain.head()

Standardization puts all features on a comparable scale (mean 0, standard deviation 1). Note that it does not remove outliers; it only rescales them.

* First fit the scaler on the training dataset, then use that fitted scaler to transform the test dataset.

P.S:- **do not fit on test dataset.**

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the first six feature columns; the remaining columns are left untouched.
Sc = StandardScaler()

# Learn the per-column mean and standard deviation from the training rows only...
Sc.fit(Xtrain.iloc[:, :6])

# ...then apply that same fitted transformation to both splits, so no
# statistics are estimated from the test rows.
Xtrain.iloc[:, :6] = Sc.transform(Xtrain.iloc[:, :6])
Xtest.iloc[:, :6] = Sc.transform(Xtest.iloc[:, :6])

In [None]:
Xtrain # the six scaled columns now have mean 0 and unit variance on the training rows; values are typically within about -3 to +3 but are not strictly bounded

# Build Data Model

In [None]:
from sklearn.svm import SVC
# Support-vector classifier created with scikit-learn's default hyper-parameters.
SVC_Classifier=SVC()
# Train on the training split; as the cell's last expression, the fitted estimator is displayed.
SVC_Classifier.fit(Xtrain,ytrain)

In [None]:
ypred=SVC_Classifier.predict(Xtest) # predicted class labels for the held-out test rows

# Evaluating Result
1. CONFUSION MATRIX
2. ACCURACY SCORE
3. ROC CURVE

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score
# Confusion matrix for the SVC predictions. Rows are the actual classes and
# columns the predicted classes, so for labels 0/1 the layout is [[TN, FP], [FN, TP]].
cm=confusion_matrix(ytest,ypred)
cm


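Reading the matrix (rows = actual class, columns = predicted class; class 1 = diabetic). The counts below are from the original run; update them if the matrix above changes.

95 :- True Negative, i.e. out of 154 test rows, 95 non-diabetic patients (0) are correctly predicted as 0

26 :- True Positive, i.e. 26 diabetic patients (1) are correctly predicted as 1

12 :- False Positive, i.e. 12 non-diabetic patients (0) are incorrectly predicted as 1

21 :- False Negative, i.e. 21 diabetic patients (1) are incorrectly predicted as 0

P.S. **The model outcome looks really promising.**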


In [None]:
# Accuracy: the fraction of test rows whose predicted label equals the true label.
score=accuracy_score(ytest,ypred)
print('Accuracy Score',score)

Yay! About 78% accuracy, but it can be improved.

In [None]:
from sklearn.metrics import roc_curve
# ROC analysis needs a continuous score per sample. Passing the hard 0/1
# predictions (ypred) yields a single operating point, so the "curve"
# degenerates into two straight segments. The signed distance to the SVM
# decision boundary provides the ranking that roc_curve expects.
svc_scores = SVC_Classifier.decision_function(Xtest)
fpr, tpr, _ = roc_curve(ytest, svc_scores)

plt.plot(fpr, tpr, label='SVC')
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - SVC')
plt.legend()
plt.show()

# Trying Random Forest Classifier now

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fixing random_state makes the fitted model, and therefore every metric below,
# reproducible across "Restart & Run All"; 0 matches the seed used for the split.
RF_Classifier = RandomForestClassifier(random_state=0)
# Train on the training split; as the cell's last expression, the fitted estimator is displayed.
RF_Classifier.fit(Xtrain, ytrain)

In [None]:
# Predicted class labels from the random forest for the held-out test rows.
ypred_RF= RF_Classifier.predict(Xtest)

 **Evaluating RF Classifier**


In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score
# Confusion matrix for the random-forest predictions. Rows are the actual classes
# and columns the predicted classes, so for labels 0/1 the layout is [[TN, FP], [FN, TP]].
cm_rf=confusion_matrix(ytest,ypred_RF)
cm_rf


Lets Check Score..!!

In [None]:
# Accuracy of the random forest: the fraction of test rows predicted correctly.
score_rf=accuracy_score(ytest,ypred_RF)
print('Score based on RandomForest model',score_rf)

In [None]:
from sklearn.metrics import roc_curve
# ROC analysis needs a continuous score per sample. Passing the hard 0/1
# predictions (ypred_RF) yields a single operating point, so the predicted
# probability of the positive class (column 1 of predict_proba) is used instead.
rf_scores = RF_Classifier.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = roc_curve(ytest, rf_scores)

plt.plot(fpr, tpr, label='Random Forest')
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Random Forest')
plt.legend()
plt.show()


**Please give an upvote if you liked my efforts. #MLBeginner**