In [None]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Location of the diabetes CSV in the plotly datasets repository.
DATA_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'

# To read a copy that already exists in the project repo, uncomment the line below
#data = pd.read_csv("diabetes1.csv")

## Otherwise download the dataset
data = pd.read_csv(DATA_URL)

# Preview the first rows of the frame
data.head()

# Summary statistics of the columns
data.describe()

# Overall picture: columns, dtypes and non-null counts
data.info()



## Inference

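1) There are no explicit missing (NaN) values.
2) A value of 0 is not possible for Glucose, BloodPressure, SkinThickness, Insulin and BMI. Such zeros are corrupted (effectively missing) measurements and must be handled before modelling.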

## Domain Analysis

1) Pregnancies:-Some women have diabetes before they get pregnant. This is called pregestational diabetes. Other women may get a type of diabetes that only happens in pregnancy. This is called gestational diabetes. Pregnancy can change how a woman's body uses glucose. This can make diabetes worse, or lead to gestational diabetes.

If you have gestational diabetes during pregnancy, generally your blood sugar returns to its usual level soon after delivery. But if you've had gestational diabetes, you have a higher risk of getting type 2 diabetes. You'll need to be tested for changes in blood sugar more often.

The risk of getting diabetes is 28% if the patient has conceived more than 2 or 3 times.

2) Glucose:-Glucose is your body's source of fuel. Your pancreas makes insulin to move glucose from your bloodstream into muscle, fat, and liver cells, where your body turns it into energy. People with diabetes have too much blood sugar because their body cannot move glucose into fat, liver, and muscle cells to be changed into and stored for energy.

3) Blood Pressure:-A person with diabetes is twice as likely to have high blood pressure as someone who does not have diabetes. When you have diabetes, high blood sugar can damage your blood vessels and the nerves that help your heart pump.Similarly, high blood pressure can create increased strain on your heart and blood vessels. When these two conditions occur together, they increase the risk of heart disease (cardiovascular disease) and stroke.
High blood pressure:-  According to a 2018 article, people with high blood pressure usually have insulin resistance and have an increased risk of developing diabetes compared to those with typical blood pressure. Blood pressure should be below 140/80mmHg for people with diabetes or below 130/80mmHg if you have kidney or eye disease or any condition that affects blood vessels and blood supply to the brain.

4) Skin Thickness:-Skin thickening is frequently observed in patients with diabetes. Affected areas of skin can appear thickened, waxy, or edematous. These patients are often asymptomatic but can have a reduction in sensation and pain. Although different parts of the body can be involved, the hands and feet are most frequently involved.Diabetes can cause changes in the small blood vessels. These changes can cause skin problems called diabetic dermopathy. Dermopathy often looks like light brown, scaly patches. These patches may be oval or circular.


5) Insulin:-Insulin is a hormone your pancreas makes to lower blood glucose, or sugar. If you have diabetes, your pancreas either doesn't make enough insulin or your body doesn't respond well to it. Your body needs insulin to keep the blood sugar level in a healthy range. Type 1 diabetes causes damage to the beta cells in your pancreas that make insulin. As a result, your body can't produce enough of this hormone. Type 2 diabetes gradually makes it harder for your beta cells to make enough insulin.


6) BMI:-Body mass index has a strong relationship to diabetes and insulin resistance. In obese individuals, the amount of nonesterified fatty acids, glycerol, hormones, cytokines, proinflammatory markers, and other substances that are involved in the development of insulin resistance, is increased. The pathogenesis in the development of diabetes is based on the fact that the β-islet cells of the pancreas are impaired, causing a lack of control of blood glucose. The development of diabetes becomes more inevitable if the failure of β-islet cells of the pancreas is accompanied by insulin resistance. Weight gain and body mass are central to the formation and rising incidence of type 1 and type 2 diabetes.

8) Age:-The prevalence of both type 2 diabetes and prediabetes increases with advancing age. The most important factors leading to hyperglycaemia are as follows: deficiency of insulin secretion developing with age, and growing insulin resistance caused by a change in body composition and sarcopaenia.The process of aging of the human body leads to impairment of energy homeostasis and abnormalities in carbohydrate metabolism. The most important causes of hyperglycaemia are thought to be deficiency of insulin secretion developing with age and growing insulin resistance.


In [None]:
# Univariate analysis: number of patients per pregnancy count.
# Each plot in this cell gets its own figure so that a later plot can never
# be drawn on top of an earlier one when the cell (or the whole notebook as a
# script) runs top to bottom.
plt.figure()
sns.countplot(x='Pregnancies',data=data)
# Maximum patients have conceived  1 and 0 times.
plt.show()

# Histogram of every column, laid out on a 3x3 grid (at most 9 columns are drawn).
fig,axes=plt.subplots(3,3,figsize=(20,25),facecolor='white')
for ax,column in zip(axes.flat,data.columns):
    sns.histplot(data[column],ax=ax)
    ax.set_xlabel(column,fontsize=20)
    ax.set_ylabel('Count',fontsize=20)
# Hide grid cells that have no column to show.
for ax in axes.flat[len(data.columns):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

## Bivariate Analysis

## Analyzing how pregnancies impact the patient with diabetes.
# A fresh figure is required here: without it seaborn draws on the current
# axes, which would be the last subplot of the histogram grid above.
plt.figure()
sns.countplot(x='Pregnancies',hue='Outcome',data=data)
plt.show()



## Inferences

1) A common assumption is that a pregnant patient will get diabetes. After analyzing this data, we found that as the number of pregnancies increases, the chance of being diabetic also increases.
2) From 14 to 17 pregnancies, all women in this data are diabetic.

In [None]:
## Distribution of Glucose, split by diabetes Outcome
sns.histplot(data=data,x='Glucose',hue='Outcome')

## Relationship between diabetes and glucose

1) In the given data, a person with a glucose level up to 100 can be considered to have a lower chance of being diabetic.

2) A person with a glucose level between 100 and 150 is about equally likely to be diabetic or non-diabetic, so more features of the patient need to be analyzed.

3) If a person has a glucose level above 150, the chance of being diabetic is higher.

In [None]:
## Scatter of Glucose against BloodPressure, with hue taken from Outcome
sns.relplot(data=data,x='Glucose',y='BloodPressure',hue='Outcome')
plt.show()

## Inference

Analyzing BP and Glucose together shows that more features are needed to understand the outcome. Once glucose rises above a certain level, the person is diabetic even if the BP is normal.

In [None]:
## Scatter of Glucose against SkinThickness, with hue taken from Outcome
sns.relplot(data=data,x='Glucose',y='SkinThickness',hue='Outcome')
plt.show()

## Inference

In the given data, there seems to be no defined relationship between glucose and skin thickness as far as diabetes is concerned. Other features are therefore needed to find a relation.

In [None]:
## Distribution of BloodPressure, split by Outcome

sns.histplot(data=data,x='BloodPressure',hue='Outcome')

## Inference

No specific relationship found.

In [None]:
# Scatter of BloodPressure against SkinThickness, with hue taken from Outcome

sns.relplot(data=data,x='BloodPressure',y='SkinThickness',hue='Outcome')
plt.show()

# Scatter of BloodPressure against Insulin, one panel per Outcome value

sns.relplot(data=data,x='BloodPressure',y='Insulin',col='Outcome')
plt.show()

# Distribution of Insulin, split by the target (Outcome)
sns.histplot(data=data,x='Insulin',hue='Outcome')

## Inference

A person with an insulin level up to about 100 is mostly non-diabetic. Beyond that, as the insulin level increases, people are at higher risk of being diabetic, i.e. when the insulin is not being utilised by the body.

# Data processing and Feature Engineering

In [None]:
# Step 1: Handling the missing values
data.isnull().sum()

# Step 2: Handling the corrupted data.
data.describe()

## In 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI' certain datapoints are zero.

data.loc[data['Glucose']==0]

# Statistic used to fill the zeros of each corrupted column
# (same mean/median choice per column as before).
zero_fill_stat={
    'Glucose':'mean',
    'BloodPressure':'mean',
    'SkinThickness':'median',
    'Insulin':'median',
    'BMI':'mean',
}

for column,stat in zero_fill_stat.items():
    # Compute the statistic from the valid (non-zero) readings only; including
    # the corrupted zeros would pull the fill value towards zero.
    valid=data.loc[data[column]!=0,column]
    fill_value=valid.agg(stat)
    # Assign the result back to the frame. Calling replace(..., inplace=True)
    # on data.<column> mutates an intermediate Series, which is not guaranteed
    # to update the frame (it does not under pandas Copy-on-Write).
    #dataframe.column.replace('Value to be replaced','By what value')
    data[column]=data[column].replace(0,fill_value)

# No zero Glucose readings should remain after the replacement
data.loc[data['Glucose']==0]

data.describe()

# Step 3:-Numerical representation of string data
import warnings
warnings.filterwarnings("ignore")

## Step 4:-Checking the outliers
# One box plot per column on a 3x3 grid (at most 9 columns are drawn).
fig,axes=plt.subplots(3,3,figsize=(20,25),facecolor='white')
for ax,column in zip(axes.flat,data.columns):
    # Pass the values explicitly as x= so the box is horizontal and the column
    # name labels the value axis. A positional argument is interpreted
    # differently across seaborn versions, and a 'Count' label does not apply
    # to a box plot.
    sns.boxplot(x=data[column],ax=ax)
    ax.set_xlabel(column,fontsize=20)
# Hide grid cells that have no column to show.
for ax in axes.flat[len(data.columns):]:
    ax.set_visible(False)
plt.tight_layout()

# Step 5:-Scaling the data
# Min-max scale every column except 'Pregnancies' and 'Outcome', then glue the
# two untouched columns back on to form final_df.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed later in the notebook, so test rows influence the scaling
# range. Consider fitting on the training rows only.

from sklearn.preprocessing import MinMaxScaler
sc=MinMaxScaler()
dl=['Pregnancies','Outcome']
features_to_scale=data.drop(dl,axis=1)
data1=sc.fit_transform(features_to_scale)

con_data=data[dl]

data1

data.columns

type(data1)
# Take the column labels and the index from the frame that was scaled instead
# of a hard-coded list, so the labels always match the scaled values and the
# concat below aligns row by row with con_data.
data2=pd.DataFrame(data1,columns=features_to_scale.columns,index=features_to_scale.index)

final_df=pd.concat([data2,con_data],axis=1)

final_df



# Feature Selection

In [None]:
# No redundant features were identified.
# Check the pairwise correlation between the scaled features.
feature_corr=data2.corr()
sns.heatmap(feature_corr,annot=True)

# Author's conclusion: no correlation, hence no feature is removed
# (the original note was cut off after "no features should be").

# Model Creation

In [None]:
# Step 1: separate the target from the independent variables
# (X is every column of final_df except the last one).
y=final_df['Outcome']
X=final_df.iloc[:,:-1]

# Step 2: hold out 30% of the rows for testing, with a fixed random_state.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=45)

y_test.shape

# Step 3: create the logistic-regression model and train it on the training rows.
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression().fit(X_train,y_train)

# Step 4: predict the class, then the class probabilities, for the test rows.
y_pred=clf.predict(X_test)

y_pred
y_pred_prob=clf.predict_proba(X_test)
y_pred_prob

# Number of rows per Outcome value in the full dataset
data.Outcome.value_counts()

# Evaluation of Model

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix of true labels against predictions
cm=confusion_matrix(y_test,y_pred)
print(cm)

# The same counts as a labelled table
pd.crosstab(y_test,y_pred)

# Overall accuracy
Acc=accuracy_score(y_test,y_pred)
Acc

# Recall
recall=recall_score(y_test,y_pred)
recall

# Precision
precision=precision_score(y_test,y_pred)
precision

# F1 score
f1score=f1_score(y_test,y_pred)
f1score

# Per-class text report
cr=classification_report(y_test,y_pred)
print(cr)

## Testing the model
# Number of test rows per Outcome value
y_test.value_counts()