### Import libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
sns.set()

### Load the data

In [None]:
# Read the Pima Indians Diabetes CSV from the Kaggle input directory into a DataFrame
# and preview the first rows.
data=pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
data.head()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the raw data.
data.describe()

### Data Cleaning

In [None]:
# Count the missing (NaN) entries in every column.
data.isnull().sum()
# Observation: no column reports any null value.

In [None]:
# Apart from Pregnancies and Outcome, none of these measurements can legitimately be zero,
# so a zero is treated as a placeholder for a missing reading and replaced by the column median.
# The median is taken over the non-zero readings only: including the placeholder zeros
# would drag the replacement value down.
zero_as_missing=['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in zero_as_missing:
    valid_median=data.loc[data[col] != 0, col].median()
    data[col]=data[col].mask(data[col] == 0, valid_median)
# Use the fully-qualified option name; the bare "max_rows" pattern can match more than one
# pandas option and is then rejected as ambiguous.
pd.set_option("display.max_rows", None)
data

In [None]:
# Re-check the summary statistics now that the zero placeholders have been replaced.
data.describe()

In [None]:
# The summary statistics suggest outliers in Insulin; plot its distribution to confirm.
sns.histplot(data['Insulin'])

In [None]:
# Treat the top 2% of Insulin readings as outliers: keep only the rows strictly below
# the 98th-percentile cutoff.
p=data["Insulin"].quantile(.98)
data2=data[data['Insulin']<p]
# Plot the *filtered* frame so the histogram actually shows the effect of the cutoff.
sns.histplot(data2['Insulin'])

In [None]:
# data2 holds the rows that survived the Insulin outlier filter, so its index has gaps.
# Reset the index (dropping the old one) so the rows are numbered contiguously again.
data_cleaned=data2.reset_index(drop=True)

In [None]:
# Summary statistics of the cleaned data; include='all' covers every column regardless of dtype.
data_cleaned.describe(include='all')

### Data Visualization

In [None]:
# One row of seven scatter panels sharing the y-axis: each panel plots Outcome against
# a single feature so the relationship can be eyeballed per variable.
f,(ax1, ax2, ax3, ax4, ax5, ax6, ax7)=plt.subplots(1,7, sharey=True , figsize=(18,3))
# (axis, feature column, panel title) -- kept as data so adding a panel is a one-line change.
panels=(
    (ax1, 'BMI', "Outcome and BMI"),
    (ax2, 'Insulin', "Outcome and Insulin"),
    (ax3, 'SkinThickness', "Outcome and Skin Thickness"),
    (ax4, 'Glucose', "Outcome and Glucose"),
    (ax5, 'Age', "Outcome and Age"),
    (ax6, 'BloodPressure', "Outcome and Blood Pressure"),
    (ax7, 'DiabetesPedigreeFunction', "Outcome and DPF"),
)
for axis, feature, title in panels:
    axis.scatter(data_cleaned[feature],data_cleaned["Outcome"])
    axis.set_title(title)

### Declare independent and dependent variables

In [None]:
# Dependent variable: the Outcome column that the model will predict.
target=data_cleaned['Outcome']
# Independent variables: the eight feature columns used as predictors.
inputs=data_cleaned[['BMI','Insulin','SkinThickness','Glucose','Age','BloodPressure','Pregnancies','DiabetesPedigreeFunction']]

### Standardisation

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column; fit() returns the scaler itself, so it can be
# created and fitted in one expression.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so test-set
# statistics leak into the scaling -- consider fitting on the training rows only.
Scaler=StandardScaler().fit(inputs)
x_scaled=Scaler.transform(inputs)
x_scaled

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for testing; rows are shuffled with a fixed seed (42)
# so the split is reproducible.
x_train,x_test,y_train,y_test=train_test_split(x_scaled,target,test_size=.1,random_state=42,shuffle=True)

In [None]:
# Sanity-check the shapes of the train/test features and labels.
x_train.shape,x_test.shape,y_train.shape,y_test.shape

### Regression

In [None]:
# statsmodels does not add an intercept automatically, so prepend a constant column
# to the training features.
x=sm.add_constant(x_train)

In [None]:
# Check the design-matrix shape after adding the constant column.
x.shape

In [None]:
# Logistic regression of the training labels on the design matrix (features plus constant).
reg=sm.Logit(y_train,x)
result=reg.fit()

In [None]:
# Coefficient table and overall fit statistics of the fitted model.
result.summary()

##### The LLR p-value is very small, so the model as a whole is statistically significant, and the pseudo-R-squared value is between 0.2 and 0.4, which indicates a good fit.

### Accuracy

In [None]:
# In-sample predicted values: predict() with no arguments uses the data the model was fitted on.
y_hat=result.predict()
y_hat

In [None]:
# Actual training labels as a plain array, for side-by-side comparison with y_hat.
np.array(y_train)

In [None]:
# Training confusion table: entry [i, j] counts observations with actual label i
# that the model predicted as j; correct predictions lie on the diagonal.
result.pred_table()

In [None]:
# Wrap the training confusion table in a labelled DataFrame:
# rows are the actual classes, columns the predicted classes.
cm_df=pd.DataFrame(
    result.pred_table(),
    index=['Actual 0','Actual 1'],
    columns=['predicted 0','Predicted 1'],
)
cm_df

In [None]:
# Training accuracy = correct predictions (the matrix diagonal) / all predictions.
cm=cm_df.to_numpy()
accuracy=np.trace(cm)/cm.sum()
accuracy

### Plotting

In [None]:
# Distribution of the training residuals (actual label minus in-sample prediction),
# with a kernel density estimate overlaid.
sns.histplot(y_train-y_hat,kde=True)
plt.title("Residual PDF", size=18)

### Testing

In [None]:
# Add the constant column to the test features as well, so their layout matches
# the design matrix the model was fitted on.
x_test=sm.add_constant(x_test)

In [None]:
def confusion_matrix(data,y_test,model):
    """Build a 2x2 confusion matrix and the accuracy of a fitted binary model.

    Parameters
    ----------
    data
        Features, passed unchanged to ``model.predict``.
    y_test
        Actual labels; values outside ``[0, 1]`` fall outside the bins and are not counted.
    model
        Object whose ``predict(data)`` returns one score per row.

    Returns
    -------
    tuple
        ``(cm, accuracy)``: ``cm[i, j]`` counts rows whose actual value falls in bin ``i``
        and whose predicted score falls in bin ``j``; ``accuracy`` is the diagonal total
        divided by the number of counted rows.
    """
    scores=model.predict(data)
    # Two bins per axis: [0, 0.5) is class 0 and [0.5, 1] is class 1.
    edges=np.array([0,.5,1])
    counts,_,_=np.histogram2d(y_test,scores,bins=edges)
    hit_rate=np.trace(counts)/counts.sum()
    return counts,hit_rate

In [None]:
# Evaluate the fitted model on the held-out data. confusion_matrix returns a
# (matrix, accuracy) tuple, so cm holds both values here.
cm=confusion_matrix(x_test,y_test,result)
cm

In [None]:
# cm is the (confusion matrix, accuracy) tuple from the previous cell.
# Misclassification rate = off-diagonal counts / all test predictions. It is derived from
# the matrix instead of hard-coded counts, which go stale whenever the data or split changes.
test_cm,_=cm
misclassified=test_cm[0,1]+test_cm[1,0]
print('misclassification rate:'+str(misclassified/test_cm.sum()))

##### Our test accuracy is lower than our train accuracy, but the difference is very small, so the model generalizes well to unseen data.