# DiabetesDataAnalysis By Ahmad Raza

In [None]:
## Data Analysis packages
import numpy as np
import pandas as pd

## Data Visualization packages
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from 'diabetes.csv' (path is relative to the working directory).
df = pd.read_csv('diabetes.csv')

# Data Preprocessing Part 1

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# (number of rows, number of columns) of the DataFrame.
df.shape

In [None]:
# Count how often each SkinThickness value occurs. A thickness of 0 is not a
# valid measurement, so zeros in this column stand for missing (NaN) values.
df['SkinThickness'].value_counts()

In [None]:
# Count how often each BloodPressure value occurs. Blood pressure can never be
# zero, so zeros in this column stand for missing (NaN) values.
df['BloodPressure'].value_counts()

In [None]:
# Draw a kernel density estimate for each numeric column to spot suspicious
# 0 values in every feature other than Outcome.
# 'number' selects every numeric dtype; the aliases 'int'/'float' are
# platform-dependent ('int' can mean int32) and may silently skip int64 columns.
numeric_cols = df.select_dtypes(include='number').columns

fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

# One subplot per column, filled row by row.
for ax, col in zip(axs.flatten(), numeric_cols):
    sns.kdeplot(data=df, x=col, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Zeros show up across the columns; for measurements that cannot be zero they indicate missing (NaN) values.

In [None]:
# Replace placeholder zeros with NaN, but only in the measurement columns where
# a zero cannot be a real reading.
# 'Pregnancies' is left alone: zero pregnancies is a legitimate value, and
# turning it into NaN would cause it to be overwritten by imputation.
# 'DiabetesPedigreeFunction' and 'Age' are left alone too: nothing later in the
# script imputes them, so any NaN created there would reach the model.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df.replace({col: 0 for col in zero_as_missing}, np.nan, inplace=True)

In [None]:
# Number of missing (NaN) values in each column.
df.isnull().sum()

In [None]:
# Impute missing values column by column with that column's mode (its most
# frequent value). Author's rationale: filling with the mean or median distorted
# the shape of each feature's distribution (and its standard deviation) more
# than filling with the mode did.
mode_imputed_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

for col in mode_imputed_cols:
    # mode() returns a Series (there may be ties); take the first entry.
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
# Column dtypes and non-null counts after imputation.
df.info()

In [None]:
# Cast these columns to an integer dtype (only possible once they hold no NaN).
for col in ('Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BloodPressure'):
    df[col] = df[col].astype('int')

In [None]:
# Re-check column dtypes after the integer conversion.
df.info()

# Exploratory Data Analysis

In [None]:
# Histogram of every numeric column on a 3x3 grid.
# 'number' selects every numeric dtype; the aliases 'int'/'float' are
# platform-dependent ('int' can mean int32) and may silently skip int64 columns.
numeric_cols = df.select_dtypes(include='number').columns

fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

# One subplot per column, filled row by row.
for ax, col in zip(axs.flatten(), numeric_cols):
    df[col].plot(kind='hist', ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()

In [None]:
# Per-value row counts for the discrete-looking features, one subplot per feature.
count_cols = ['Pregnancies', 'BloodPressure', 'SkinThickness', 'Age']

fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(15, 15))

for ax, col in zip(axs.flatten(), count_cols):
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()

In [None]:
# Class balance: number of rows for each Outcome value.
sns.countplot(data=df, x='Outcome')
plt.show()

In [None]:
# Multivariate analysis: per-value row counts of each feature, split by Outcome.
count_cols = ['Pregnancies', 'BloodPressure', 'SkinThickness', 'Age']

fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(15, 15))

for ax, col in zip(axs.flatten(), count_cols):
    sns.countplot(data=df, x=col, hue='Outcome', ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()

In [None]:
# Histogram with an overlaid KDE curve for every numeric column on a 3x3 grid.
# 'number' selects every numeric dtype; the aliases 'int'/'float' are
# platform-dependent ('int' can mean int32) and may silently skip int64 columns.
numeric_cols = df.select_dtypes(include='number').columns

fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

# One subplot per column, filled row by row.
for ax, col in zip(axs.flatten(), numeric_cols):
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()

In [None]:
# Outlier detection: box plot of every numeric column on a 3x3 grid.
# 'number' selects every numeric dtype; the aliases 'int'/'float' are
# platform-dependent ('int' can mean int32) and may silently skip int64 columns.
numeric_cols = df.select_dtypes(include='number').columns

fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

# One subplot per column, filled row by row.
for ax, col in zip(axs.flatten(), numeric_cols):
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap: pairwise correlation between all columns, annotated.
plt.figure(figsize=(15, 10))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2g')
plt.show()

In [None]:
# Columns that receive the IQR-based outlier treatment.
Features_with_outliers = [
    'Pregnancies',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

In [None]:
def remove_outliers_iqr(data):
    """Cap outliers of a 1-D numeric sequence using the 1.5 * IQR rule.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper bound and
    values below ``Q1 - 1.5 * IQR`` by that lower bound (winsorizing).
    No element is dropped, so the result always has the same length as the
    input and can be assigned straight back to a DataFrame column.

    Parameters
    ----------
    data : array-like
        Numeric values (list, NumPy array or pandas Series). NaN entries
        are ignored when computing the quartiles and are returned unchanged.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``data`` and outliers capped.
    """
    values = np.asarray(data, dtype=float)

    # Nothing to compute quartiles from: hand the values back untouched.
    if values.size == 0 or np.isnan(values).all():
        return values.copy()

    # NaN-aware quartiles; plain np.percentile would return NaN bounds as
    # soon as a single value is missing.
    Q1, Q3 = np.nanpercentile(values, [25, 75])
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # np.clip caps both tails in one pass and keeps NaN entries in place.
    return np.clip(values, lower_bound, upper_bound)


In [None]:
# Apply the IQR outlier treatment to every listed feature, overwriting the column.
for feature in Features_with_outliers:
    df[feature] = remove_outliers_iqr(df[feature])

In [None]:
# Outlier detection: redraw the box plot of every numeric column on a 3x3 grid.
# 'number' selects every numeric dtype; the aliases 'int'/'float' are
# platform-dependent ('int' can mean int32) and may silently skip int64 columns.
numeric_cols = df.select_dtypes(include='number').columns

fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

# One subplot per column, filled row by row.
for ax, col in zip(axs.flatten(), numeric_cols):
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()

In [None]:
# Feature matrix: every column except the target 'Outcome'.
X = df.drop('Outcome', axis=1)
# Target vector: the 'Outcome' column only.
y = df.loc[:, 'Outcome']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The features are not scaled, so the default solver may not converge within
# the default 100 iterations (it then warns and stops early); raise the cap.
log_reg = LogisticRegression(max_iter=1000)

In [None]:
# Train the logistic-regression model on the training split.
log_reg.fit(X_train,y_train)

In [None]:
# Predict the class label of every row in the held-out test features.
y_pred = log_reg.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")