In [None]:
import numpy as np #numerical Python
import pandas as pd #data analysis library

In [None]:
# Load the diabetes dataset from the CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0,0
Pregnancies,int64
Glucose,int64
BloodPressure,int64
SkinThickness,float64
Insulin,float64
BMI,float64
DiabetesPedigreeFunction,float64
Age,int64
Outcome,int64


In [None]:
# Preview the first five rows of the DataFrame.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35.0,,33.6,0.627,50,1
1,1,85,66,29.0,,26.6,0.351,31,0
2,8,183,64,,,23.3,0.672,32,1
3,1,89,66,23.0,94.0,28.1,0.167,21,0
4,0,137,40,35.0,168.0,43.1,2.288,33,1


In [None]:
# Number of observations, i.e. rows in the DataFrame.
len(df.index)

768

In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(768, 9)

In [None]:
# Count the missing (NaN) values in each column.
df.isna().sum(axis=0)

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,227
Insulin,374
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
# Replace missing SkinThickness values with the column mean.
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df['SkinThickness']: that chained in-place call
# operates on an intermediate object, emits a FutureWarning on pandas 2.x and,
# according to that warning, will no longer modify df under pandas 3.0.
skin_thickness_mean = df['SkinThickness'].mean()  # mean() skips NaN by default
df['SkinThickness'] = df['SkinThickness'].fillna(skin_thickness_mean)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['SkinThickness'].fillna(df['SkinThickness'].mean(), inplace=True)


In [None]:

# Column mean of SkinThickness (Series.mean skips NaN by default); this is the
# value the imputation step uses to fill the missing entries.
df['SkinThickness'].mean()

29.15341959334565

In [None]:
# Re-count missing values per column to confirm SkinThickness has none left.
df.isna().sum()


Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,374
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
# Preview the first 50 rows, e.g. to spot-check the imputed SkinThickness values.
df.head(50)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35.0,,33.6,0.627,50,1
1,1,85,66,29.0,,26.6,0.351,31,0
2,8,183,64,29.15342,,23.3,0.672,32,1
3,1,89,66,23.0,94.0,28.1,0.167,21,0
4,0,137,40,35.0,168.0,43.1,2.288,33,1
5,5,116,74,29.15342,,25.6,0.201,30,0
6,3,78,50,32.0,88.0,31.0,0.248,26,1
7,10,115,0,29.15342,,35.3,0.134,29,0
8,2,197,70,45.0,543.0,30.5,0.158,53,1
9,8,125,96,29.15342,,0.0,0.232,54,1


In [None]:
# Replace missing Insulin values with the column mean.
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df['Insulin']: that chained in-place call
# operates on an intermediate object, emits a FutureWarning on pandas 2.x and,
# according to that warning, will no longer modify df under pandas 3.0.
# NOTE(review): the missing-value count reports 374 of 768 Insulin entries as
# NaN, so about half the column ends up at the mean — confirm mean imputation
# is acceptable for this column.
insulin_mean = df['Insulin'].mean()  # mean() skips NaN by default
df['Insulin'] = df['Insulin'].fillna(insulin_mean)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Insulin'].fillna(df['Insulin'].mean(), inplace=True)


In [None]:
# Re-count missing values per column to confirm Insulin has none left.
df.isna().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
# Preview the first 20 rows, e.g. to spot-check the imputed Insulin values.
df.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35.0,155.548223,33.6,0.627,50,1
1,1,85,66,29.0,155.548223,26.6,0.351,31,0
2,8,183,64,29.15342,155.548223,23.3,0.672,32,1
3,1,89,66,23.0,94.0,28.1,0.167,21,0
4,0,137,40,35.0,168.0,43.1,2.288,33,1
5,5,116,74,29.15342,155.548223,25.6,0.201,30,0
6,3,78,50,32.0,88.0,31.0,0.248,26,1
7,10,115,0,29.15342,155.548223,35.3,0.134,29,0
8,2,197,70,45.0,543.0,30.5,0.158,53,1
9,8,125,96,29.15342,155.548223,0.0,0.232,54,1


In [None]:
# Add a human-readable 'Diagnosis' column derived from the numeric Outcome
# column: every Outcome value is looked up in the mapping below (a value that
# is not a key of the mapping would become NaN).
classification = {
    1: 'With Diabetes',
    0: 'Without Diabetes',
}
df['Diagnosis'] = df['Outcome'].map(classification)

In [None]:
# Preview the first five rows, now including the new Diagnosis column.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Diagnosis
0,6,148,72,35.0,155.548223,33.6,0.627,50,1,With Diabetes
1,1,85,66,29.0,155.548223,26.6,0.351,31,0,Without Diabetes
2,8,183,64,29.15342,155.548223,23.3,0.672,32,1,With Diabetes
3,1,89,66,23.0,94.0,28.1,0.167,21,0,Without Diabetes
4,0,137,40,35.0,168.0,43.1,2.288,33,1,With Diabetes


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; by default describe() leaves out the string-valued Diagnosis column.
# NOTE(review): the output reports a minimum of 0 for Glucose, BloodPressure
# and BMI — possibly missing values recorded as 0; confirm before modelling.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,29.15342,155.548223,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,8.790942,85.021108,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,7.0,14.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,25.0,121.5,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.15342,155.548223,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0
