In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
%matplotlib inline

### Description of the dataset columns, for better understanding
#### Number of times pregnant
#### Plasma glucose concentration at 2 hours in an oral glucose tolerance test
#### Diastolic blood pressure (mm Hg)
#### Triceps skin fold thickness (mm)
#### 2-Hour serum insulin (mu U/ml)
#### Body mass index (weight in kg/(height in m)^2)
#### Diabetes pedigree function
#### Age (years)
#### Class variable (0 or 1)

In [3]:
# Fetch OpenML dataset 37 (the diabetes data whose columns are described above).
# `data_id` is documented as an int; strict parameter validation may reject a string.
dataset = fetch_openml(data_id=37, as_frame=True)

  warn(


### Below we can see that the returned object is of type `sklearn.utils._bunch.Bunch`

In [4]:
type(dataset)  # inspect the container type returned by fetch_openml

sklearn.utils._bunch.Bunch

### Converting the data from a `sklearn.utils.Bunch` to a pandas DataFrame

In [5]:
# Wrap the feature table in a DataFrame, keeping the columns listed in feature_names.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [6]:
type(df)  ## confirms that the features are now held in a pandas DataFrame

pandas.core.frame.DataFrame

In [7]:
# Attach the labels from the Bunch as a new 'target' column next to the features.
df['target'] = dataset['target']

In [8]:
df.head()  # preview the first rows: eight feature columns plus the string-labelled target

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,target
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,tested_positive
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,tested_negative
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,tested_positive
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,tested_negative
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,tested_positive


### Since we don't know anything about the data yet, let's see which datatypes are present

In [9]:
df.info()
## here we can see that every feature is float64; only the target column is categorical

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   preg    768 non-null    float64 
 1   plas    768 non-null    float64 
 2   pres    768 non-null    float64 
 3   skin    768 non-null    float64 
 4   insu    768 non-null    float64 
 5   mass    768 non-null    float64 
 6   pedi    768 non-null    float64 
 7   age     768 non-null    float64 
 8   target  768 non-null    category
dtypes: category(1), float64(8)
memory usage: 49.0 KB


### let's describe the data and see some basic stats

In [10]:
# Summary statistics for the numeric feature columns.
# NOTE(review): the minimum is 0 for plas, pres, skin, insu and mass — presumably
# missing values encoded as 0; confirm before using these columns for modelling.
df.describe()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


### Let's see which unique target values are present in the data

In [11]:
df['target'].unique()
## here we can see that there are only 2 categories: tested_positive and tested_negative

['tested_positive', 'tested_negative']
Categories (2, object): ['tested_negative', 'tested_positive']

### Now let's convert the target column into numeric codes as well: 0 => tested_negative and 1 => tested_positive

In [12]:
## this function converts a target label into 0 (tested_negative) or 1 (tested_positive)
def target_cov(num):
    """Translate a diabetes test label into its numeric class code.

    Returns 1 for "tested_positive", 0 for "tested_negative", and None
    for any other value.
    """
    is_positive = num == "tested_positive"
    if is_positive:
        return 1
    # Anything that is not one of the two known labels falls through to None.
    return 0 if num == "tested_negative" else None

In [13]:
# Map each label through target_cov into a new 'target_num' column.
# NOTE(review): the later df.info() output still reports target_num as `category`,
# not an integer dtype — the values are 0/1 but the column is not truly numeric.
df['target_num'] = df['target'].apply(target_cov)

In [14]:
df.head()
## here we can see that the new target_num column has been added and matches the labels as expected

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,target,target_num
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,tested_positive,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,tested_negative,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,tested_positive,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,tested_negative,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,tested_positive,1


### Let us remove the old categorical target column and keep only target_num in our dataset

In [15]:
df = df.drop(columns=['target'], errors='ignore')
## drop(columns=...) returns a new frame without the 'target' column, which we rebind to df
## (no inplace mutation); errors='ignore' keeps the cell safe to re-run once the column is gone

In [16]:
df.head()  # check that target_num is now the only label column

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,target_num
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [17]:
df.info()  # re-check the dtypes; note target_num is reported as category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   preg        768 non-null    float64 
 1   plas        768 non-null    float64 
 2   pres        768 non-null    float64 
 3   skin        768 non-null    float64 
 4   insu        768 non-null    float64 
 5   mass        768 non-null    float64 
 6   pedi        768 non-null    float64 
 7   age         768 non-null    float64 
 8   target_num  768 non-null    category
dtypes: category(1), float64(8)
memory usage: 49.0 KB


### From the description below we can see that there are 768 rows in total, and that the most frequent
### category is 0 => tested_negative, which occurs 500 times

In [18]:
# Summarise only the categorical column(s): count, number of unique values, top value and its frequency.
df.describe(include='category')

Unnamed: 0,target_num
count,768
unique,2
top,0
freq,500


### I want to make the following visualizations
1) age vs class
2) mass vs class
3) pedi vs class

In [19]:
# `pd.to_csv` does not exist (to_csv is a DataFrame method), so the bare expression
# raises AttributeError on a fresh run. The recorded output is the frame itself, so
# display it here; call df.to_csv(<path>) instead if an export to disk is intended.
df

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,target_num
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,1
