In [1]:
# Pandas DataFrames make manipulating your data easy, from selecting or replacing columns and indices to 
# reshaping your data.
# It helps with dealing with input data in CSV formats and with transforming your data into a form where it
# can be inputted into ML models
# We covers Pandas DataFrames, from basic manipulations to advanced operations



#Import module
import pandas as pd

#Loading the dataset
# dataset is in a CSV file, and the function we're going to use to read in the file is called pd.read_csv(). 
# This function returns a dataframe variable

df = pd.read_csv('/media/shashwat/33EFB20D0F6A6ECB/python presentation/diabetes - diabetes_null.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,5,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,4.0,35.0,168.0,43.1,2.288,33,1


In [2]:
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,11.0,76.0,48.0,18.0,32.9,0.171,63,0
764,2,122.0,7.0,27.0,,36.8,0.34,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,3,0
766,1,126.0,6.0,,,3.1,0.349,47,1
767,1,93.0,7.0,31.0,,3.4,0.315,23,0


In [4]:
df.shape

(768, 9)

In [5]:
# We can also extract all the column names as a list, by using the columns attribute and can extract the rows
# with the index attribute
df.columns.tolist()

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [6]:
# In order to get a better idea of the type of data that we are dealing with, 
# we can call the describe() function to see statistics like mean, min, etc about each column of the dataset.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,99.310616,55.706685,25.876155,105.659898,29.865654,0.490439,30.990885,0.348958
std,3.369578,55.115381,29.878852,12.967816,116.862508,10.686049,0.34605,13.281475,0.476951
min,0.0,1.0,1.0,1.0,1.0,2.0,0.1,3.0,0.0
25%,1.0,72.5,16.0,18.0,21.0,25.4,0.25375,23.0,0.0
50%,3.0,113.0,66.0,27.0,71.0,32.0,0.3825,28.0,0.0
75%,6.0,138.0,76.0,35.0,151.0,36.1,0.6475,38.25,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# The function max() will show you the maximum values of all columns
df.max()

Pregnancies                  17.00
Glucose                     199.00
BloodPressure               122.00
SkinThickness                99.00
Insulin                     846.00
BMI                          67.10
DiabetesPedigreeFunction      2.42
Age                          81.00
Outcome                       1.00
dtype: float64

In [9]:
# Then, if you'd like to specifically get the max value for a particular column, 
# you pass in the name of the column using the bracket indexing operator
df['BMI'].max()

67.1

In [11]:
#If we need to calculate the mean of a particular column
df['BMI'].mean()

29.86565389696167

In [12]:
#what if we want to know the index of row of the maximum value present in a particular column
#for that purpose we use argmax()
df['BMI'].argmax()

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  This is separate from the ipykernel package so we can avoid doing imports until


177

In [13]:
# value_counts() function shows how many times each item appears in the column
df['BMI'].value_counts()

32.0    13
31.6    12
31.2    12
33.3    10
32.4    10
32.9     9
32.8     9
3.1      9
3.8      9
33.6     8
34.2     8
29.7     8
25.9     7
27.8     7
35.5     7
3.4      7
27.6     7
28.7     7
3.0      7
33.2     7
3.5      7
39.4     7
25.2     6
38.5     6
24.2     6
28.9     6
34.3     6
28.4     6
34.9     6
32.5     6
        ..
42.8     1
46.7     1
67.1     1
39.6     1
3.7      1
38.3     1
44.6     1
36.7     1
22.3     1
45.4     1
45.7     1
24.5     1
46.5     1
33.5     1
32.2     1
21.7     1
59.4     1
21.2     1
32.6     1
4.2      1
43.1     1
49.6     1
36.2     1
26.3     1
4.8      1
19.3     1
46.3     1
49.3     1
3.2      1
57.3     1
Name: BMI, Length: 247, dtype: int64

In [15]:
ACCESSING 

# Select an Index or Column From a Pandas DataFrame
# Selecting data by row numbers (.iloc)
# Selecting data by label or by a conditional statment (.loc)

print(df.iloc[1][])



85.0


In [17]:
print(df.loc[1]['Glucose'])

85.0


In [18]:
df.iloc[:3]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,5,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1


In [19]:
#if we want the dataset with Glucose as sorted
df.sort_values('Glucose').head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
15,7,1.0,,,,3.0,0.484,32,1
446,1,1.0,72.0,12.0,7.0,25.3,0.658,28,0
57,0,1.0,88.0,6.0,11.0,46.8,0.962,31,0
454,2,1.0,54.0,28.0,15.0,37.8,0.498,24,0
639,1,1.0,74.0,12.0,46.0,19.5,0.149,28,0


In [21]:
#filtering data
df[df['Age'] > 30]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
4,0,137.0,4.0,35.0,168.0,43.1,2.288,33,1
8,2,197.0,7.0,45.0,543.0,3.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1
11,10,168.0,74.0,,,38.0,0.537,34,1
12,10,139.0,8.0,,,27.1,1.441,57,0
13,1,189.0,6.0,23.0,846.0,3.1,0.398,59,1
14,5,166.0,72.0,19.0,175.0,25.8,0.587,51,1
15,7,1.0,,,,3.0,0.484,32,1
