A pandas DataFrame is a two-dimensional, NumPy-like array with labeled axes. You can think of it as a table of rows and columns in which every row has an index label.

![alt text](image-1.png)

In [35]:
# Uncomment the next line to install pandas into this kernel's environment:
# %pip install pandas

In [36]:
import pandas as pd

In [37]:
# Load the diabetes dataset from a CSV file (path is relative to the notebook) into a DataFrame
df = pd.read_csv(filepath_or_buffer="./datasets/diabetes.csv")

In [38]:
# shape is a (number of rows, number of columns) tuple
row_count, column_count = df.shape
print((row_count, column_count))

(768, 9)


In [39]:
# The column labels are exposed as a pandas Index object
column_labels = df.columns
print(column_labels)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [40]:
# head() returns the first n rows; n defaults to 5 and is spelled out here for clarity
print(df.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [41]:
# Pass n to change the preview size: here, the first 8 rows
print(df.head(n=8))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   
5            5      116             74              0        0  25.6   
6            3       78             50             32       88  31.0   
7           10      115              0              0        0  35.3   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
5                     0.201   30        0  
6                     0.248   26        1  

Similarly, `df.tail()` returns the last five rows, and `df.tail(8)` returns the last eight rows.

In [42]:
# Extract a single column by its label; the result is a Series
print(df['Age'])

0      50
1      31
2      32
3      21
4      33
       ..
763    63
764    27
765    30
766    47
767    23
Name: Age, Length: 768, dtype: int64


In [43]:
# Extract several columns at once by passing a list of labels; the result is a DataFrame
selected_columns = ['Age', 'BMI']
print(df[selected_columns])

     Age   BMI
0     50  33.6
1     31  26.6
2     32  23.3
3     21  28.1
4     33  43.1
..   ...   ...
763   63  32.9
764   27  36.8
765   30  26.2
766   47  30.1
767   23  30.4

[768 rows x 2 columns]


In [44]:
# Slicing based on row position: 2:4 covers positions 2 and 3
# (zero-based, end excluded), i.e. the third and fourth rows of the DataFrame
print(df.iloc[2:4])

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   

   DiabetesPedigreeFunction  Age  Outcome  
2                     0.672   32        1  
3                     0.167   21        0  


In [45]:
# iloc with a list of integer positions picks individual rows
# (as opposed to a contiguous range of rows)
row_positions = [2, 4, 7]
print(df.iloc[row_positions])

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
2            8      183             64              0        0  23.3   
4            0      137             40             35      168  43.1   
7           10      115              0              0        0  35.3   

   DiabetesPedigreeFunction  Age  Outcome  
2                     0.672   32        1  
4                     2.288   33        1  
7                     0.134   29        0  


In [46]:
# Slicing based on row and column positions (zero-based, end positions excluded):
# rows at positions 2 and 3, columns at positions 1, 2 and 3
row_window = slice(2, 4)
column_window = slice(1, 4)
print(df.iloc[row_window, column_window])

   Glucose  BloodPressure  SkinThickness
2      183             64              0
3       89             66             23


In [47]:
# describe() reports per-column summary statistics: count, mean, standard
# deviation, minimum and maximum, as well as the various quartiles
summary_statistics = df.describe()
print(summary_statistics)

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [48]:
# mean() averages along an axis: axis=0 computes one mean per column.
# numeric_only=True restricts the calculation to numeric columns, so this cell
# still works if it is re-run after a text column (such as the Age_Category
# column created later in this notebook) has been added to df.
print(df.mean(axis=0, numeric_only=True))

# To get one mean per row instead, set the axis to 1:
# print(df.mean(axis=1, numeric_only=True))

Pregnancies                   3.845052
Glucose                     120.894531
BloodPressure                69.105469
SkinThickness                20.536458
Insulin                      79.799479
BMI                          31.992578
DiabetesPedigreeFunction      0.471876
Age                          33.240885
Outcome                       0.348958
dtype: float64


In [49]:
# Apply a function to every value of a column and store the result as a new column
def age_range(x):
    """Return '>80' when x is greater than 80, otherwise '<80'.

    Note that a value of exactly 80 falls in the '<80' bucket.
    """
    if x > 80:
        return '>80'
    return '<80'


# The column is added to the in-memory DataFrame only; the CSV file is not modified
df['Age_Category'] = df['Age'].apply(age_range)

In [50]:
# Preview the first five rows to confirm the Age_Category column is present,
# then display the updated (rows, columns) shape as the cell's result
print(df.head(n=5))
df.shape

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome Age_Category  
0                     0.627   50        1          <80  
1                     0.351   31        0          <80  
2                     0.672   32        1          <80  
3                     0.167   21        0          <80  
4                     2.288   33        1          <80  


(768, 10)

In [32]:
# drop() removes rows by default; to remove a column instead, name it with
# columns= (equivalent to passing the label together with axis=1).
# drop() returns a new DataFrame here, so df itself keeps the column.
print(df.drop(columns='Age_Category'))

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  