In [11]:
from sklearn import datasets
import pandas as pd
# Load scikit-learn's diabetes dataset; later cells read its "data" and "feature_names" entries.
diabetes = datasets.load_diabetes()

In [12]:
# Build the feature table directly from the dataset's array; a `.tolist()` round-trip
# through nested Python lists is unnecessary because pandas accepts the array as-is.
# copy=True keeps the frame independent of the array held by `diabetes`.
diabetes_df = pd.DataFrame(
    data=diabetes["data"],
    columns=diabetes["feature_names"],
    copy=True,
)

In [13]:
# Preview the first rows to check the column names and the scale of the values.
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [18]:
# Derive a string label from the `sex` feature: negative values -> "F", otherwise "M".
# Bracket assignment is required to create a column: `diabetes_df.sex_str = ...`
# only sets a Python attribute on the object (pandas warns about it) and the
# frame itself never gains a `sex_str` column.
diabetes_df["sex_str"] = diabetes_df["sex"].apply(lambda x: "F" if x < 0 else "M")

In [23]:
# add_prefix() prepends the string to each index label; the values are left untouched
# (see the output: labels become "G=0", "G=1", ... while the values stay "M"/"F").
diabetes_df.sex_str.add_prefix("G=")

G=0      M
G=1      F
G=2      M
G=3      F
G=4      F
        ..
G=437    M
G=438    M
G=439    M
G=440    F
G=441    F
Name: sex, Length: 442, dtype: object

In [24]:
# add_suffix() appends the string to each index label; the values are unchanged.
diabetes_df.sex_str.add_suffix("_G")

0_G      M
1_G      F
2_G      M
3_G      F
4_G      F
        ..
437_G    M
438_G    M
439_G    M
440_G    F
441_G    F
Name: sex, Length: 442, dtype: object

In [25]:
# agg() applies one or more aggregation functions (built-in names or custom callables)
# and returns one result per function.
diabetes_df["sex"].agg(["min", "max"])

min   -0.044642
max    0.050680
Name: sex, dtype: float64

In [27]:
# agg() also works on string data; here the 'mode' result is rendered as a one-column table.
diabetes_df.sex_str.agg(['mode'])

Unnamed: 0,mode
0,F


In [None]:
# .str.lower() returns a new Series, so the .str accessor has to be used again
# before calling the next string method (.contains).
# NOTE(review): WDI_ids is not defined in any cell visible here and this cell has no
# execution count — confirm where it is loaded before running the notebook top to bottom.
WDI_ids[WDI_ids['Topic'].str.lower().str.contains('environment')]

In [29]:
# Small boolean frame used to demonstrate all() / any().
df = pd.DataFrame(
    {
        "col1": [True, True],
        "col2": [True, False],
    }
)

# Column-wise all(): True only for columns in which every value is True.
df.all()

col1     True
col2    False
dtype: bool

In [30]:
# all() on a single column: True when every element is True, False otherwise.
df["col1"].all()

True

In [32]:
# col2 contains a False, so all() fails; any() only needs at least one True.
print(df["col2"].all())
print(df["col2"].any())

False
True


In [38]:
# argmax() / argmin(): where in the Series the largest / smallest bmi value sits.
print(diabetes_df["bmi"].argmax())
print(diabetes_df["bmi"].argmin())

367
281


In [41]:
# Autocorrelation of bmi with itself at a lag of 5.
diabetes_df["bmi"].autocorr(lag=5)

0.13339573006508365

In [47]:
# clip() bounds the values: anything below `lower` becomes 0, anything above `upper` becomes 0.05.
diabetes_df["bmi"].clip(lower=0, upper=0.05)

0      0.050000
1      0.000000
2      0.044451
3      0.000000
4      0.000000
         ...   
437    0.019662
438    0.000000
439    0.000000
440    0.039062
441    0.000000
Name: bmi, Length: 442, dtype: float64

In [49]:
# Correlation between the bmi column and the sex column.
diabetes_df["bmi"].corr(diabetes_df["sex"])

0.08816139902276228

In [50]:
# count() returns the number of non-null entries in the column.
diabetes_df["bmi"].count()

442

In [52]:
# Display the bmi column itself.
diabetes_df["bmi"]

0      0.061696
1     -0.051474
2      0.044451
3     -0.011595
4     -0.036385
         ...   
437    0.019662
438   -0.015906
439   -0.015906
440    0.039062
441   -0.073030
Name: bmi, Length: 442, dtype: float64

In [51]:
# Running (cumulative) total of the bmi column.
diabetes_df["bmi"].cumsum()

0      6.169621e-02
1      1.022215e-02
2      5.467336e-02
3      4.307834e-02
4      6.693652e-03
           ...     
437    6.578068e-02
438    4.987441e-02
439    3.396815e-02
440    7.303030e-02
441   -3.556044e-13
Name: bmi, Length: 442, dtype: float64

In [54]:
# dot() computes the inner product of two Series.
diabetes_df["bmi"].dot(diabetes_df["sex"])

0.08816139902277063

In [58]:
# drop_duplicates() returns the Series with repeated values removed (163 of 442 rows remain).
diabetes_df["bmi"].drop_duplicates()

0      0.061696
1     -0.051474
2      0.044451
3     -0.011595
4     -0.036385
         ...   
406   -0.080575
410   -0.027762
416    0.080019
422    0.077863
436   -0.074108
Name: bmi, Length: 163, dtype: float64

In [64]:
# duplicated() returns a boolean Series that flags every repeat of a value after its
# first occurrence (the first occurrence itself is False), so this selects only the
# repeated rows: 442 total - 163 distinct = 279.
# The mask is already boolean, so no `== True` comparison is needed.
diabetes_df.loc[diabetes_df["bmi"].duplicated(), "bmi"]

8      0.061696
13    -0.001895
19    -0.018062
30     0.044451
42    -0.010517
         ...   
437    0.019662
438   -0.015906
439   -0.015906
440    0.039062
441   -0.073030
Name: bmi, Length: 279, dtype: float64

In [72]:
# isin() returns a boolean Series: True where the value is one of the listed items.
diabetes_df.sex_str.isin(["M"])

0       True
1      False
2       True
3      False
4      False
       ...  
437     True
438     True
439     True
440    False
441    False
Name: sex, Length: 442, dtype: bool

In [73]:
# mode() returns the most frequent value(s) as a Series (here a single entry, "F").
diabetes_df.sex_str.mode()

0    F
dtype: object