# Sample pipelining with Pandas using `pdpipe`
#### David Sotunbo, Data Scientist

In [0]:
!pip install pdpipe
import pandas as pd
import numpy as np
import pdpipe as pdp



In [0]:
df = pd.read_csv("train.csv")

In [0]:
round(df.sample(5),2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
522,523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.22,,C
330,331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q
708,709,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S
38,39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0,,S
192,193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19.0,1,0,350046,7.85,,S


In [0]:
df.shape

(891, 12)

In [0]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [0]:
round(df.describe().T,2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.35,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.38,0.49,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.31,0.84,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.7,14.53,0.42,20.12,28.0,38.0,80.0
SibSp,891.0,0.52,1.1,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.38,0.81,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.2,49.69,0.0,7.91,14.45,31.0,512.33


In [0]:
def size(n):
    if n<=18:
        return 'Child'
    elif 18<n<=59:
        return 'Adult'
    else:
        return 'Old'

df['Age_bracket']=df['Age'].apply(size)

In [0]:
round(df.sample(5),2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_bracket
480,481,0,3,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S,Child
760,761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S,Old
568,569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.23,,C,Old
368,369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q,Old
234,235,0,2,"Leyson, Mr. Robert William Norman",male,24.0,0,0,C.A. 29566,10.5,,S,Adult


### Drop a column

In [0]:
drop_age = pdp.ColDrop('Ticket')

In [0]:
df2 = drop_age(df)

In [0]:
round(df2.sample(5))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_bracket
402,403,0,3,"Jussila, Miss. Mari Aina",female,21.0,1,0,10.0,,S,Adult
777,778,1,3,"Emanuel, Miss. Virginia Ethel",female,5.0,0,0,12.0,,S,Child
807,808,0,3,"Pettersson, Miss. Ellen Natalia",female,18.0,0,0,8.0,,S,Child
262,263,0,1,"Taussig, Mr. Emil",male,52.0,1,1,80.0,E67,S,Adult
611,612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,7.0,,S,Old


### Chaining stages by adding them up

In [0]:
pipeline = pdp.ColDrop('Ticket')
pipeline+= pdp.OneHotEncode('Age_bracket')

In [0]:
df3 = pipeline(df)

In [0]:
round(df3.sample(5))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_bracket_Child,Age_bracket_Old
197,198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42.0,0,1,8.0,,S,0,0
432,433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide ...",female,42.0,1,0,26.0,,S,0,0
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,86.0,B77,S,0,0
774,775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,23.0,,S,0,0
336,337,0,1,"Pears, Mr. Thomas Clinton",male,29.0,1,0,67.0,C2,S,0,0


In [0]:
df['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [0]:
def fare_range(x):
    if x>7:
        return 'keep'
    else:
        return 'drop'

In [0]:
pipeline = pdp.ColDrop('Ticket')
pipeline+= pdp.OneHotEncode('Age_bracket')
pipeline+=pdp.ApplyByCols('Fare',fare_range,'Fare_range',drop=False)

In [0]:
df4 = pipeline(df)

In [0]:
df4.shape

(891, 14)

In [0]:
round(df4.sample(5),2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Fare_range,Cabin,Embarked,Age_bracket_Child,Age_bracket_Old
784,785,0,3,"Ali, Mr. William",male,25.0,0,0,7.05,keep,,S,0,0
711,712,0,1,"Klaber, Mr. Herman",male,,0,0,26.55,keep,C124,S,0,1
836,837,0,3,"Pasic, Mr. Jakob",male,21.0,0,0,8.66,keep,,S,0,0
340,341,1,2,"Navratil, Master. Edmond Roger",male,2.0,1,1,26.0,keep,F2,S,1,0
603,604,0,3,"Torber, Mr. Ernst William",male,44.0,0,0,8.05,keep,,S,0,0


In [0]:
df4.Fare_range.unique()

array(['keep', 'drop'], dtype=object)

In [0]:
pipeline = pdp.ColDrop('Ticket')
pipeline+= pdp.OneHotEncode('Age_bracket','Sex')
pipeline+=pdp.ApplyByCols('Fare',fare_range,'Fare_range',drop=False)
pipeline+=pdp.ValDrop(['drop'],'Fare_range')
pipeline+= pdp.ColDrop(['Fare','Sex','Cabin'])

In [0]:
df5 = pipeline(df)

In [0]:
df5.dtypes

PassengerId            int64
Survived               int64
Pclass                 int64
Name                  object
Age                  float64
SibSp                  int64
Parch                  int64
Fare_range            object
Embarked              object
Age_bracket_Adult      uint8
Age_bracket_Child      uint8
Age_bracket_Old        uint8
dtype: object

### Scikit-learn scaling

In [0]:
pipeline_scale = pdp.Scale('StandardScaler',exclude_columns=['Name','Fare_range','Embarked'])

In [0]:
df6 = pipeline_scale(df5)

In [0]:
round(df6.sample(5),3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare_range,Embarked,Age_bracket_Adult,Age_bracket_Child,Age_bracket_Old
543,0.39,1.24,-0.367,"Beane, Mr. Edward",0.164,0.413,-0.483,keep,S,0.785,-0.434,-0.535
377,-0.255,-0.806,-1.563,"Widener, Mr. Harry Elkins",-0.179,-0.483,1.969,keep,C,0.785,-0.434,-0.535
198,-0.951,1.24,0.829,"Madigan, Miss. Margaret ""Maggie""",,-0.483,-0.483,keep,Q,-1.274,-0.434,1.869
652,0.814,-0.806,0.829,"Kalvik, Mr. Johannes Halvorsen",-0.591,-0.483,-0.483,keep,S,0.785,-0.434,-0.535
112,-1.286,-0.806,0.829,"Barton, Mr. David John",-0.522,-0.483,-0.483,keep,S,0.785,-0.434,-0.535


### NLTK stages

In [0]:
pipeline_tokenize=pdp.TokenizeWords('Name')

In [0]:
df7 = pipeline_tokenize(df6)

In [0]:
df7.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare_range,Embarked,Age_bracket_Adult,Age_bracket_Child,Age_bracket_Old
328,-0.445711,1.240256,0.828536,"[Goldsmith, ,, Mrs., Frank, John, (, Emily, Al...",0.095602,0.413318,0.743057,keep,S,0.784923,-0.434402,-0.534921
848,1.576217,-0.806285,-0.367161,"[Harper, ,, Rev, ., John]",-0.110331,-0.482897,0.743057,keep,S,0.784923,-0.434402,-0.534921
304,-0.53903,-0.806285,0.828536,"[Williams, ,, Mr., Howard, Hugh, ``, Harry, '']",,-0.482897,-0.483058,keep,S,-1.27401,-0.434402,1.869436
783,1.323476,-0.806285,0.828536,"[Johnston, ,, Mr., Andrew, G]",,0.413318,1.969171,keep,S,-1.27401,-0.434402,1.869436
885,1.720084,-0.806285,0.828536,"[Rice, ,, Mrs., William, (, Margaret, Norton, )]",0.644754,-0.482897,5.647515,keep,Q,0.784923,-0.434402,-0.534921


In [0]:
def extract_state(token):
    return str(token[-3:])

In [0]:
pipeline_state = pdp.ApplyByCols('Name',extract_state,result_columns='Name_token')

In [0]:
df8=pipeline_state(df7)

In [0]:
round(df8.sample(5),3)

Unnamed: 0,PassengerId,Survived,Pclass,Name_token,Age,SibSp,Parch,Fare_range,Embarked,Age_bracket_Adult,Age_bracket_Child,Age_bracket_Old
182,-1.013,-0.806,0.829,"['Clarence', 'Gustaf', 'Hugo']",-1.415,3.102,1.969,keep,S,-1.274,2.302,-0.535
316,-0.492,1.24,-0.367,"['Miriam', 'Sternin', ')']",-0.385,0.413,-0.483,keep,S,0.785,-0.434,-0.535
797,1.378,1.24,0.829,"[',', 'Mrs.', 'Mara']",0.096,-0.483,-0.483,keep,S,0.785,-0.434,-0.535
303,-0.543,1.24,-0.367,"['.', 'Nora', 'A']",,-0.483,-0.483,keep,Q,-1.274,-0.434,1.869
354,-0.345,-0.806,0.829,"[',', 'Mr.', 'Wazli']",,-0.483,-0.483,keep,C,-1.274,-0.434,1.869
