https://towardsdatascience.com/https-medium-com-tirthajyoti-build-pipelines-with-pandas-using-pdpipe-cade6128cd31?source=---------2------------------

https://medium.com/@shashimalsenarath.17/is-pipeline-important-in-machine-learning-pipeline-with-pdpipe-b6fc9acf20c4

In [2]:
# Data Analysis packages
import pandas as pd
import pandas_profiling 
import numpy as np


# Data Visualization packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Other useful packages
from datetime import datetime
import warnings
import os
from tqdm import tqdm, tqdm_notebook
from subprocess import check_output
from pydotplus.graphviz import graph_from_dot_data

# Sklearn API
from sklearn import datasets

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

# Classification Algo Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
# Regression Algo Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#XGBOOST
from xgboost import XGBClassifier

#Pipeline with Pandas
import pdpipe as pdp

# Stats API
import scipy.stats as stats
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
import statsmodels.formula.api as smf

plt.rc("font", size=14)
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 500) # OR pd.options.display.max_rows = 500
pd.set_option('display.max_columns', 500) # OR pd.options.display.max_columns = 500



In [4]:
df = pd.read_csv("USA_Housing.csv")

In [5]:
round(df.sample(5),2)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
2029,82858.45,7.7,8.95,5.44,36835.74,2005037.31,"9963 Terri Cove Suite 637\nPort Danielleborough, HI 98045-2788"
4526,69664.08,7.61,5.95,2.0,37012.99,1074646.87,"79506 Tony Plain Apt. 264\nMarcuschester, NC 40536"
3309,69650.59,4.81,8.01,4.25,48017.37,1574582.09,"140 Juarez Hills Suite 432\nValeriefort, SC 83268"
3182,82353.29,6.32,6.77,2.49,31745.92,1459525.38,"610 Dale Station\nWest Williamburgh, KY 98834"
551,75146.8,4.45,8.51,6.1,55577.97,1560746.87,"38973 Jeanette Lock\nShafferville, RI 86582-2504"


In [6]:
df.shape

(5000, 7)

In [7]:
df.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

In [11]:
round(df.describe().T,2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Avg. Area Income,5000.0,68583.11,10657.99,17796.63,61480.56,68804.29,75783.34,107701.75
Avg. Area House Age,5000.0,5.98,0.99,2.64,5.32,5.97,6.65,9.52
Avg. Area Number of Rooms,5000.0,6.99,1.01,3.24,6.3,7.0,7.67,10.76
Avg. Area Number of Bedrooms,5000.0,3.98,1.23,2.0,3.14,4.05,4.49,6.5
Area Population,5000.0,36163.52,9925.65,172.61,29403.93,36199.41,42861.29,69621.71
Price,5000.0,1232072.65,353117.63,15938.66,997577.14,1232669.38,1471210.2,2469065.59


In [14]:
def size(n):
    if n<=4:
        return 'Small'
    elif 4<n<=6:
        return 'Medium'
    else:
        return 'Big'

In [15]:
df['House_size']=df['Avg. Area Number of Rooms'].apply(size)

In [16]:
round(df.sample(5),2)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size
321,76192.66,7.3,7.34,6.33,21245.84,1432756.52,"22588 Brian Forge Suite 588\nSouth Michelleside, NE 62469-4467",Big
2517,50847.11,4.97,5.36,4.38,34763.7,398909.51,"6908 King Underpass Suite 740\nEast Thomas, MP 29298",Medium
2235,68596.0,6.55,5.91,2.09,55323.96,1388840.17,"9569 Chloe Ports\nLake Stacyshire, NE 25955-2308",Medium
4528,74452.82,5.11,5.46,4.4,37298.07,1065679.12,"77636 Donald Avenue Apt. 063\nLake Savannahfort, ID 71155-2535",Medium
1123,69044.94,4.31,5.6,4.09,34764.29,1084255.68,"937 Singh Greens\nBakerfort, HI 27291",Medium


### Drop a column

In [17]:
drop_age = pdp.ColDrop('Avg. Area House Age')

In [18]:
df2 = drop_age(df)

In [19]:
round(df2.sample(5))

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size
2382,69654.0,8.0,3.0,27023.0,717273.0,"940 Kelsey Light Suite 026\nBeckstad, GU 27370-7423",Big
1974,60273.0,7.0,4.0,35890.0,1050224.0,"8775 Angela Flats\nWest Jasonchester, CT 06251-9759",Big
2004,70587.0,7.0,4.0,31799.0,976540.0,"287 Fields Falls Suite 212\nWellsside, WY 48741",Big
2629,75768.0,8.0,3.0,27445.0,1379169.0,"0627 Jonathan Spurs Apt. 121\nHarveystad, VI 29375-3726",Big
3797,56348.0,5.0,2.0,23477.0,649224.0,"1629 James Pines\nPort John, LA 49420",Medium


### Chaining stages by adding them up

In [20]:
pipeline = pdp.ColDrop('Avg. Area House Age')
pipeline+= pdp.OneHotEncode('House_size')

In [21]:
df3 = pipeline(df)

In [22]:
round(df3.sample(5))

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
2369,76908.0,8.0,4.0,51630.0,1813856.0,"765 John Falls Apt. 380\nSouth Kathrynshire, IN 61674-7121",0,0
1412,67150.0,8.0,3.0,39918.0,1376970.0,"16913 John Fall\nNew Lisa, NM 04867-0517",0,0
1289,63867.0,9.0,4.0,14523.0,917611.0,"615 Alexander Plains Suite 955\nWest Alisonberg, VA 45700-1303",0,0
1215,76000.0,7.0,4.0,35184.0,1634781.0,"1503 Allison Freeway Apt. 398\nLake Gina, AS 58599-4858",0,0
4005,73428.0,7.0,6.0,42332.0,1447493.0,"20495 Melissa Gardens Suite 101\nNorth Laura, CO 17542-6982",0,0


In [23]:
def price_tag(x):
    if x>250000:
        return 'keep'
    else:
        return 'drop'

In [24]:
pipeline = pdp.ColDrop('Avg. Area House Age')
pipeline+= pdp.OneHotEncode('House_size')
pipeline+=pdp.ApplyByCols('Price',price_tag,'Price_tag',drop=False)

In [25]:
df4 = pipeline(df)

In [26]:
df4.shape

(5000, 9)

In [27]:
round(df4.sample(5),2)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Price_tag,Address,House_size_Medium,House_size_Small
4182,54465.75,6.38,3.18,7234.96,395523.25,keep,"602 Ferguson Throughway Suite 431\nNew Nathanfurt, IA 99792",0,0
1511,71769.86,6.85,4.18,28173.39,1114169.04,keep,"96301 Allison Falls Suite 515\nNew Courtney, GU 66573-9604",0,0
3792,65643.86,7.1,6.12,48882.49,1265927.36,keep,"79881 Myers Fords\nNorth Larry, WI 13795",0,0
2336,49435.54,6.05,3.34,43442.0,927163.81,keep,"289 Becky Flats Apt. 257\nWest Kara, IL 19071-5078",0,0
2850,59466.87,7.11,3.26,44825.81,1226041.78,keep,"61933 Sarah Landing\nFarleyview, OK 34381-6999",0,0


In [28]:
pipeline = pdp.ColDrop('Avg. Area House Age')
pipeline+= pdp.OneHotEncode('House_size')
pipeline+=pdp.ApplyByCols('Price',price_tag,'Price_tag',drop=False)
pipeline+=pdp.ValDrop(['drop'],'Price_tag')
pipeline+= pdp.ColDrop('Price_tag')

In [29]:
df5 = pipeline(df)

In [30]:
df5.shape

(4990, 8)

In [37]:
print(pipeline)

A pdpipe pipeline:
[ 0]  Drop column Avg. Area House Age
[ 1]  One-hot encode House_size
[ 2]  Applying a function  to column Price.
[ 3]  Drop values drop in column Price_tag
[ 4]  Drop column Price_tag



### Scikit-learn scaling

In [36]:
pipeline_scale = pdp.Scale('StandardScaler',exclude_columns=['House_size_Medium','House_size_Small'])
df6 = pipeline_scale(df5)

AttributeError: module 'pdpipe' has no attribute 'Scale'

In [34]:
df6 = pipeline_scale(df5)

NameError: name 'pipeline_scale' is not defined

In [26]:
round(df6.sample(5),3)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
2063,2.067,-1.195,-1.379,-0.128,1.366,"06374 Martin Passage\nNew Shawnland, KS 59839-...",1,0
1001,-0.183,0.504,0.84,-1.328,-1.849,"827 Ferguson Isle\nRosebury, AL 61416-3167",0,0
3801,0.044,0.062,1.164,-0.352,0.3,Unit 8410 Box 5521\nDPO AP 20914-6877,0,0
4690,1.528,-0.738,0.071,0.123,0.048,"29600 Garcia Forest Suite 239\nWest Mark, NV 5...",0,0
3797,-1.155,-2.088,-1.443,-1.283,-1.671,"1629 James Pines\nPort John, LA 49420",1,0


### NLTK stages

In [32]:
pipeline_tokenize=pdp.TokenizeWords('Address')

[nltk_data] Downloading package punkt to /Users/rajkgupta/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [33]:
df7 = pipeline_tokenize(df6)

NameError: name 'df6' is not defined

In [29]:
df7.sample(5)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
2801,-2.16379,-0.161473,-1.32201,0.623847,-0.359488,"[525, Ashley, Course, Lake, Michelleville, ,, ...",0,0
646,-0.182429,-0.023628,-1.573084,0.104469,-0.187608,"[8961, Guerra, Motorway, Stephensburgh, ,, AR,...",0,0
838,-0.569561,0.185986,0.314022,0.532345,-0.206402,"[86533, Gould, Hills, Garciachester, ,, DE, 09...",0,0
4409,-0.320249,0.464002,0.014353,-1.373439,-0.458186,"[089, Smith, Gateway, Suite, 155, East, Christ...",0,0
1983,-0.004935,-0.580628,0.087245,0.836355,-0.598361,"[Unit, 8667, Box, 6237, DPO, AE, 76811-0261]",0,0


In [30]:
def extract_state(token):
    return str(token[-2])

In [31]:
pipeline_state = pdp.ApplyByCols('Address',extract_state,result_columns='State')

In [32]:
df8=pipeline_state(df7)

In [33]:
round(df8.sample(5),3)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,State,House_size_Medium,House_size_Small
2150,-0.902,0.709,1.156,-1.005,-1.566,AA,0,0
2330,0.042,0.092,-0.698,-0.412,-0.481,GU,0,0
3717,-1.454,0.991,1.011,-0.755,-1.877,CO,0,0
4187,-0.242,0.962,0.865,-0.975,-1.69,WV,0,0
260,-0.701,0.098,0.93,-0.989,-0.345,NV,0,0
