In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Load the wine-quality data from a CSV file in the working directory.
dataset = pd.read_csv('WineQuality.csv')
# Preview the first five rows to check that the file parsed as expected.
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# Count the missing values in each column; any non-zero count needs imputing.
dataset.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     3
total sulfur dioxide    0
density                 0
pH                      0
sulphates               1
alcohol                 0
quality                 0
dtype: int64

In [85]:
# Features: every column except the last one.
X = dataset.iloc[:, :-1]
# Target: the last column (`quality`), kept as a one-column DataFrame.
# The slice must be `-1:`; `-2:-1` would pick the second-to-last column
# (`alcohol`), which is a feature already contained in X, not the label.
y = dataset.iloc[:, -1:]

In [86]:
# Remember the feature names as a plain Python list; they are needed later to
# label the columns of the array produced by the preprocessing pipeline.
col_list = X.columns.tolist()
print(col_list)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


The Pipeline constructor takes a list of name/estimator pairs defining a sequence of steps. All but
the last estimator must be transformers (i.e., they must have a fit_transform() method). The names can be
anything you like.


In [87]:
# Preview the feature matrix (first five rows) before any preprocessing.
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


Now let's remove some features using a pipeline.

Since Scikit-Learn does not provide any transformer for Pandas DataFrames, we need to create a simple custom transformer.

In [88]:
# Custom transformer
from sklearn.base import BaseEstimator, TransformerMixin

class MyDataframeSelector(BaseEstimator, TransformerMixin):
    """Transformer that drops a fixed set of columns from a pandas DataFrame.

    Parameters
    ----------
    del_features : list of str
        Labels of the columns to remove when `transform` is called.
    """

    def __init__(self, del_features):
        # scikit-learn expects every constructor argument to be stored,
        # unmodified, on an attribute with the *same name*: get_params(),
        # set_params(), clone() and the estimator repr all read it back by
        # that name. Storing it under a different name breaks those calls.
        self.del_features = del_features

    @property
    def features_to_remove(self):
        """Alias of `del_features`, kept for code that uses the old attribute name."""
        return self.del_features

    @features_to_remove.setter
    def features_to_remove(self, value):
        self.del_features = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self, X):
        """Return a new DataFrame without the `del_features` columns.

        Raises
        ------
        KeyError
            If any of the labels is not a column of `X` (raised by `DataFrame.drop`).
        """
        # DataFrame.drop already returns a new object, so the caller's frame
        # is left untouched and no extra .copy() is needed.
        return X.drop(self.del_features, axis=1)
        

In [89]:
# Transforming using Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
# Fall back to the old class so the notebook also runs on older installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer

# Preprocessing steps, applied in order:
#   selector - drop the columns we do not want to keep
#   imputer  - replace missing values with the column median
#   scalar   - standardise the remaining columns with StandardScaler
pipeline = Pipeline([
    ('selector', MyDataframeSelector(['chlorides', 'residual sugar'])),
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler())
])

In [90]:
# Fit every pipeline step on X and return the transformed features.
X_trans = pipeline.fit_transform(X)

In [94]:
# The pipeline's fit_transform returned a NumPy array here, so the result is
# converted back into a DataFrame.
# Read the dropped columns from the pipeline's selector step instead of
# repeating the list, so the labels cannot drift from what was actually removed.
dropped_features = pipeline.named_steps['selector'].features_to_remove
# Remove the deleted features from the column list.
col_list = [x for x in col_list if x not in dropped_features]
# Reuse X's index so the transformed rows stay aligned with the original data.
X_trans_df = pd.DataFrame(X_trans, columns=col_list, index=X.index)

In [95]:
# Preview the transformed features (first five rows).
X_trans_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.52836,0.961877,-1.391472,-0.467198,-0.379133,0.558274,1.288643,-0.579292,-0.960246
1,-0.298547,1.967442,-1.391472,0.87263,0.624363,0.028261,-0.719933,0.128879,-0.584777
2,-0.298547,1.297065,-1.18607,-0.08439,0.229047,0.134264,-0.331177,-0.048164,-0.584777
3,1.654856,-1.384443,1.484154,0.107014,0.4115,0.664277,-0.979104,-0.461264,-0.584777
4,-0.52836,0.961877,-1.391472,-0.467198,-0.379133,0.558274,1.288643,-0.579292,-0.960246
