In [1]:
import pandas as pd
import numpy as np
from sklearn import pipeline,preprocessing,metrics,model_selection,ensemble
from sklearn_pandas import DataFrameMapper

In [2]:
data=pd.read_csv('../mpg_data_example.csv')

In [3]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [4]:
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [6]:
mapper = DataFrameMapper([
                        (['cylinders','displacement','weight','acceleration','model year'], preprocessing.StandardScaler()),
                        (['horsepower'],preprocessing.Imputer()),
                        (['origin'], preprocessing.OneHotEncoder())
                        ])



In [7]:
pipeline_obj = pipeline.Pipeline([
    ('mapper',mapper),
    ("model", ensemble.RandomForestRegressor())
])

In [8]:
data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [9]:
X=['cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin']
Y=['mpg']

In [10]:
pipeline_obj.fit(data[X],data[Y])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  self._final_estimator.fit(Xt, y, **fit_params)


Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[(['cylinders', 'displacement', 'weight', 'acceleration', 'model year'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['horsepower'], Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbo...ators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [11]:
pipeline_obj.predict(data[X])

array([16.9 , 14.7 , 17.2 , 16.3 , 16.6 , 14.9 , 14.1 , 13.8 , 13.6 ,
       14.7 , 14.8 , 14.3 , 14.9 , 15.1 , 24.  , 21.7 , 19.1 , 20.02,
       27.  , 26.45, 24.85, 24.1 , 24.8 , 24.7 , 20.4 , 10.9 , 11.8 ,
       12.  ,  9.7 , 27.  , 24.7 , 24.5 , 24.5 , 19.3 , 16.4 , 17.6 ,
       18.7 , 18.3 , 13.8 , 13.5 , 14.1 , 14.  , 12.1 , 13.  , 12.8 ,
       18.  , 22.65, 18.9 , 18.1 , 22.4 , 27.2 , 30.1 , 29.6 , 31.7 ,
       34.  , 27.65, 26.2 , 23.6 , 25.2 , 22.7 , 21.  , 21.2 , 13.2 ,
       13.5 , 14.7 , 14.2 , 15.9 , 11.5 , 12.7 , 12.5 , 13.1 , 18.9 ,
       14.8 , 13.4 , 13.7 , 14.1 , 18.79, 21.9 , 20.8 , 26.15, 22.25,
       25.1 , 22.7 , 26.8 , 26.3 , 13.3 , 14.5 , 13.2 , 14.15, 14.4 ,
       11.6 , 13.  , 13.4 , 14.  , 12.1 , 11.9 , 13.2 , 18.  , 16.7 ,
       19.  , 18.9 , 22.4 , 26.3 , 11.9 , 12.3 , 12.7 , 12.3 , 19.1 ,
       19.9 , 22.15, 23.9 , 21.6 , 20.1 , 21.1 , 25.5 , 14.2 , 15.  ,
       28.1 , 24.8 , 21.85, 19.65, 15.2 , 23.6 , 20.1 , 12.5 , 20.1 ,
       20.  , 19.1 ,

In [12]:
from sklearn.externals import joblib

In [13]:
joblib.dump(pipeline_obj,'RFModelforMPG.pkl')

['RFModelforMPG.pkl']

In [14]:
modelReload=joblib.load('RFModelforMPG.pkl')

In [15]:
modelReload.predict(data[X])

array([16.9 , 14.7 , 17.2 , 16.3 , 16.6 , 14.9 , 14.1 , 13.8 , 13.6 ,
       14.7 , 14.8 , 14.3 , 14.9 , 15.1 , 24.  , 21.7 , 19.1 , 20.02,
       27.  , 26.45, 24.85, 24.1 , 24.8 , 24.7 , 20.4 , 10.9 , 11.8 ,
       12.  ,  9.7 , 27.  , 24.7 , 24.5 , 24.5 , 19.3 , 16.4 , 17.6 ,
       18.7 , 18.3 , 13.8 , 13.5 , 14.1 , 14.  , 12.1 , 13.  , 12.8 ,
       18.  , 22.65, 18.9 , 18.1 , 22.4 , 27.2 , 30.1 , 29.6 , 31.7 ,
       34.  , 27.65, 26.2 , 23.6 , 25.2 , 22.7 , 21.  , 21.2 , 13.2 ,
       13.5 , 14.7 , 14.2 , 15.9 , 11.5 , 12.7 , 12.5 , 13.1 , 18.9 ,
       14.8 , 13.4 , 13.7 , 14.1 , 18.79, 21.9 , 20.8 , 26.15, 22.25,
       25.1 , 22.7 , 26.8 , 26.3 , 13.3 , 14.5 , 13.2 , 14.15, 14.4 ,
       11.6 , 13.  , 13.4 , 14.  , 12.1 , 11.9 , 13.2 , 18.  , 16.7 ,
       19.  , 18.9 , 22.4 , 26.3 , 11.9 , 12.3 , 12.7 , 12.3 , 19.1 ,
       19.9 , 22.15, 23.9 , 21.6 , 20.1 , 21.1 , 25.5 , 14.2 , 15.  ,
       28.1 , 24.8 , 21.85, 19.65, 15.2 , 23.6 , 20.1 , 12.5 , 20.1 ,
       20.  , 19.1 ,

In [None]:
temp={}
temp['cylinders']=1
temp['displacement']=2
temp['horsepower']=3
temp['weight']=4
temp['acceleration']=5
temp['model year']=6
temp['origin']=1

In [None]:
testDtaa=pd.DataFrame({'x':temp}).transpose()

In [None]:
modelReload.predict(testDtaa)[0]