In [None]:
#  !pip install dfply
# installed dfply in Settings
import category_encoders as ce
import numpy as np
import pandas as pd
import pandas_profiling
import scikitplot as skplt
import seaborn as sns

# Star imports keep their original relative order (dfply last, so its names win any clash).
from plotnine import *
from dfply import *

from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer
from sklearn.metrics import mean_absolute_error


import warnings; warnings.filterwarnings('ignore') # Suppress warnings 

In [None]:
# Read the tips data, drop the duplicated row it contains, and rename the
# `tip` column to `target` for the modelling cells.
full_train_set = pd.read_csv("../input/tips.csv") >> distinct() >> rename(target=X.tip)

# Row number 0..n-1, taken from a freshly reset index.
full_train_set['rn'] = full_train_set.reset_index(drop=True).index

full_train_set >> head()

In [None]:
# Write a profiling report of the loaded data to initial.html.
(
    pandas_profiling
    .ProfileReport(full_train_set)
    .to_file("initial.html")
)

In [None]:
# Target against party size, showing the mean target at each size plus a fitted line.
sns.lmplot(
    data=full_train_set,
    x="size",
    y="target",
    x_estimator=np.mean,
);

In [None]:
# Target against total bill, one fitted line per value of `sex`.
sns.lmplot(
    data=full_train_set,
    x="total_bill",
    y="target",
    hue="sex",
    markers=["o", "x"],
    palette="Set1",
);

In [None]:
# Target against total bill, one fitted line per value of `day`.
sns.lmplot(
    data=full_train_set,
    x="total_bill",
    y="target",
    hue="day",
);

In [None]:
# Feature frame: everything except the target and the helper row number.
# It is deliberately NOT named `X`: that name is dfply's symbolic column
# accessor (used here and in other cells as `X.tip`, `X.total_bill`, ...).
# Rebinding it to a DataFrame makes this cell and the loading cell fail when
# re-run, and leaves later `X.<col>` selectors working only by accident.
features = full_train_set >> drop(X.target, X.rn)
y = full_train_set.target

# NOTE(review): test_size=0.75 holds out 75% of the rows for validation and
# trains on the remaining 25% -- confirm this is intended (vs. test_size=0.25).
X_train, X_val, y_train, y_val = train_test_split(features, y, test_size=0.75, random_state=1)

In [None]:
def dummy(train, test):
    """One-hot encode the categorical columns of ``train`` and ``test`` consistently.

    Both frames are encoded against the category levels observed in ``train``.
    Encoding each frame on its own and inner-joining the results silently
    discards a dummy column whenever a level is missing from either split.
    With a shared level list both frames produce the same dummy columns, and a
    level that appears only in ``test`` is flagged in the ``<col>_nan``
    indicator (``dummy_na=True``) instead of vanishing.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames. Object, string and categorical columns are encoded;
        other columns pass through unchanged. Neither input is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded ``train`` and ``test`` restricted to the columns they share,
        with the first level of every categorical column dropped.
    """
    # Work on copies so the caller's frames are never mutated.
    train = train.copy()
    test = test.copy()

    for col in train.columns:
        dtype = train[col].dtype
        is_categorical = (
            isinstance(dtype, pd.api.types.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        # Columns missing from `test` are removed by the inner align below.
        if not is_categorical or col not in test.columns:
            continue
        # Levels come from the training data only; unseen test values become NaN.
        levels = pd.Categorical(train[col]).categories
        train[col] = pd.Categorical(train[col], categories=levels)
        test[col] = pd.Categorical(test[col], categories=levels)

    train = pd.get_dummies(train, dummy_na=True, drop_first=True)
    test = pd.get_dummies(test, dummy_na=True, drop_first=True)

    # Keep only columns present in both frames (e.g. a column absent from one input).
    train, test = train.align(test, join='inner', axis=1)
    return train, test


# One-hot encode both splits; `dummy` returns them restricted to shared columns.
X_train, X_val = dummy(X_train, X_val)

X_train >> head()

In [None]:
# Power-transform and robust-scale every feature.
# make_column_transformer takes (transformer, columns) tuples; the reversed
# (columns, transformer) form is not accepted by current scikit-learn.
# Output column order follows the tuple order: every column except
# total_bill first, then total_bill.
preprocess = make_column_transformer(
    (make_pipeline(PowerTransformer(), RobustScaler()), list(X_train >> drop(X.total_bill))),
    (make_pipeline(PowerTransformer(), RobustScaler()), list(X_train >> select(X.total_bill))),
    remainder='passthrough'
)

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("alg", ExtraTreesRegressor(random_state = 9))
])

# fit model to train and predict on validation
model_fit = pipeline.fit(X_train, y_train)
preds = model_fit.predict(X_val)

# Validation mean absolute error (displayed as the cell output).
mean_absolute_error(y_val, preds)

In [None]:
# The model was trained on the preprocess step's output, whose columns are
# ordered "everything except total_bill, then total_bill". Label the
# importances in that order; X_train's own column order would attach the
# wrong names unless total_bill happened to be its last column.
skplt.estimators.plot_feature_importances(
    pipeline.named_steps['alg'],
    x_tick_rotation = 90,
    feature_names=list(X_train >> drop(X.total_bill)) + list(X_train >> select(X.total_bill))
)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# The 'alg' step was trained on the output of the 'preprocess' step, so it must
# be scored on validation data pushed through that same fitted step; feeding it
# raw X_val gives importances for mis-scaled, mis-ordered inputs.
X_val_prepared = pipeline.named_steps['preprocess'].transform(X_val)

# Names in the order the preprocess step emits them (total_bill last).
prepared_feature_names = list(X_val >> drop(X.total_bill)) + list(X_val >> select(X.total_bill))

eli5.show_weights(
    # Seeded so the permutation shuffles are reproducible between runs.
    PermutationImportance(pipeline.named_steps['alg'], random_state=9).fit(X_val_prepared, y_val),
    feature_names = prepared_feature_names
)

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error visualizer wrapped around the current `model_fit` pipeline.
visualizer = PredictionError(model_fit)
# fit() is called on the training split, then score() on the validation split.
visualizer.fit(X_train, y_train)
visualizer.score(X_val, y_val)
# NOTE(review): Yellowbrick 1.0 renamed poof() to show() -- confirm against the
# installed version before upgrading.
visualizer.poof()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Same preprocessing as the other models: power-transform and robust-scale
# every feature. make_column_transformer takes (transformer, columns) tuples;
# the reversed (columns, transformer) form is not accepted by current
# scikit-learn. Output column order: every column except total_bill first,
# then total_bill.
preprocess = make_column_transformer(
    (make_pipeline(PowerTransformer(), RobustScaler()), list(X_train >> drop(X.total_bill))),
    (make_pipeline(PowerTransformer(), RobustScaler()), list(X_train >> select(X.total_bill))),
    remainder='passthrough'
)

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("alg", GradientBoostingRegressor(random_state = 9))
])

# fit model to train and predict on validation
model_fit = pipeline.fit(X_train, y_train)
preds = model_fit.predict(X_val)

# Validation mean absolute error (displayed as the cell output).
mean_absolute_error(y_val, preds)

In [None]:
# The model was trained on the preprocess step's output, whose columns are
# ordered "everything except total_bill, then total_bill". Label the
# importances in that order; X_train's own column order would attach the
# wrong names unless total_bill happened to be its last column.
skplt.estimators.plot_feature_importances(
    pipeline.named_steps['alg'],
    x_tick_rotation = 90,
    feature_names=list(X_train >> drop(X.total_bill)) + list(X_train >> select(X.total_bill))
)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# The 'alg' step was trained on the output of the 'preprocess' step, so it must
# be scored on validation data pushed through that same fitted step; feeding it
# raw X_val gives importances for mis-scaled, mis-ordered inputs.
X_val_prepared = pipeline.named_steps['preprocess'].transform(X_val)

# Names in the order the preprocess step emits them (total_bill last).
prepared_feature_names = list(X_val >> drop(X.total_bill)) + list(X_val >> select(X.total_bill))

eli5.show_weights(
    # Seeded so the permutation shuffles are reproducible between runs.
    PermutationImportance(pipeline.named_steps['alg'], random_state=9).fit(X_val_prepared, y_val),
    feature_names = prepared_feature_names
)

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error visualizer wrapped around the current `model_fit` pipeline.
visualizer = PredictionError(model_fit)
# fit() is called on the training split, then score() on the validation split.
visualizer.fit(X_train, y_train)
visualizer.score(X_val, y_val)
# NOTE(review): Yellowbrick 1.0 renamed poof() to show() -- confirm against the
# installed version before upgrading.
visualizer.poof()

In [None]:
from sklearn import svm

# Same preprocessing as the other models: power-transform and robust-scale
# every feature. make_column_transformer takes (transformer, columns) tuples;
# the reversed (columns, transformer) form is not accepted by current
# scikit-learn. Output column order: every column except total_bill first,
# then total_bill.
preprocess = make_column_transformer(
    (make_pipeline(PowerTransformer(), RobustScaler()), list(X_train >> drop(X.total_bill))),
    (make_pipeline(PowerTransformer(), RobustScaler()), list(X_train >> select(X.total_bill))),
    remainder='passthrough'
)

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("alg", svm.SVR())
])

# fit model to train and predict on validation
model_fit = pipeline.fit(X_train, y_train)
preds = model_fit.predict(X_val)

# Validation mean absolute error (displayed as the cell output).
mean_absolute_error(y_val, preds)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# The 'alg' step was trained on the output of the 'preprocess' step, so it must
# be scored on validation data pushed through that same fitted step; feeding it
# raw X_val gives importances for mis-scaled, mis-ordered inputs.
X_val_prepared = pipeline.named_steps['preprocess'].transform(X_val)

# Names in the order the preprocess step emits them (total_bill last).
prepared_feature_names = list(X_val >> drop(X.total_bill)) + list(X_val >> select(X.total_bill))

eli5.show_weights(
    # Seeded so the permutation shuffles are reproducible between runs.
    PermutationImportance(pipeline.named_steps['alg'], random_state=9).fit(X_val_prepared, y_val),
    feature_names = prepared_feature_names
)

In [None]:
from yellowbrick.regressor import PredictionError

# Prediction-error visualizer wrapped around the current `model_fit` pipeline.
visualizer = PredictionError(model_fit)
# fit() is called on the training split, then score() on the validation split.
visualizer.fit(X_train, y_train)
visualizer.score(X_val, y_val)
# NOTE(review): Yellowbrick 1.0 renamed poof() to show() -- confirm against the
# installed version before upgrading.
visualizer.poof()

In [None]:
from mlxtend.regressor import StackingRegressor


# Initializing models
# The tree ensembles are stochastic; seed them with the same random_state the
# standalone ExtraTrees / GradientBoosting pipelines use so the stacked score
# is reproducible from run to run.
sv = svm.SVR()
gbr = GradientBoostingRegressor(random_state=9)
etr = ExtraTreesRegressor(random_state=9)

# Three base regressors, with ExtraTrees also acting as the meta-regressor.
stregr = StackingRegressor(regressors=[gbr, etr, sv], meta_regressor=etr)


# fit model to train and predict on validation
model_fit = stregr.fit(X_train, y_train)
preds = model_fit.predict(X_val)

# Validation mean absolute error (displayed as the cell output).
mean_absolute_error(y_val, preds)


In [None]:
# Attach predictions and actuals to the validation frame for the plots below.
# Predict from the feature columns only: once `preds` / `y_val` have been added,
# re-running this cell would otherwise feed those extra columns into the model.
feature_columns = [c for c in X_val.columns if c not in ("preds", "y_val")]

X_val = X_val >> mutate(
    preds = model_fit.predict(X_val[feature_columns]),
    y_val = y_val
)

sns.jointplot(x="y_val", y="preds", data=X_val, kind="reg");

In [None]:
# Predicted vs. actual scatter with a reference line and a smoothed trend.
(
    ggplot(X_val, aes(x='y_val', y='preds'))
    + geom_point(alpha=.3)
    + geom_abline(alpha=.5)
    + geom_smooth(alpha=.5)
    + theme_gray()
    + labs(title="Prediction Error")
).draw();



In [None]:
# Plot the residuals after fitting a linear model (predictions regressed on
# actuals), with a lowess trend line. x and y are passed by keyword: current
# seaborn accepts them as keyword arguments only.
sns.residplot(x=y_val, y=preds, lowess=True, color="g", data=X_val)