In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from copy import copy
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import f1_score
from sklearn import ensemble
from sklearn import svm

In [3]:
df = pd.read_csv('../datasets/airline_passenger_satisfaction.csv').drop('ID',axis=1)

In [4]:
df.drop('Arrival Delay', axis=1, inplace=True)

In [5]:
X = df.iloc[:10000,:-1]
y = df['Satisfaction'].values[:10000]

In [6]:
y = OrdinalEncoder().fit_transform(y.reshape(-1,1)).ravel()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=42,stratify=y)

# Usando somente o pipeline

In [8]:
pre = ColumnTransformer(transformers=[
  ('cat', OneHotEncoder(sparse=False), ['Gender','Customer Type','Type of Travel','Class']),
  ('num', StandardScaler(), [x for x in range(21) if x not in [0,2,3,4]])
], verbose=True)

In [9]:
pipe = Pipeline(steps=[
  ('pre', pre),
  ('est', svm.SVC(random_state=42,C=100,gamma=0.01,kernel='rbf'))
], verbose=True)

In [10]:
pipe.fit(X_train,y_train)

[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ............... (step 1 of 2) Processing pre, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing est, total=   0.9s


Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(sparse=False),
                                                  ['Gender', 'Customer Type',
                                                   'Type of Travel', 'Class']),
                                                 ('num', StandardScaler(),
                                                  [1, 5, 6, 7, 8, 9, 10, 11, 12,
                                                   13, 14, 15, 16, 17, 18, 19,
                                                   20])],
                                   verbose=True)),
                ('est', SVC(C=100, gamma=0.01, random_state=42))],
         verbose=True)

In [11]:
y_pred = pipe.predict(X_test)

In [12]:
f1_score(y_test, y_pred)

0.9519343493552169

# Usando pipeline com cross_val_score

In [13]:
pipe2 = Pipeline(steps=[
  ('pre', pre),
  ('est', svm.SVC(random_state=42,C=100,gamma=0.01,kernel='rbf'))
], verbose=True)

In [14]:
cross_val_score(pipe2, X_train, y_train, cv=3, scoring='f1')

[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ............... (step 1 of 2) Processing pre, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing est, total=   0.4s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ............... (step 1 of 2) Processing pre, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing est, total=   0.4s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ............... (step 1 of 2) Processing pre, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing est, total=   0.5s


array([0.94909252, 0.9515918 , 0.94478528])

# Usando pipeline com GridSearchCV

In [15]:
pipe3 = Pipeline(steps=[
  ('pre', pre),
  ('est', svm.SVC(random_state=42))
], verbose=True)

In [16]:
res = pipe3.get_params()

In [17]:
type(res)

dict

In [18]:
res.keys()

dict_keys(['memory', 'steps', 'verbose', 'pre', 'est', 'pre__n_jobs', 'pre__remainder', 'pre__sparse_threshold', 'pre__transformer_weights', 'pre__transformers', 'pre__verbose', 'pre__verbose_feature_names_out', 'pre__cat', 'pre__num', 'pre__cat__categories', 'pre__cat__drop', 'pre__cat__dtype', 'pre__cat__handle_unknown', 'pre__cat__sparse', 'pre__num__copy', 'pre__num__with_mean', 'pre__num__with_std', 'est__C', 'est__break_ties', 'est__cache_size', 'est__class_weight', 'est__coef0', 'est__decision_function_shape', 'est__degree', 'est__gamma', 'est__kernel', 'est__max_iter', 'est__probability', 'est__random_state', 'est__shrinking', 'est__tol', 'est__verbose'])

In [19]:
grid_search = GridSearchCV(pipe3, {'est__C': [0.1, 1, 10, 100, 200, 1000],'est__gamma': [1, 0.1, 0.01, 0.001, 0.0001],'est__kernel': ['rbf']}, scoring='f1',cv=3, verbose=3, n_jobs=-1)

In [20]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ............... (step 1 of 2) Processing pre, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing est, total=   1.2s
[CV 1/3] END est__C=0.1, est__gamma=1, est__kernel=rbf;, score=0.000 total time=   2.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ............... (step 1 of 2) Processing pre, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing est, total=   1.3s
[CV 2/3] END est__C=0.1, est__gamma=1, est__kernel=rbf;, score=0.000 total time=   2.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[Pipeline] ............... (step 1 of 2) Processing p

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('pre',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(sparse=False),
                                                                         ['Gender',
                                                                          'Customer '
                                                                          'Type',
                                                                          'Type '
                                                                          'of '
                                                                          'Travel',
                                                                          'Class']),
                                                                        ('num',
                                                                         StandardScaler()

In [21]:
y_pred = grid_search.predict(X_test)

In [22]:
f1_score(y_test, y_pred)

0.9519343493552169

# Testando com varios modelos

In [23]:
pred_models = {}
pred_models['svm']    = svm.SVC(random_state=42)
pred_models['rf']     = ensemble.RandomForestClassifier(random_state=42)
pred_models['gb']     = ensemble.GradientBoostingClassifier(random_state=42)

In [24]:
for k,m in pred_models.items():
  pipe = Pipeline(steps=[
    ('pre', pre),
    ('est', copy(m))
  ])
  
  if k == 'rf':
    param_grid = {
      'est__n_estimators': [80,100,120,130,140],
      'est__max_depth': [None,20,50],
      'est__min_samples_split': [2,3,4],
      'est__min_samples_leaf': [1,2,3]}
  
  elif k == 'gb':
    param_grid = {
      'est__learning_rate': [0.05,0.1,0.15,0.2],
      'est__n_estimators': [100,130,140,150],
      'est__max_depth': [2,5,8,10]}
  
  else:
    param_grid = {
      'est__C': [0.1, 1, 10, 100, 1000],
      'est__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
      'est__kernel': ['rbf']}

  grid_search = GridSearchCV(pipe, param_grid, scoring='f1',cv=3, n_jobs=-1, verbose=2)
  
  grid_search.fit(X_train, y_train)

  best_pa = grid_search.best_params_
  y_pred_train = grid_search.predict(X_train)
  y_pred_test = grid_search.predict(X_test)
  train_score = f1_score(y_train, y_pred_train)
  test_score = f1_score(y_test, y_pred_test)

  print(k)
  print(best_pa)
  print(train_score)
  print(test_score)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransform

gb \
best: {'est__learning_rate': 0.15, 'est__max_depth': 8, 'est__n_estimators': 100} \
score_train: 1.0 \
score_test: 0.9611764705882354

rf \
best: {'est__max_depth': None, 'est__min_samples_leaf': 1, 'est__min_samples_split': 4, 'est__n_estimators': 120} \
score_train: 0.99767644496079 \
score_test: 0.9574970484061393

svm \
best: {'est__C': 100, 'est__gamma': 0.01, 'est__kernel': 'rbf'} \
score_train: 0.969165570656145 \
score_test: 0.9519343493552169 \