## A Simple Pipeline in machine learning quick revesion

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
df = sns.load_dataset("tips")

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.time.unique(), df.day.unique()

(['Dinner', 'Lunch']
 Categories (2, object): ['Lunch', 'Dinner'],
 ['Sun', 'Sat', 'Thur', 'Fri']
 Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun'])

In [5]:
df.isnull().sum() , df.info(), df.describe() , df.isna().sum() , df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


(total_bill    0
 tip           0
 sex           0
 smoker        0
 day           0
 time          0
 size          0
 dtype: int64,
 None,
        total_bill         tip        size
 count  244.000000  244.000000  244.000000
 mean    19.785943    2.998279    2.569672
 std      8.902412    1.383638    0.951100
 min      3.070000    1.000000    1.000000
 25%     13.347500    2.000000    2.000000
 50%     17.795000    2.900000    2.000000
 75%     24.127500    3.562500    3.000000
 max     50.810000   10.000000    6.000000,
 total_bill    0
 tip           0
 sex           0
 smoker        0
 day           0
 time          0
 size          0
 dtype: int64,
 total_bill     float64
 tip            float64
 sex           category
 smoker        category
 day           category
 time          category
 size             int64
 dtype: object)

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [7]:
df.time = encoder.fit_transform(df.time)
# df.day = encoder.fit_transform(df.day)
df.time.unique(), df.day.unique()

(array([0, 1]),
 ['Sun', 'Sat', 'Thur', 'Fri']
 Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun'])

In [8]:
x = df.drop(labels="time", axis=1)
y = df.time

In [9]:
df.day.value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [13]:
categorical_col = ['sex' , 'smoker' , 'day']
numerical_col = ['total_bill' , 'tip' , 'size']

### Creating a basic pipeline for the preprocessing of numerical and categorical verables

In [16]:
## Defining the pipeline for numerical columns

numerical_pipeline = Pipeline(
    steps =[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])


## Defining the pipeline for categorical columns
categorical_pipeline = Pipeline(
    steps =[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])



In [17]:
## This is the final preprocessor which will be used in the model

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_col),
        ('cat', categorical_pipeline, categorical_col)
    ]
)

In [20]:
preprocessor

In [21]:
type(preprocessor)

sklearn.compose._column_transformer.ColumnTransformer

In [22]:
## Now we will fit the preprocessor on the training data and transform the training and testing data

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import  DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [24]:
models = {
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'LogisticRegression': LogisticRegression()
}

In [25]:
## This function will fit the models on the training data and return the scores of the models on the testing data

def fit_and_score(models, x_train, x_test, y_train, y_test):
    model_scores = {}
    for name, model in models.items():
        model.fit(x_train, y_train)
        model_scores[name] = model.score(x_test, y_test)
    return model_scores

In [26]:
fit_and_score(models, x_train, x_test, y_train, y_test)

{'RandomForest': 0.9591836734693877,
 'DecisionTree': 0.9387755102040817,
 'LogisticRegression': 1.0}

In [27]:
params = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 20, 30, 40, 50],
            'max_depth': [5, 10, 15, 20, 25]
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [5, 10, 15, 20, 25]
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': [1, 10, 100, 1000]
        }
    }
}

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

cv = GridSearchCV(RandomForestClassifier(), params['RandomForest']['params'], cv=5, return_train_score=False)
cv.fit(x_train, y_train)

In [29]:
cv.best_params_

{'max_depth': 5, 'n_estimators': 10}