In [1]:
import warnings

In [7]:
#!pip install seaborn
import seaborn as sns

tips_df = sns.load_dataset('tips')
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
tips_df.isnull().values.any() #Check if NaN value in the dataset
tips_df.isnull().any() #Check if NaN value in the columns
tips_df.isnull().any(axis=1) #Check if NaN value in the rows

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Length: 244, dtype: bool

In [10]:
import pandas as pd

#Label encoding
from sklearn.preprocessing import LabelEncoder
label_encoding = LabelEncoder()
tips_df.iloc[:,[2,3,4,5]] = tips_df.iloc[:,[2,3,4,5]].apply(label_encoding.fit_transform)
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [11]:
#visualizing the encodings in label encoding
label_encoding = LabelEncoder()
col_fit = label_encoding.fit(tips_df["day"])
dict(zip(col_fit.classes_, col_fit.transform(col_fit.classes_)))

{0: 0, 1: 1, 2: 2, 3: 3}

In [12]:
#One Hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
oh_encoding = ColumnTransformer([('OneHotEncoding', OneHotEncoder(),  [2,3,4,5])],remainder='passthrough')
tips_df_ohe = oh_encoding.fit_transform(tips_df)
tips_df_ohe

array([[ 1.  ,  0.  ,  1.  , ..., 16.99,  1.01,  2.  ],
       [ 0.  ,  1.  ,  1.  , ..., 10.34,  1.66,  3.  ],
       [ 0.  ,  1.  ,  1.  , ..., 21.01,  3.5 ,  3.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 22.67,  2.  ,  2.  ],
       [ 0.  ,  1.  ,  1.  , ..., 17.82,  1.75,  2.  ],
       [ 1.  ,  0.  ,  1.  , ..., 18.78,  3.  ,  2.  ]])

In [13]:
#Standardization
from sklearn.preprocessing import StandardScaler
zs = StandardScaler()
tips_df_std = zs.fit_transform(tips_df_ohe)
tips_df_std

array([[ 1.34335316e+00, -1.34335316e+00,  7.84789169e-01, ...,
        -3.14711305e-01, -1.43994695e+00, -6.00192629e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
        -1.06323531e+00, -9.69205340e-01,  4.53382921e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
         1.37779900e-01,  3.63355539e-01,  4.53382921e-01],
       ...,
       [-7.44405889e-01,  7.44405889e-01, -1.27422758e+00, ...,
         3.24629502e-01, -7.22971264e-01, -6.00192629e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
        -2.21286504e-01, -9.04025732e-01, -6.00192629e-01],
       [ 1.34335316e+00, -1.34335316e+00,  7.84789169e-01, ...,
        -1.13228903e-01,  1.24660453e-03, -6.00192629e-01]])

In [14]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
tips_df_std = minmax.fit_transform(tips_df_ohe)
tips_df_std

array([[1.        , 0.        , 1.        , ..., 0.29157939, 0.00111111,
        0.2       ],
       [0.        , 1.        , 1.        , ..., 0.1522832 , 0.07333333,
        0.4       ],
       [0.        , 1.        , 1.        , ..., 0.3757855 , 0.27777778,
        0.4       ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.41055718, 0.11111111,
        0.2       ],
       [0.        , 1.        , 1.        , ..., 0.30896523, 0.08333333,
        0.2       ],
       [1.        , 0.        , 1.        , ..., 0.32907415, 0.22222222,
        0.2       ]])

### Summary
1. Seaborn as sns: Used to load the tips dataset.
2. Sklearn.preprocessing: Includes tools like LabelEncoder, OneHotEncoder, StandardScaler, and MinMaxScaler for data preprocessing.
3. sns.load_dataset('tips'): Loads the tips dataset into a DataFrame.
4. DataFrame.isnull(): Checks if there are any missing values in the dataset, across columns and rows.
5. LabelEncoder: Converts categorical variables (sex, smoker, day, time) into numerical labels.
6. OneHotEncoder with ColumnTransformer: Transforms categorical variables into binary columns, representing each category.
7. StandardScaler: Scales the dataset so that each feature has a mean of 0 and a standard deviation of 1.
8. MinMaxScaler: Scales features to a specified range, typically between 0 and 1.