## Handling missing Values using sklearn

In [161]:
!pip install -q icecream
# To print variable name along with the contents
from icecream import ic

# Configure the output to include variable names
ic.configureOutput(prefix='', includeContext=False)

In [162]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [163]:
df = pd.read_csv('/content/supermarket_sales_frac_missing_val.csv').drop(columns=['Unnamed: 0'])

In [164]:
df

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,451-28-5717,C,Naypyitaw,,,,83.17,6.0,24.9510,523.9710,3/20/2019,11:23,Cash,499.02,4.761905,24.9510,7.3
1,137-63-5492,C,Naypyitaw,,Male,,,10.0,29.3800,616.9800,1/29/2019,14:26,Ewallet,587.60,4.761905,29.3800,9.0
2,733-29-1227,C,Naypyitaw,Normal,Male,Home and lifestyle,55.61,7.0,19.4635,408.7335,3/23/2019,12:41,Cash,389.27,4.761905,19.4635,8.5
3,322-02-2271,B,Mandalay,,Female,Sports and travel,,3.0,6.4455,135.3555,2/3/2019,11:46,,128.91,4.761905,6.4455,9.3
4,569-71-4390,B,Mandalay,Normal,Male,,,2.0,2.1870,45.9270,1/25/2019,14:29,Ewallet,43.74,4.761905,2.1870,6.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,257-73-1380,C,Naypyitaw,,Male,,82.93,,16.5860,348.3060,1/20/2019,16:51,,331.72,4.761905,16.5860,9.6
96,787-87-2010,A,Yangon,Member,Male,Health and beauty,55.50,,11.1000,233.1000,1/20/2019,15:48,Credit card,222.00,4.761905,11.1000,6.6
97,504-35-8843,A,Yangon,Normal,Male,Sports and travel,42.47,,2.1235,44.5935,1/2/2019,16:57,,42.47,4.761905,2.1235,5.7
98,199-75-8169,A,Yangon,Member,Male,Sports and travel,15.81,,7.9050,166.0050,3/6/2019,12:27,Credit card,158.10,4.761905,7.9050,8.6


In [165]:
impute = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [166]:
impute.fit(df)

In [167]:
imputed_arr = impute.transform(df)

Show and explain the documentation for simple imputer


*   fit()
*   fit_transform()
* transform()



https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer

In [168]:
imputed_df = pd.DataFrame(imputed_arr, columns=df.columns)

In [169]:
imputed_df

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,451-28-5717,C,Naypyitaw,Normal,Male,Home and lifestyle,83.17,6.0,24.951,523.971,3/20/2019,11:23,Cash,499.02,4.761905,24.951,7.3
1,137-63-5492,C,Naypyitaw,Normal,Male,Home and lifestyle,10.99,10.0,29.38,616.98,1/29/2019,14:26,Ewallet,587.6,4.761905,29.38,9.0
2,733-29-1227,C,Naypyitaw,Normal,Male,Home and lifestyle,55.61,7.0,19.4635,408.7335,3/23/2019,12:41,Cash,389.27,4.761905,19.4635,8.5
3,322-02-2271,B,Mandalay,Normal,Female,Sports and travel,10.99,3.0,6.4455,135.3555,2/3/2019,11:46,Ewallet,128.91,4.761905,6.4455,9.3
4,569-71-4390,B,Mandalay,Normal,Male,Home and lifestyle,10.99,2.0,2.187,45.927,1/25/2019,14:29,Ewallet,43.74,4.761905,2.187,6.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,257-73-1380,C,Naypyitaw,Normal,Male,Home and lifestyle,82.93,6.0,16.586,348.306,1/20/2019,16:51,Ewallet,331.72,4.761905,16.586,9.6
96,787-87-2010,A,Yangon,Member,Male,Health and beauty,55.5,6.0,11.1,233.1,1/20/2019,15:48,Credit card,222.0,4.761905,11.1,6.6
97,504-35-8843,A,Yangon,Normal,Male,Sports and travel,42.47,6.0,2.1235,44.5935,1/2/2019,16:57,Ewallet,42.47,4.761905,2.1235,5.7
98,199-75-8169,A,Yangon,Member,Male,Sports and travel,15.81,6.0,7.905,166.005,3/6/2019,12:27,Credit card,158.1,4.761905,7.905,8.6


In [170]:
numerical_features = df.select_dtypes(include=[np.number])
categorical_features = df.select_dtypes(exclude=[np.number])

print('Features not included in either numerical_features or categorical_features:')
set(df.columns).symmetric_difference(set(numerical_features.columns).union(set(categorical_features.columns)))

Features not included in either numerical_features or categorical_features:


set()

### Imputation in one go

In [171]:
numerical_imputer = SimpleImputer(strategy="median")
categorical_imputer = SimpleImputer(strategy="most_frequent")

numerical_imputed_arr = numerical_imputer.fit_transform(numerical_features)
categorical_imputed_arr = categorical_imputer.fit_transform(categorical_features)

numerical_imputed_df = pd.DataFrame(numerical_imputed_arr, columns=numerical_features.columns)
categorical_imputed_df = pd.DataFrame(categorical_imputed_arr, columns=categorical_features.columns)

imputed_df = pd.merge(numerical_imputed_df, categorical_imputed_df, left_index=True, right_index=True)

In [172]:
imputed_df.describe()

Unnamed: 0,Unit price,Quantity,Tax 5%,Total,cogs,gross margin percentage,gross income,Rating
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,53.8466,5.83,15.5509,326.5689,311.018,4.761905,15.5509,6.81
std,23.285683,2.38283,11.51525,241.820249,230.304999,1.249714e-14,11.51525,1.849352
min,10.99,1.0,1.476,30.996,29.52,4.761905,1.476,4.0
25%,40.71,4.75,6.09375,127.96875,121.875,4.761905,6.09375,5.0
50%,52.89,6.0,11.90775,250.06275,238.155,4.761905,11.90775,6.7
75%,64.125,7.0,21.790375,457.597875,435.8075,4.761905,21.790375,8.5
max,99.73,10.0,44.8785,942.4485,897.57,4.761905,44.8785,9.9


In [173]:
%%script echo skipping
!pip install -q ydata_profiling
from ydata_profiling import ProfileReport
profile = ProfileReport(imputed_df, title="Profiling Report")
profile

skipping


## Encoding categorical features

What is N.O.I.R?

https://scikit-learn.org/stable/api/sklearn.preprocessing.html

In [174]:
df = imputed_df.copy(deep=True)
numerical_features = df.select_dtypes(include=[np.number])
categorical_features = df.select_dtypes(exclude=[np.number])

In [175]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [176]:
product_line_df = df[['Product line']]

In [177]:
enc = OrdinalEncoder(dtype=int)
product_line_encoded_arr = enc.fit_transform(product_line_df)

In [178]:
ic(enc.categories_)
ic(enc.n_features_in_)
ic(enc.feature_names_in_)

enc.categories_: [array(['Electronic accessories', 'Fashion accessories',
                        'Food and beverages', 'Health and beauty', 'Home and lifestyle',
                        'Sports and travel'], dtype=object)]
enc.n_features_in_: 1
enc.feature_names_in_: array(['Product line'], dtype=object)


array(['Product line'], dtype=object)

In [179]:
product_line_encoded_df = pd.DataFrame(product_line_encoded_arr, columns=product_line_df.columns)

In [180]:
product_line_encoded_df[['decoded Product line']] = enc.inverse_transform(product_line_encoded_arr)

In [181]:
product_line_encoded_df.head()

Unnamed: 0,Product line,decoded Product line
0,4,Home and lifestyle
1,4,Home and lifestyle
2,4,Home and lifestyle
3,5,Sports and travel
4,4,Home and lifestyle


In [182]:
product_line_df.head()

Unnamed: 0,Product line
0,Home and lifestyle
1,Home and lifestyle
2,Home and lifestyle
3,Sports and travel
4,Home and lifestyle


Unnecessary order required?

In [183]:
# Enter OneHotEncoder


### Transforming floating point features
https://scikit-learn.org/stable/api/sklearn.preprocessing.html

In [184]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)
scaled_df = pd.DataFrame(scaled_features, columns=numerical_features.columns)

scaled_numerical_full_df = pd.merge(numerical_features, scaled_df, suffixes=['_org', '_tr'], right_index=True, left_index=True)



In [185]:
scaled_numerical_full_df.columns = sorted(scaled_numerical_full_df.columns)

In [186]:
scaled_numerical_full_df

Unnamed: 0,Quantity_org,Quantity_tr,Rating_org,Rating_tr,Tax 5%_org,Tax 5%_tr,Total_org,Total_tr,Unit price_org,Unit price_tr,cogs_org,cogs_tr,gross income_org,gross income_tr,gross margin percentage_org,gross margin percentage_tr
0,83.17,6.0,24.9510,523.9710,499.02,4.761905,24.9510,7.3,1.265633,0.071703,0.820430,0.820430,0.820430,8.881784e-16,0.820430,0.266293
1,52.89,10.0,29.3800,616.9800,587.60,4.761905,29.3800,9.0,-0.041288,1.758836,1.206988,1.206988,1.206988,8.881784e-16,1.206988,1.190165
2,55.61,7.0,19.4635,408.7335,389.27,4.761905,19.4635,8.5,0.076110,0.493486,0.341487,0.341487,0.341487,8.881784e-16,0.341487,0.918438
3,52.89,3.0,6.4455,135.3555,128.91,4.761905,6.4455,9.3,-0.041288,-1.193647,-0.794709,-0.794709,-0.794709,8.881784e-16,-0.794709,1.353201
4,52.89,2.0,2.1870,45.9270,43.74,4.761905,2.1870,6.9,-0.041288,-1.615430,-1.166386,-1.166386,-1.166386,8.881784e-16,-1.166386,0.048911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,82.93,6.0,16.5860,348.3060,331.72,4.761905,16.5860,9.6,1.255274,0.071703,0.090342,0.090342,0.090342,8.881784e-16,0.090342,1.516237
96,55.50,6.0,11.1000,233.1000,222.00,4.761905,11.1000,6.6,0.071363,0.071703,-0.388469,-0.388469,-0.388469,8.881784e-16,-0.388469,-0.114125
97,42.47,6.0,2.1235,44.5935,42.47,4.761905,2.1235,5.7,-0.491028,0.071703,-1.171928,-1.171928,-1.171928,8.881784e-16,-1.171928,-0.603234
98,15.81,6.0,7.9050,166.0050,158.10,4.761905,7.9050,8.6,-1.641705,0.071703,-0.667325,-0.667325,-0.667325,8.881784e-16,-0.667325,0.972783
