## inspired by https://pbpython.com/categorical-encoding.html

In [231]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

from skmine.preprocessing import SLIMTransformer

In [232]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [233]:
import pandas as pd
import numpy as np

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df = df.dropna(how="any", axis=0)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


In [234]:
y = df["price"]

In [235]:
cats = df.select_dtypes(include=["object"])
cats.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
6,audi,gas,std,four,sedan,fwd,front,ohc,five,mpfi
8,audi,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi
10,bmw,gas,std,two,sedan,rwd,front,ohc,four,mpfi


In [236]:
def tolist(df):
    l = list()
    for line in df.itertuples(index=False, name=None):
        l.append(list(zip(df.columns, line)))
    return l

In [212]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [224]:
st = SLIMTransformer(k=30)
X_cats = st.fit_transform(list(tolist(cats)))
X_cats.isna().sum().sum()

could not discover 30 itemsets. Early stopped


0

In [214]:
conts = df.select_dtypes("number").drop("price", axis=1)
conts.head()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22
6,1,158.0,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110.0,5500.0,19,25
8,1,158.0,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140.0,5500.0,17,20
10,2,192.0,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101.0,5800.0,23,29


In [225]:
X = np.concatenate([X_cats, conts], axis=1)
X.shape

(159, 45)

In [229]:
lr = LinearRegression()
cross_val_score(lr, conts, y, scoring="neg_mean_absolute_error").mean()

-2493.8059488169665

### pipeline

In [74]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline

In [113]:
cat_pipe = Pipeline([
        ("to_list_of_list", FunctionTransformer(tolist)), 
        ("mdl_vect", SLIMTransformer(k=15)),
])
preprocessor = make_column_transformer(
    (cat_pipe, df.dtypes[df.dtypes == object].index.tolist()),
    #(OrdinalEncoder(), ['aspiration']),
    remainder="passthrough"
)

In [114]:
preprocessor.fit_transform(df).shape

(201, 31)

In [110]:
pipe = make_pipeline(preprocessor, LinearRegression())

## TODO
1. take linear regression with only continuous variables as a baseline
2. try different strategies of covering
  * standard cover order
  * standard candidate order
3. discuss interpretability
