In [None]:
# Pandas

import pandas as pd
import torch
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['mpg', 'cyl', 'disp', 'hp', 'weight', 'acc', 'year', 'origin']

# Column names are passed explicitly rather than read from a header row.
df = pd.read_csv(
    url,
    names=column_names,
    sep=' ',
    skipinitialspace=True,  # ignore the extra spaces that follow a separator
    comment='\t',           # drop everything after a tab (presumably the car name - TODO confirm)
    na_values='?',          # '?' is read as a missing value
)
df

In [None]:
# A single column label selects a Series.
df.loc[:, 'mpg']

In [None]:
# First five entries of the column as a NumPy array.
df['mpg'].to_numpy()[:5]

In [None]:
# A list of column labels selects a DataFrame.
df.loc[:, ['mpg', 'hp']]

In [None]:
# First five rows of the two columns as a 2-D NumPy array.
df[['mpg', 'hp']].to_numpy()[:5]

In [None]:
# First five rows, selected by position.
df.head(5)

In [None]:
# Boolean mask: True for the rows whose 'year' equals 70.
cond = df['year'] == 70
df.loc[cond]

In [None]:
# Train-Validation-Test Split: Shuffling

# Peek at the first 50 'year' values in their original file order.
df['year'].to_numpy()[:50]

In [None]:
shuffled = (
    df.sample(frac=1, random_state=1)  # sampling every row = a seeded permutation
      .reset_index(drop=True)          # renumber rows 0..n-1, discarding the old index
)

In [None]:
from sklearn.model_selection import train_test_split

# shuffle=False keeps the current row order: the frame was already shuffled
# in a previous cell, so each split is a contiguous slice of it.
trainval, test = train_test_split(
    shuffled,
    test_size=0.16,
    shuffle=False,
)
# The remaining rows are split again into training and validation sets.
train, val = train_test_split(
    trainval,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Cleaning Data

# Cell-wise missing-value mask, then the number of missing attributes per row.
is_missing_attr = train.isnull()
n_missing_attr = is_missing_attr.sum(axis='columns')
# Show only the rows that have at least one missing attribute.
train.loc[n_missing_attr > 0]

In [None]:
# Drop the rows with missing values by rebinding the name instead of mutating
# the frame in place: `train` is derived from another frame, so an in-place
# drop can raise SettingWithCopyWarning, and rebinding keeps the cell safe to
# re-run.
train = train.dropna()
train

In [None]:
# Same clean-up for the other two splits. Rebinding (rather than
# `inplace=True`) avoids mutating frames derived from another frame and keeps
# the cell safe to re-run.
val = val.dropna()
test = test.dropna()

In [None]:
# Continuous Attributes

# Continuous columns. Later cells use cont_attr[1:] (everything except 'mpg')
# as the feature columns.
cont_attr = [
    'mpg',
    'disp',
    'hp',
    'weight',
    'acc',
]

In [None]:
# Continuous feature columns of the training split (all of cont_attr but the first).
train_features = train.loc[:, cont_attr[1:]]
# One histogram per feature.
train_features.hist()

In [None]:
# Column-wise statistics of the training features.
train_means = train_features.mean(axis=0)
train_standard_deviations = train_features.std(axis=0)
(train_means, train_standard_deviations)

In [None]:
# Standardize by hand: subtract the column mean, then divide by the column std.
train_centered = train_features - train_means
train_standardized_features = train_centered / train_standard_deviations
# Check the resulting per-column mean and standard deviation.
(train_standardized_features.mean(), train_standardized_features.std())

In [None]:
# One histogram per standardized training feature.
train_standardized_features.hist()

In [None]:
val_features = val.loc[:, cont_attr[1:]]
# The validation split is standardized with the statistics computed on the
# training features, not with its own.
val_centered = val_features - train_means
val_standardized_features = val_centered / train_standard_deviations
(val_standardized_features.mean(), val_standardized_features.std())

In [None]:
test_features = test.loc[:, cont_attr[1:]]
# Same training-set statistics as used for the other splits.
test_centered = test_features - train_means
test_standardized_features = test_centered / train_standard_deviations

In [None]:
from sklearn.preprocessing import StandardScaler 

# Let scikit-learn compute the standardization statistics from the training
# features, passed as a plain NumPy array.
scaler = StandardScaler()
scaler.fit(train_features.to_numpy())

In [None]:
# Per-feature mean and variance stored on the scaler by fit(). Note that var_
# is a variance (not a standard deviation), so it is not directly comparable
# to the pandas .std() values shown earlier.
scaler.mean_, scaler.var_

In [None]:
# Apply the scaler fitted on the training features to every split.
# The scaler was fitted on a bare NumPy array (no column names), so the splits
# are passed as arrays too; handing it DataFrames instead makes scikit-learn
# warn about a feature-name mismatch between fit and transform.
standardized_data = {
    'train': scaler.transform(train_features.values),
    'val': scaler.transform(val_features.values),
    'test': scaler.transform(test_features.values),
}

In [None]:
from sklearn.preprocessing import StandardScaler 

def standardize(df, cont_attr, scaler=None):
    """Standardize the selected columns of ``df`` and return them as a tensor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``cont_attr``.
    cont_attr : list of str
        Names of the columns to standardize.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, it is applied as-is and is
        never re-fitted, so statistics learned on one split (typically the
        training set) can be reused on others without leaking their
        statistics. When ``None``, a new ``StandardScaler`` is created and
        fitted on the selected columns of ``df``.

    Returns
    -------
    cont_X : torch.Tensor
        The transformed values as a ``torch.float32`` tensor.
    scaler
        The scaler that was used: the one passed in, or the newly fitted one.
    """
    cont_X = df[cont_attr].values
    if scaler is None:
        scaler = StandardScaler()
        # Fit only a scaler created here. Fitting unconditionally would
        # overwrite the statistics of a scaler supplied by the caller with
        # those of the current split.
        scaler.fit(cont_X)
    cont_X = scaler.transform(cont_X)
    cont_X = torch.as_tensor(cont_X, dtype=torch.float32)
    return cont_X, scaler

In [None]:
feature_names = cont_attr[1:]
standardized_data = {}
# The training split goes first, without a scaler argument, and yields one.
standardized_data['train'], scaler = standardize(train_features, feature_names)
# That scaler is then passed along when processing the other splits.
for split_name, split_features in (('val', val_features), ('test', test_features)):
    standardized_data[split_name], _ = standardize(split_features, feature_names, scaler)

In [None]:
# Discrete and Categorical Attributes

# Distinct 'cyl' values seen in the training split, in ascending order.
cyls = list(train['cyl'].unique())
cyls.sort()
cyls

In [None]:
# Map each distinct value to its position in the sorted list.
cyls_map = {value: position for position, value in enumerate(cyls)}
cyls_map

In [None]:
n_dim = 8
# One row of n_dim random (standard normal) numbers per distinct value.
lookup_table = torch.randn(len(cyls), n_dim)
lookup_table

In [None]:
# Row of the lookup table that corresponds to the value 6.
idx = cyls_map[6]
lookup_table[idx, :]

In [None]:
import torch.nn as nn 

# Embedding layer with one n_dim-sized row per distinct value.
emb_table = nn.Embedding(num_embeddings=len(cyls), embedding_dim=n_dim)

In [None]:
idx = cyls_map[6]
# The layer is called with a tensor of indices, so the index is wrapped in a list.
idx_tensor = torch.as_tensor([idx])
emb_table(idx_tensor)

In [None]:
# Identity matrix: row i is the one-hot vector for index i.
n_cyls = len(cyls)
ohe_table = torch.eye(n_cyls)
ohe_table

In [None]:
# One-hot row that corresponds to the value 6.
idx = cyls_map[6]
ohe_table[idx, :]

In [None]:
from sklearn.preprocessing import OrdinalEncoder
disc_attr = ['cyl', 'year', 'origin']

# Learn the categories of each discrete column from the training split.
# fit() returns the encoder itself, so creation and fitting can be chained.
encoder = OrdinalEncoder().fit(train[disc_attr])
encoder

In [None]:
# Categories stored on the encoder by fit(), one array per column of disc_attr.
encoder.categories_

In [None]:
# Encode the discrete columns of the training split and show the first five rows.
train_cat_features = encoder.transform(train.loc[:, disc_attr])
train_cat_features[:5]

In [None]:
# Raw discrete values of the first training row, for comparison with the encoded ones.
train.loc[:, disc_attr].iloc[0]

In [None]:
def encode(df, cat_attr, encoder=None):
    """Encode the selected columns of ``df`` and return them as an int tensor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``cat_attr``.
    cat_attr : list of str
        Names of the columns to encode.
    encoder : object with a ``transform`` method, optional
        An already-fitted encoder, applied as-is. When ``None``, a new
        ``OrdinalEncoder`` is created and fitted on the selected columns.

    Returns
    -------
    tuple
        ``(codes, encoder)``: the transformed values as a ``torch.int``
        tensor, and the encoder that produced them.
    """
    raw_values = df[cat_attr].values
    if encoder is None:
        # No encoder supplied: learn the categories from this frame.
        encoder = OrdinalEncoder().fit(raw_values)
    codes = encoder.transform(raw_values)
    return torch.as_tensor(codes, dtype=torch.int), encoder

In [None]:
cat_data = {}
# The training split is encoded first, without an encoder argument, and yields one.
cat_data['train'], encoder = encode(train, disc_attr)
# That encoder is then passed along when encoding the other splits.
for split_name, split_frame in (('val', val), ('test', test)):
    cat_data[split_name], _ = encode(split_frame, disc_attr, encoder)

In [None]:
# cylinders is the first (zero) column
cyl_indices = cat_data['train'][:, 0]
emb_table(cyl_indices)