## Setup stuff (don't edit)

In [1]:
# basic imports for libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# model imports
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# evaluation and training imports
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# preprocessing imports
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
# install required package
!pip install ucimlrepo

In [3]:
# import additional requirements
from ucimlrepo import fetch_ucirepo
import os

In [4]:
# create data folder (no-op when the folder already exists; raises if 'data' exists but is not a directory)
os.makedirs('data', exist_ok=True)

In [None]:
# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X_adult = adult.data.features
# work on a copy so the label clean-up below does not modify the frame held by `adult`
# (assigning into it directly can also trigger pandas' SettingWithCopyWarning if it is a slice)
y_adult = adult.data.targets.copy()

# minor preprocessing for the target (classes should only be <=50K and >50K):
# labels written with a trailing "." are folded into the plain labels
y_adult['income'] = y_adult['income'].map({"<=50K.": "<=50K", ">50K.": ">50K",
                                           "<=50K": "<=50K", ">50K": ">50K"})

# drop problematic columns, repetitive columns and ID column
X_adult = X_adult.drop(columns=["race", "fnlwgt", "education-num", "sex"])

# ensure all null values are represented as NaN
X_adult = X_adult.replace("?", np.nan)

In [6]:
# Put the features and the income target side by side in a single DataFrame
adult_data = pd.concat(
    [X_adult, y_adult],
    axis="columns",
)

# Section 1: California Housing Dataset

In [7]:
# Load the California housing data; the file is expected at data/housing.csv.
# NOTE(review): the setup cells shown here only create the data folder - confirm how the CSV gets there
housing_data = pd.read_csv("data/housing.csv")
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


We can get a pretty good overview of the data just looking at it like this, but we can also use `.info()` and `.describe()` on a pandas DataFrame to learn some more about the data, and `.unique()` on a specific column to see all the values (in the case of a categorical feature, to see all the categories).

In [8]:
# summary statistics; a lower `count` than the other columns points to missing values
housing_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [9]:
# column dtypes and non-null counts
housing_data.info()
# the distinct categories of the one non-numeric column
housing_data["ocean_proximity"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

This dataset has:
- 8 numerical features: `longitude`, `latitude`, `housing_median_age`, `total_rooms`, `total_bedrooms`, `population`, `households`, and `median_income`
- 1 categorical feature: `ocean_proximity`, with the categories 'NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', and 'ISLAND'
- the target variable: `median_house_value` - since this is a number, this is a _regression_ problem.

Since the numerical features are on drastically different scales (compare `median_income`, which tops out around 15, with `total_rooms`, which reaches 39,320), we will need to apply **scaling** to the data. We also have a categorical feature that we will need to handle with some kind of **encoding**. Also notice that the `total_bedrooms` feature has a lower count than the other features. This means that there are some missing values in that column! We'll need to apply **imputation**.

Let's start by splitting our training and testing data. Drop the target column from the dataset to get `X` and `y`. **Let's also drop `ocean_proximity` and `total_bedrooms`, so we can focus entirely on scaling.**

In [10]:
# TODO: Split housing_data into features (X) and target (y). The target is median_house_value.
# Also drop ocean_proximity and total_bedrooms from the features
#
# HINT: use the pandas .drop() function
#

X = ...
y = ...

# TODO: Split X and y into train and test sets with train_test_split. Use a random state of 123. 30% of the data should be in the test set
# Then print out X_train and y_train below
#
# HINT: train_test_split returns X_train, X_test, y_train, y_test in that order
# ...

We'll use a `KNeighborsRegressor` as our model. This model relies on measuring distances between points, so we should see a huge difference in score once scaling is applied. Before we scale, let's try to use the data as is. Use [this documentation link](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html) to get familiar with `KNeighborsRegressor`. Feel free to play around with `n_neighbors` to improve the score.

In [11]:
# TODO: Train and score KNeighborsRegressor on the unscaled data
#
# HINT: Make sure that you use the American spelling of "neighbors"
# ...

That initial score is pretty bad! Let's see how much scaling improves it.

## Scaling 
Try both `MinMaxScaler` (documentation [here](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)) and `StandardScaler` ([here](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)) to see if one method is better than the other. Remember to use `fit_transform()` on the _training_ data and `transform()` on the _testing_ data to avoid breaking the golden rule. After scaling the data, try fitting and scoring the model again to see how much it improves.

In [12]:
# TODO: Use MinMaxScaler or StandardScaler to scale the data
#
# HINT: use fit_transform() on the training data and only transform() on the testing data

scaler = ...
X_train_scaled = ...
X_test_scaled = ...

In [13]:
# Since preprocessors generally return NumPy arrays instead of pandas DataFrames, convert to DataFrame for easier reading
# TODO: Uncomment the lines below to see your scaled data as DataFrames
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns.to_list())
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns.to_list())
# X_train_scaled

In [14]:
# TODO: Train and score KNeighborsRegressor on the scaled data
#
# HINT: Make sure that you use the American spelling of "neighbors"
# HINT: Make sure that you're using the scaled X_train and X_test
# ...

Much better!

## ColumnTransformer
We've scaled the data, but we also want to apply encoding and imputation on it. There's a better way to do multiple preprocessing steps on the same dataset - a `ColumnTransformer`! To use the `ColumnTransformer` we need to specify:
- Which preprocessor(s) we want to use (e.g. `StandardScaler`)
- Which column(s) we want a particular preprocessor to modify (e.g. only numerical features)

Fill in the lists below to sort the columns into numerical, categorical, and containing missing values.

In [15]:
# Resetting X, y, X_train, X_test, y_train, and y_test
# (only the target is dropped this time, so every feature column stays in X)
X = housing_data.drop(columns=["median_house_value"])
y = housing_data["median_house_value"]
# Same split settings as the exercises prescribe (30% test set, random state 123), so that
# scores obtained before and after the ColumnTransformer are comparable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [16]:
# TODO: Fill in the lists below with the appropriate feature names 
# The features are 'longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms',
# 'population', 'households', 'median_income', and 'ocean_proximity'
#
# HINT: Features can be in one, many, or none of the lists
# NOTE: replace the `...` placeholder with the remaining feature names. The ColumnTransformer cell
# further down only reads numerical_features and categorical_features

numerical_features = ['longitude', ...]
categorical_features = []
missing_values_features = []

## Encoding Categorical Features
There are two options to encode categorical features:
- [`OrdinalEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html), which is best for _ordinal categorical features_. It will assign each category a number, e.g. small = 1, medium = 2, large = 3
- [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html), which is best for _nominal categorical features_. It will create a new column for each category and use 1s and 0s to represent membership in that category

It is up to you to decide whether `OrdinalEncoder` or `OneHotEncoder` is a better fit for the `ocean_proximity` feature. Try both, by yourself or with a partner, and see which one produces a better score.

## Imputation
We will use [`SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html) to fill in the missing values in the `total_bedrooms` column.

## Constructing the ColumnTransformer
Assign one of `scaler`, `ordinal`, `one_hot`, or `imputer` to the `a`, `b`, and `c` variables below to fill in the correct steps of the ColumnTransformer. If you want to use `OrdinalEncoder` for `ocean_proximity`, change `ordered_categories` to be whichever order of categories you think is correct.

In [17]:
# TODO: Assign values to a, b, and c using scaler, ordinal, one_hot, and imputer, e.g. a = scaler
# If you want to use OrdinalEncoder, change ordered_categories to the order you think is correct
#
# HINT: Think about which groups of features are being modified by a, b, and c, and in which order
#

ordered_categories = ['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND']

# Candidate preprocessing steps to choose from
scaler = StandardScaler()
ordinal = OrdinalEncoder(dtype=int, categories=[ordered_categories])
one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
imputer = SimpleImputer(strategy='most_frequent')

a = ...
b = ...
c = ...

# a and b are chained into one pipeline for the numerical features; c handles the categorical ones
numerical_steps = make_pipeline(a, b)
ct = make_column_transformer(
    (numerical_steps, numerical_features),
    (c, categorical_features),
)

## Applying the ColumnTransformer
Now use the `ColumnTransformer` with `X_train` and `X_test` exactly as you did with the scalers. Remember to use `fit_transform()` on the _training_ data and `transform()` on the _testing_ data to avoid breaking the golden rule. After preprocessing the data, try fitting and scoring the model again to see how much it improves.

In [18]:
# TODO: Use the ColumnTransformer to apply all the preprocessing steps to X_train and X_test
#
# HINT: use fit_transform() on the training data and only transform() on the testing data

X_train_transformed = ...
X_test_transformed = ...

In [19]:
# Since preprocessors generally return NumPy arrays instead of pandas DataFrames, convert to DataFrame for easier reading
# TODO: Uncomment the lines below to see your preprocessed data as DataFrames
# X_train_transformed = pd.DataFrame(data=X_train_transformed, columns=ct.get_feature_names_out(), index=X_train.index)
# X_test_transformed = pd.DataFrame(data=X_test_transformed, columns=ct.get_feature_names_out(), index=X_test.index)
# X_train_transformed

In [20]:
# TODO: Train and score KNeighborsRegressor on the preprocessed data
#
# HINT: Make sure that you use the American spelling of "neighbors"
# HINT: Make sure that you're using the transformed X_train and X_test
# ...

A small improvement, but an improvement either way!

# Section 2: Census Dataset

In [21]:
# preview the combined features + income target assembled in the setup cells
adult_data

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,0,0,36,United-States,<=50K
48838,64,,HS-grad,Widowed,,Other-relative,0,0,40,United-States,<=50K
48839,38,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,0,0,50,United-States,<=50K
48840,44,Private,Bachelors,Divorced,Adm-clerical,Own-child,5455,0,40,United-States,<=50K


This dataset has the target variable `income`, which can take one of two values: `<=50K` or `>50K`. It also includes a mix of numerical and categorical features, and may have missing values.

It's your turn to do preprocessing from scratch! Use the code above and scikit-learn documentation to help you.

## Exploratory Data Analysis (EDA)
Use `.info()`, `.describe()`, and `.unique()` to learn more about the data. Identify what preprocessing steps need to be taken.

In [22]:
# TODO: Perform EDA!
# ...

_TODO: Double-click on this cell and write here what you noticed and which preprocessing steps you need to do!_

...

## Splitting Training and Testing Data
Split `adult_data` into `X` and `y`, and further split the data into training and testing sets using `train_test_split`.

In [23]:
# TODO: Split adult_data into features (X) and target (y). The target is income.
#
# HINT: use the pandas .drop() function
#

X = ...
y = ...

# TODO: Split X and y into train and test sets with train_test_split. Use a random state of 123. 30% of the data should be in the test set
# Then print out X_train and y_train below
#
# HINT: train_test_split returns X_train, X_test, y_train, y_test in that order
# ...

## Identifying Feature Types
Fill in the lists below to sort the columns into numerical, ordinal, nominal/needing one-hot encoding, and containing missing values.

In [24]:
# TODO: Fill in the lists below with the appropriate feature names
# The features are 'age', 'workclass', 'education', 'marital-status', 'occupation',
# 'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', and 'native-country'
# For any ordinal features, also add a list with the ordered categories
#
# HINT: Features can be in one, many, or none of the lists

ordinal_features = ['education', ...]
one_hot_features = []
numerical_features = []
missing_values_features = []

# Education levels in the order the ordinal encoding should use, first to last
ordered_education = [
    'Preschool',
    '1st-4th', '5th-6th', '7th-8th',
    '9th', '10th', '11th', '12th',
    'HS-grad', 'Some-college',
    'Assoc-voc', 'Assoc-acdm',
    'Bachelors', 'Masters', 'Prof-school', 'Doctorate',
]

## Making a ColumnTransformer
Design a ColumnTransformer to handle each category of column.

In [25]:
# TODO: Make a ColumnTransformer to preprocess the data
#
# HINT: Think about which groups of features are being modified by which preprocessing steps and in which order
# HINT: as in Section 1, pair each preprocessor (or a make_pipeline of several) with one of the feature lists
#

ct = ...

## Applying the ColumnTransformer
Now use the `ColumnTransformer` with `X_train` and `X_test`. Remember to use `fit_transform()` on the _training_ data and `transform()` on the _testing_ data to avoid breaking the golden rule. Then, try fitting and scoring a model with the transformed data, and tune the hyperparameters to optimize the score.

In [26]:
# TODO: Use the ColumnTransformer to apply all the preprocessing steps to X_train and X_test
#
# HINT: use fit_transform() on the training data and only transform() on the testing data

X_train_transformed = ...
X_test_transformed = ...

In [27]:
# Since preprocessors generally return NumPy arrays instead of pandas DataFrames, convert to DataFrame for easier reading
# TODO: Uncomment the lines below to see your preprocessed data as DataFrames
# X_train_transformed = pd.DataFrame(data=X_train_transformed, columns=ct.get_feature_names_out(), index=X_train.index)
# X_test_transformed = pd.DataFrame(data=X_test_transformed, columns=ct.get_feature_names_out(), index=X_test.index)
# X_train_transformed

In [None]:
# TODO: Train and score a classification model of your choice on the preprocessed data
# Try DecisionTreeClassifier, LogisticRegression, or KNeighborsClassifier
#
# HINT: Make sure that you're using the transformed X_train and X_test
# ...