In [1]:
# ... your code here ... (import statements)
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import linear_model

## 1. Feature engineering (one-hot encoding and data imputation)

### Read the data from [http://www.stat.wisc.edu/~jgillett/451/data/kaggle_titanic_train.csv](http://www.stat.wisc.edu/~jgillett/451/data/kaggle_titanic_train.csv).
- Retain only these columns: Survived, Pclass, Sex, Age, SibSp, Parch.
- Display the first 7 rows.

These data are described at [https://www.kaggle.com/competitions/titanic/data](https://www.kaggle.com/competitions/titanic/data) (click on 

We evaluate how these strategies can improve model performance by allowing us to use columns with categorical or missing data.)

In [2]:
# ... your code here ...
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/kaggle_titanic_train.csv', engine='python')

feature_names = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
df = df[feature_names]

print('First 7 Rows: ')
print(df.loc[:6,:])

First 7 Rows: 
   Survived  Pclass     Sex   Age  SibSp  Parch
0         0       3    male  22.0      1      0
1         1       1  female  38.0      1      0
2         1       3  female  26.0      0      0
3         1       1  female  35.0      1      0
4         0       3    male  35.0      0      0
5         0       3    male   NaN      0      0
6         0       1    male  54.0      0      0


### Try to train a $k$NN model to predict $y=$ 'Survived' from $X=$ these features: 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch'.
- Use $k = 3$ and the (default) euclidean metric.
- Notice at the bottom of the error message that it fails with the error "ValueError: could not convert string to float: 'male'".
- Comment out your .fit() line so the cell can run without error.

In [3]:
feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
X = df[feature_names].to_numpy()
y = df['Survived'].to_numpy()

# kNN
k = 3
clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
# clf.fit(X, y)

### Try to train again, this time without the 'Sex' feature.
- Notice that it fails because "Input contains NaN".
- Comment out your .fit() line so the cell can run without error.
- Run `X.isna().any()` (where X is the name of your DataFrame of features) to see that
  the 'Age' feature has missing values. (You can see the first missing value in
  the sixth row that you displayed above.)

In [1]:
feature_names = ['Pclass', 'Age', 'SibSp', 'Parch']
X = df[feature_names].to_numpy()
y = df['Survived'].to_numpy()

k = 3
clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
# clf.fit(X, y)
df[feature_names].isna().any()

NameError: name 'df' is not defined

### 1d. Train without the 'Sex' and 'Age' features.
- Report accuracy on the training data with a line of the form
  `Accuracy on training data is  0.500` (0.500 may not be correct).

In [5]:
# ... your code here ...
feature_names = ['Pclass', 'SibSp', 'Parch']
X = df[feature_names].to_numpy()
y = df['Survived'].to_numpy()

k = 3
clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
clf.fit(X, y)
print(f'Accuracy on training data is {clf.score(X, y):.3}')

Accuracy on training data is 0.664


### 1e.  Use one-hot encoding
to include a binary 'male'  feature made from the 'Sex' feature. (Or include a binary 'female'
feature, according to your preference. Using both is unnecessary since either is the logical
negation of the other.) That is, train on these features: 'Pclass', 'SibSp', 'Parch', 'male'.
- Use pandas's df.join(pd.get_dummies())`.
- Report training accuracy as before.

In [6]:
# ... your code here ...
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/kaggle_titanic_train.csv', engine='python')
feature_names = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
df = df[feature_names]

df = df.join(pd.get_dummies(df.Sex, drop_first=True))

feature_names = ['Pclass', 'SibSp', 'Parch', 'male']
X = df[feature_names].to_numpy()
y = df['Survived'].to_numpy()

k = 3
clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
clf.fit(X, y)
print(f'Accuracy on training data is {clf.score(X, y):.3}')

Accuracy on training data is 0.744


### 1f. Use data imputation
to include an 'age' feature made from 'Age' but replacing each missing value with the median
of the non-missing ages. That is, train on these features: 'Pclass', 'SibSp', 'Parch', 'male',
'age'.

- Report training accuracy as before.

In [7]:
# ... your code here ...
imp = SimpleImputer(missing_values=np.nan, strategy='median', fill_value=None)
df['age'] = imp.fit_transform(df.Age.to_numpy().reshape(-1,1)) 

feature_names = ['Pclass', 'SibSp', 'Parch', 'male', 'age']
X = df[feature_names].to_numpy()
y = df['Survived'].to_numpy()

k = 3
clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
clf.fit(X, y)
print(f'Accuracy on training data is {clf.score(X, y):.3}')

Accuracy on training data is 0.863


## 2. Explore model fit, overfit, and regularization in the context of multiple linear regression

### 2a. Prepare the data:
- Read [http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv](http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv) into a DataFrame.
- Set a variable `X` to the subset consisting of all columns except `mpg`.
- Set a variable `y` to the `mpg` column.
- Use `train_test_split()` to split `X` and `y` into `X_train`, `X_test`, `y_train`, and `y_test`.
  - Reserve half the data for training and half for testing.
  - Use `random_state=0` to get reproducible results.

In [8]:
# ... your code here ...
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/mtcars.csv', engine='python', index_col=0)
df = df.rename({'Unnamed: 0': 'brand'}, axis='columns')

X=df.drop(columns=['mpg']).to_numpy()
y = df.mpg.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

### 2b. Train three models on the training data and evaluate each on the test data:
- `LinearRegression()`
- `Lasso()`
- `Ridge()`

The evaluation consists in displaying MSE$_\text{train}, $ MSE$_\text{test}$, and the coefficients $\mathbf{w}$ for each model.

In [9]:
# ... your code here ...
models = [linear_model.LinearRegression(), linear_model.Lasso(), linear_model.Ridge()]
df = pd.DataFrame(columns=['model', 'MSE_train', 'MSE_test', '.intercept_', '.coef_'])

for model in models:
    model.fit(X_train, y_train)
    MSE_train = (1/y_train.size) * np.sum((y_train - model.predict(X_train))**2)
    MSE_test = (1/y_test.size)  * np.sum((y_test - model.predict(X_test))**2)
    df = df.append(pd.DataFrame({'model': model, 'MSE_train': MSE_train,
                                 'MSE_test': MSE_test, '.intercept_': model.intercept_,
                                 # I'm not showing .coef_[0], always 0 here, absorbed by .intercept_
                                 '.coef_': [np.round(model.coef_[1:], 2)]}), # round for display
                   ignore_index=True)
pd.set_option('display.precision', 2)
print(df)

                model  MSE_train  MSE_test  .intercept_  \
0  LinearRegression()       0.39     30.23       -70.30   
1             Lasso()       5.69     12.99        33.80   
2             Ridge()       1.99     11.20         7.25   

                                              .coef_  
0  [0.03, 0.03, 3.13, -7.34, 3.93, -4.09, -1.22, ...  
1  [-0.04, -0.02, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, ...  
2  [-0.01, 0.0, 0.79, -3.22, 1.1, -0.48, 0.47, 1....  


  df = df.append(pd.DataFrame({'model': model, 'MSE_train': MSE_train,
  df = df.append(pd.DataFrame({'model': model, 'MSE_train': MSE_train,
  df = df.append(pd.DataFrame({'model': model, 'MSE_train': MSE_train,


### 2c. Answer a few questions about the models:
- Which one best fits the training data?
- Which one best fits the test data?
- Which one does feature selection by setting most coefficients to zero?- 

# ... your answers here in a markdown cell ...
1. Linear Regression fits the training data the best
2. Ridge Regression fits the test data the best
3. Lasso Regression did feature selection by setting most coefficients to zero