In [None]:
# Library required for failed attempt
# !pip install dirty_cat

## Question 1
### (a) Explain random forest technique.
Random forest is a machine learning technique for classification, regression, and other tasks that operates by constructing **multiple decision trees** (_each trained on randomly sampled training data and a random subset of the features_) during the training stage and outputting the **mode of the classes** predicted by the individual trees (for _classification_ problems), or the **mean of the predicted values** (for _regression_ problems).

### (b) Which types of dataset benefit more from this technique?
1. It works better for classification problems since that is what decision trees are good at
2. It works better (than linear regression) when there is no strong linear relation between the features and the target variable
3. It works better when the dataset is large
4. It works better with high dimensional data
5. It also works well with unbalanced data sets

### \(c\) Can you use it for dimensionality reduction? If yes, how? (no coding required)

Yes. One possible approach is to generate a large number of shallow trees with small subsets of the features against the same target value. The features that frequently get high ranking are more likely to be important.

## Question 2

In [None]:
# Import libraries in a separate cell for autocompletion
import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
# import matplotlib.pyplot as plt

import os

In [None]:
# Load the complete Craigslist vehicles CSV into memory in one go
DATA_PATH = '/kaggle/input/craigslistVehiclesFull.csv'
raw_data = pd.read_csv(DATA_PATH)

### (a) Extract Los Angeles information

In [None]:
# The listing and image URLs are never fetched in this project, so discard them
df = raw_data.drop(columns=['url', 'image_url'])
# The raw Vehicle Identification Number is not used either: while it is very
# useful in real life, only 22.7% of the entries have it in our dataset
# (LA only), and writing a parser that maps it to the corresponding fields is
# time-consuming and out of scope for this midterm.
# Instead, `vin` is replaced by a boolean flag that is True when the VIN is
# MISSING, so we can still explore whether listing a VIN relates to the price.
df['vin'] = df['vin'].isna()
# Keep only the rows whose `city` equals `losangeles`
df = df.loc[df['city'] == 'losangeles']
# The region is now fixed, so every geolocation column (any column whose name
# contains "state", "county" or "city") carries no information
geo_columns = df.filter(regex='state|county|city').columns.tolist()
# Drop them and renumber the rows from 0 for easier processing
df = df.drop(columns=geo_columns).reset_index(drop=True)

### (b) Reduce the number of attributes to those which are more important in price prediction (has the most variance with respect to price)

In [None]:
# Pearson correlation of every numeric/boolean column with `price`.
# The non-numeric columns are filtered out explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises on string data,
# whereas older versions ignored them. Selecting the dtypes up front gives the
# same result on both.
# Rows with any missing value are removed first (as before), so the
# correlations are computed on complete listings only.
numeric_complete = df.dropna().select_dtypes(include=['number', 'bool'])
cor = numeric_complete.corr()
# Only the strength of the relation matters here, not its direction
cor_target = cor['price'].abs()
cor_target.sort_values(ascending=False)

It's interesting to see that longitude (`long`) has a comparatively strong correlation with the price. Although all of the correlations are relatively weak, based on the data above, the columns `lat` and `weather` can be dropped since their correlations with the price are too weak.

In [None]:
# Remove the numerical columns that showed the weakest correlation with price
weak_numeric_columns = ['weather', 'lat']
df.drop(columns=weak_numeric_columns, inplace=True)

However, the correlation values only give us information about numerical columns. Since we also want to select categorical columns, we decided to compute mutual information and perform an ANOVA test.

In [None]:
# Temporarily drop rows containing NaN; this copy is used for feature selection only
_df = df.dropna()
# Target: the listing price
Ytrain = _df['price']
# Features: one {column: value} dict per remaining row
Xtrain = _df.drop(columns=['price']).T.to_dict().values()
# Vectorize the row dicts (this is what turns the categorical values into
# columns) and learn the feature names in a single step
dv = DictVectorizer()
X_vec = dv.fit_transform(Xtrain)

In [None]:
# Mutual information between every vectorized feature and the price.
# NOTE(review): `price` is a continuous target, but mutual_info_classif treats
# each distinct price as its own class; mutual_info_regression may be the more
# appropriate estimator -- confirm before relying on this ranking.
feature_scores = mutual_info_classif(X_vec, Ytrain)

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() (added in 1.0); use whichever this version has.
feature_names = (
    dv.get_feature_names_out()
    if hasattr(dv, 'get_feature_names_out')
    else dv.get_feature_names()
)

# Show the ten highest-scoring features
for score, fname in sorted(zip(feature_scores, feature_names), reverse=True)[:10]:
    print(fname, score)

In [None]:
# ANOVA F-statistic of every vectorized feature against the price
# (f_classif returns (F, p-values); only the F-statistics are kept).
feature_scores = f_classif(X_vec, Ytrain)[0]

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() (added in 1.0); use whichever this version has.
feature_names = (
    dv.get_feature_names_out()
    if hasattr(dv, 'get_feature_names_out')
    else dv.get_feature_names()
)

# Discard non-finite scores BEFORE sorting: infinite F-values were already
# skipped, and NaN values would additionally make the sort order unreliable.
finite_scores = [
    (score, fname)
    for score, fname in zip(feature_scores, feature_names)
    if np.isfinite(score)
]

# Show the ten highest-scoring features
for score, fname in sorted(finite_scores, reverse=True)[:10]:
    print(fname, score)

We can see that \[odometer, long, year, condition, drive, type\] are some of the most related features based on mutual information (`vin` won't be included since it was crafted for exploration only, as mentioned before). While the `make` column also provides important information based on the ANOVA test result, it is too dirty to use directly. Therefore, we will select the above most significant columns, and preserve `make` for further processing.

In [None]:
# Target (`price`), the features selected above, and `make`, which is kept
# for further processing
selected_columns = [
    'price',
    'odometer', 'long', 'year',
    'condition', 'drive', 'type',
    'make',
]
df_lite = df[selected_columns]

### (c) Deal with lost data and outliers (Data cleaning)
Just to be safe, we decided to start only with rows that do not have missing values. We also discard listings priced at 100 or less, which we treat as outliers.

In [None]:
# Keep complete rows only, discard listings priced at 100 or less, and
# renumber the remaining rows from 0
df_cleaned = (
    df_lite
    .dropna()
    .loc[lambda frame: frame['price'] > 100]
    .reset_index(drop=True)
)

### (d) Shuffle the dataset and split it into a training set and test set (e.g. 80% training 20% test)

In [None]:
# One-hot encode the categorical columns; the `make` column is left out for now
without_make = df_cleaned.drop(columns=['make'])
encoded = pd.get_dummies(without_make)

# Separate the target from the features
y = encoded['price']
X = encoded.drop(columns=['price'])

# Shuffled 80% / 20% split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=569,
)

### (e) Use your favourite machine learning language and packages and train a decision tree to predict a used car price. Be aware of overfitting and prune your tree if needed.
Tuning the `max_depth` and `min_samples_split` should help reduce overfitting. Therefore, we can perform a grid search to find the best parameter set.

In [None]:
# Hyper-parameter grid used to prune the tree:
# 17 * 18 * 19 * 2 = 11,628 candidates, each evaluated with 10-fold CV.
parameters = {
    'max_depth':range(3,20),
    'min_samples_split':range(2,20),
    'min_samples_leaf':range(1,20),
    'max_features':[None, 'sqrt']
}
# The tree is seeded (same seed as the train/test split) because candidate
# features are sampled at random when max_features='sqrt'; without a seed the
# best score and best parameters can change from one run to the next.
reg = GridSearchCV(
    DecisionTreeRegressor(random_state=569),
    parameters,
    cv=10,
    n_jobs=4,
)
reg.fit(X=X_train, y=y_train)
# Best candidate, refitted on the whole training set by GridSearchCV
tree_model = reg.best_estimator_
print(reg.best_score_, reg.best_params_)

### (f) Report the training performance using 10-fold cross validation.

In [None]:
# Per-candidate mean and standard deviation of the 10 validation-fold scores
cv_results = reg.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']

# Report the winning candidate only: mean score with a +/- 2 std band
best = reg.best_index_
mean, std, params = means[best], stds[best], cv_results['params'][best]
print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

### (g) Report the testing performance.

In [None]:
# Score the tuned tree on the held-out test set
modelPred = tree_model.predict(X_test)
mse = mean_squared_error(y_test, modelPred)
rmse = np.sqrt(mse)  # same unit as the price

for label, value in (('MSE: ', mse), ('RMSE:', rmse)):
    print(label, value)

Apparently, the performance is not great. One possible reason is that we discarded the `make` and `vin` columns for simplicity. Since the model of the vehicle should be a major factor in determining the price, losing this information could hurt the performance considerably.

In the future, we can attempt to write a VIN parser to reliably extract vehicle information and populate the columns accordingly. We can also try to use NLP techniques to extract keywords from the `make` column to make it cleaner.

### Appendix: Failed Attempts with Similarity Encoding
In order to make use of the `make` column, we attempted to use similarity encoding to deal with the dirty categorical data. However, it seems that scikit-learn changed the function signature of `_check_X` in `OneHotEncoder` class, which is used by the `dirty_cat` library. Attempting to bypass the said function forcibly resulted in no success.

```python
# Columns that are used as they are, mapped to the key of the encoder (in
# `encoders_dict` below) that should be applied to each of them.
clean_columns = {
    'odometer': 'numerical',
    'long': 'numerical',
    'year': 'numerical',
    'condition': 'one-hot',
    'vin': 'numerical',
    'drive': 'one-hot',
    'type': 'one-hot'
}

# Encoders to compare on the dirty column, and the dirty column itself.
encoding_methods = ['one-hot', 'target', 'similarity']
dirty_column = 'make'

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from dirty_cat import SimilarityEncoder, TargetEncoder

# Encoder instances looked up by key. 'numerical' uses a FunctionTransformer
# without a function, i.e. the values are passed through unchanged.
encoders_dict = {
    'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'numerical': FunctionTransformer(None, validate=True)}

def make_pipeline(encoding_method):
    """Build the model pipeline for one encoding of the dirty column.

    Every column in `clean_columns` gets the encoder assigned to it there,
    and `dirty_column` gets the encoder registered under `encoding_method`
    in `encoders_dict`. The combined features are scaled (without centering)
    and fed to a decision tree regressor.

    Parameters
    ----------
    encoding_method : str
        Key of `encoders_dict` naming the encoder for `dirty_column`.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps 'union', 'scaler' and 'clf'.
    """
    # One fixed transformer per clean column, named "<encoder>_<column>"
    column_steps = []
    for col, enc in clean_columns.items():
        column_steps.append((enc + '_' + col, encoders_dict[enc], [col]))
    # The dirty column is handled by the encoder under evaluation
    column_steps.append(
        (encoding_method, encoders_dict[encoding_method], [dirty_column]))

    # ColumnTransformer combines the per-column encodings; any column not
    # listed above is dropped
    combine_features = ColumnTransformer(
        transformers=column_steps,
        remainder='drop')

    return Pipeline([
        ('union', combine_features),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', DecisionTreeRegressor()),
    ])
```

```python
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Shuffled, seeded 10-fold splitter shared by every encoding method
scoring = 'r2'
cv = KFold(n_splits=10, shuffle=True, random_state=569)

# Cross-validated scores of each encoding method, keyed by method name
all_scores = {}
for method in encoding_methods:
    pipeline = make_pipeline(method)
    scores = cross_val_score(pipeline, X, y, scoring=scoring, cv=cv)
    all_scores[method] = scores
    print('{} encoding'.format(method))
    print('{} score:  mean: {:.3f}; std: {:.3f}\n'.format(
        scoring, np.mean(scores), np.std(scores)))
```