In [1]:
# Load the autoreload extension and set it to mode 2, so imported modules are
# re-imported before each cell executes (edits to the local .py files are picked
# up without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Select the interactive "notebook" matplotlib backend.
# NOTE(review): no plotting code is visible in the cells shown here — confirm this is still needed.
%matplotlib notebook

In [2]:
%%capture
# Move the working directory up one level; %%capture hides the printed path.
# NOTE(review): presumably this makes the project modules imported in the next
# cell resolvable — confirm against the repository layout.
# The path is relative, so re-running this cell moves up one more directory:
# execute it only once per kernel session.
cd ..

In [3]:
import pandas as pd
import numpy as np

from cleaner import clean_preliminary, to_categorical_for_cols, remove_nominal_cols, clean_sim_filled_data
from predictor import fill_ml_na
import constants as const
from utils import round_to_nearest_hundred, preds_to_csv

## Reading and Preparing Train Data

The series of functions applied to the training data prepares the data as described in Approach 2 of our report. We briefly describe the purpose of each function below: <br>
- *clean_preliminary()*:
    - Handles each column individually
    - Drops the unwanted rows and columns
- *to_categorical_for_cols()*:
    - Encodes *fuel_type* using dummy variables and *category* using MultiLabelBinarizer()
- *remove_nominal_cols()*:
    - Removes columns that are not needed for further analysis.

In [4]:
# Read the raw train and test splits from the CSV locations configured in `constants`.
df, test = (
    pd.read_csv(csv_path)
    for csv_path in (const.TRAIN_PATH, const.TEST_PATH)
)

In [5]:
def _prepare(frame, **clean_kwargs):
    """Run the three preparation steps on one split and return the result.

    `clean_kwargs` are forwarded unchanged to `clean_preliminary`
    (the test split passes `is_test=True`).
    """
    cleaned = clean_preliminary(frame, **clean_kwargs)
    encoded = to_categorical_for_cols(cleaned)
    return remove_nominal_cols(encoded)


# Both splits go through the same clean -> encode -> drop-columns sequence.
df = _prepare(df)
test = _prepare(test, is_test=True)

### Filling Null Values
We have two ways to fill the null values. Both approaches take a long time to run, and the saved ML models are too large to push to the GitHub repository. We therefore save the resulting data after applying each approach.

#### Machine Learning Based Approach
The function *fill_ml_na()* trains machine learning models to predict the missing values for each column. In practice, we use RandomizedSearchCV with 200 iterations and 5-fold cross-validation. <br>
Since the trained models are heavy, we do not include them in the repository. You can run the code cell below, where we have set the number of iterations to 1 and the number of cross-validation folds (k) to 2, for a quick sample run.

In [6]:
# Reduced search budget for a quick demonstration run (1 iteration, 2 CV splits).
# Calling fill_ml_na(df) with no extra arguments uses the function's own defaults instead.
demo_search = {"training": True, "num_iter": 1, "k_splits": 2}
filled_train = fill_ml_na(df, **demo_search)

# The test split is filled with a plain call (no training arguments).
filled_test = fill_ml_na(test)

Alternatively, you can use the below code cell to access the training data we have saved after filling null values using the ML models.

In [53]:
# Load the train/test frames saved after filling nulls with the ML-based approach.
filled_train, filled_test = (
    pd.read_csv(saved_csv)
    for saved_csv in (const.ML_REPLACED_TRAIN, const.ML_REPLACED_TEST)
)

#### Similarity Based Approach


You can generate the similarity filled data from scratch by running the `generate_sim_df.py` file.

Alternatively, you can use the below code cell to access the training data we have saved after filling null values using the similarity approach.

In [43]:
# Load each similarity-filled split and pass it straight through its clean-up step.
filled_train = clean_sim_filled_data(pd.read_csv(const.SIM_REPLACED_TRAIN))
filled_test = clean_sim_filled_data(
    pd.read_csv(const.SIM_REPLACED_TEST),
    is_test=True,
)

<br> 

___Since the data is handled differently by the two regressors, please reload the data from above after you are done using one regressor.___

## Train Model using GradientBoostingRegressor

In [32]:
# Training rows that still contain nulls are discarded.
# Reassigning (rather than mutating with inplace=True) keeps the cell's effect
# explicit and avoids hidden in-place changes to a frame loaded by an earlier cell.
filled_train = filled_train.dropna()

# Test rows are kept (presumably every test row needs a prediction — confirm);
# their remaining nulls are filled with the per-column median of the test split.
# numeric_only=True keeps this working on pandas >= 2.0, where median() raises
# a TypeError if any non-numeric column is present.
filled_test = filled_test.fillna(filled_test.median(numeric_only=True))

In [33]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [34]:
# Feature matrix: every column except 'index' and the 'price' target.
X = filled_train.drop(columns=['index', 'price'])
# Target vector.
Y = filled_train['price']

In [35]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Fit a gradient-boosting regressor with default hyper-parameters.
# random_state is pinned (same seed as the train/validation split) so repeated
# runs of this cell give the same model; without it the score printed below
# may not be reproducible from run to run.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out validation rows, scored in the next cell.
pred = model.predict(X_test)

In [37]:
# Validation RMSE. The square root is taken explicitly because the `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6; this form works on every version.
np.sqrt(mean_squared_error(y_test, pred))

27012.532713076296

## Train LightGBM model

In [54]:
# Rebuild features and target for the LightGBM run from the (re)loaded frame:
# all columns except 'index' and 'price' are features, 'price' is the target.
X = filled_train.drop(columns=['index', 'price'])
Y = filled_train['price']

In [55]:
import lightgbm as lgb

In [56]:
# Hold out 15% of the rows for validation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
)

In [57]:
# LightGBM regressor (gradient-boosted decision trees) with hand-picked settings.
model = lgb.LGBMRegressor(
    boosting_type="gbdt",
    num_iterations=2500,
    learning_rate=0.05,
    num_leaves=15,
    tree_learner='feature',
    max_depth=10,
    min_data_in_leaf=7,
    # NOTE(review): with bagging_fraction=1 no rows are sub-sampled, so
    # bagging_freq presumably has no effect — confirm against the LightGBM docs.
    bagging_fraction=1,
    bagging_freq=100,
    # Passed as a real boolean rather than the string 'True'.
    reg_sqrt=True,
    metric='rmse',
    feature_fraction=0.6,
    random_state=42)

model.fit(X_train, y_train)

# Score on the held-out validation rows. The square root is taken explicitly
# because mean_squared_error's `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6.
preds = model.predict(X_test)
rmse_lgb = np.sqrt(mean_squared_error(y_test, preds))
print(" RMSE: %f" % (rmse_lgb))



 RMSE: 28105.328831


### Predict Values for Test Data

In [60]:
# Predict on the test split with the most recently fitted `model`;
# 'index' is dropped because it was also excluded from the training features.
predicted_raw = model.predict(filled_test.drop(['index'], axis=1))

# model.predict returns a NumPy array, which has no .apply method, so wrap it
# in a Series before applying round_to_nearest_hundred to each prediction.
preds = pd.Series(predicted_raw).apply(round_to_nearest_hundred).to_numpy()

# Hand the rounded predictions to the project helper that writes the CSV.
preds_to_csv(preds)