In [1]:
# Load IPython's autoreload extension so edits to imported local modules
# (such as cleaner) are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2
# Select matplotlib's interactive notebook backend.
# NOTE(review): no plotting call is visible in this part of the notebook - confirm this is still needed.
%matplotlib notebook

In [2]:
from cleaner import new_clean
import pandas as pd
import numpy as np

## Reading and Preparing Train Data

The modifications from Approach 2 to Approach 3, as described in the report, are all handled in our cleaner function *new_clean()*.

In [3]:
# Load the raw train and test CSVs once; both are passed to new_clean() in later cells.
df_init, test_init = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

## Train LightGBM model

In [4]:
# Clean both raw frames with the project's cleaner; the test frame is flagged with is_test=True.
df, test = new_clean(df_init), new_clean(test_init, is_test=True)

In [5]:
# Target is the 'price' column; every remaining column is used as a feature.
Y = df['price']
X = df.drop(columns=['price'])

In [6]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [7]:
# Hold out 15% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
)

In [8]:
# LightGBM regressor; hyperparameter values are the same as before.
model = lgb.LGBMRegressor(
    boosting_type="gbdt",
    num_iterations=2500,
    learning_rate=0.05,
    num_leaves=15,
    tree_learner='feature',
    max_depth=10,
    min_data_in_leaf=7,
    bagging_fraction=1,
    bagging_freq=100,
    reg_sqrt=True,  # boolean flag rather than the string 'True'
    metric='rmse',
    feature_fraction=0.6,
    random_state=42)

model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
preds = model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and later removed, so it fails on current versions.
rmse_lgb = mean_squared_error(y_test, preds) ** 0.5
print(" RMSE: %f" % (rmse_lgb))



 RMSE: 23222.070367


## Train CatBoost model

In [9]:
# Re-run the cleaner with is_catboost=True for both frames; this replaces the
# `df` / `test` produced for the LightGBM section.
df, test = (
    new_clean(df_init, is_catboost=True),
    new_clean(test_init, is_test=True, is_catboost=True),
)

In [10]:
# Separate the 'price' target from the feature columns of the CatBoost-ready frame.
Y = df['price']
X = df.drop(columns=['price'])

In [11]:
# Names of all columns in `df` whose pandas dtype is 'category'; this list is
# handed to CatBoost as cat_features. Built directly from the dtype index
# instead of an append loop with an unused loop variable.
categorical_features = df.dtypes[df.dtypes == 'category'].index.tolist()

In [12]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [13]:
# Same 85/15 split and seed as the LightGBM section, applied to the CatBoost-ready features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
)

In [14]:
# CatBoost regressor; hyperparameter values are the same as before.
# verbose=100 prints a progress line every 100 iterations.
model = CatBoostRegressor(
    cat_features=categorical_features,
    loss_function='RMSE',
    random_strength=1,
    verbose=100,
    iterations=1000,
    l2_leaf_reg=3,
    depth=4,
    learning_rate=0.5,
    random_seed=111)

model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
preds = model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and later removed, so it fails on current versions.
rmse_cbr = mean_squared_error(y_test, preds) ** 0.5
print(" RMSE: %f" % (rmse_cbr))

0:	learn: 94097.6621829	total: 51.3ms	remaining: 51.2s
100:	learn: 15217.9810469	total: 578ms	remaining: 5.14s
200:	learn: 11164.1290875	total: 1.12s	remaining: 4.44s
300:	learn: 9174.6985334	total: 1.57s	remaining: 3.64s
400:	learn: 7909.3686450	total: 2.04s	remaining: 3.04s
500:	learn: 6903.0882772	total: 2.53s	remaining: 2.52s
600:	learn: 6219.6013234	total: 3.12s	remaining: 2.07s
700:	learn: 5727.4096548	total: 3.66s	remaining: 1.56s
800:	learn: 5313.3444763	total: 4.18s	remaining: 1.04s
900:	learn: 4971.5039324	total: 4.67s	remaining: 513ms
999:	learn: 4691.4667330	total: 5.17s	remaining: 0us
 RMSE: 21195.413294


### Predict Values for Test Data

In [15]:
# Predict from the training feature columns only, so re-running this cell after
# 'Predicted' / 'id' have been added to `test` does not feed those columns to the model.
test['Predicted'] = model.predict(test[X.columns])

In [16]:
# Expose the frame's index as an explicit 'id' column for the submission file.
test = test.assign(id=test.index)

In [17]:
# Round every prediction to the nearest 100 with a vectorised Series operation
# instead of a per-row Python lambda. Series.round rounds halves to even, as the
# built-in round() does, so the rounded values are unchanged.
test['Predicted'] = (test['Predicted'] / 100.0).round() * 100.0

In [18]:
# Keep only the two columns that go into the submission file, in this order.
submission = test.loc[:, ['id', 'Predicted']]

In [19]:
# Display the submission frame as the cell output (pandas truncates the middle rows).
submission

Unnamed: 0,id,Predicted
0,0,167500.0
1,1,275200.0
2,2,227400.0
3,3,155400.0
4,4,63600.0
...,...,...
4995,4995,73000.0
4996,4996,168700.0
4997,4997,56400.0
4998,4998,75300.0


In [20]:
# Write the submission to disk without the index column.
submission.to_csv(path_or_buf='data/submission.csv', index=False)