In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import warnings
# Silence every warning category for the rest of the session. This keeps cell
# output tidy, but it also hides deprecation/future warnings raised by the
# libraries used in later cells.
warnings.filterwarnings("ignore")

In [None]:
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Peek at ten randomly chosen training rows.
train.sample(n=10)

In [None]:
# Work with the natural log of the price; the raw Price column stays in `train`.
train['logPrice'] = np.log(train['Price'])

In [None]:
# Distribution of the log-transformed price.
sns.displot(train.logPrice)

In [None]:
# Capping bounds for the log price: the 2% quantile below and the 99% quantile
# above (the two tails are deliberately not symmetric).
lowPrice, highPrice = np.quantile(train['logPrice'], [0.02, 0.99])
print(lowPrice, highPrice)

In [None]:
# Clamp the log price into [lowPrice, highPrice]; values inside the interval
# are left unchanged. This capped column is the modelling target.
train['cappedLogPrice'] = train['logPrice'].clip(lower=lowPrice, upper=highPrice)

# Histogram of the capped target.
sns.histplot(train['cappedLogPrice'])

In [None]:
# Box plot of the capped log price, to check the effect of the capping above.
sns.boxplot(train.cappedLogPrice)

In [None]:
# List the columns currently present in `train` (including the derived
# logPrice / cappedLogPrice columns added above).
train.columns

In [None]:
# Working copy for feature engineering: drop the row identifier and the two
# uncapped price columns, keeping cappedLogPrice as the target.
df_train = train.drop(['ID', 'Price', 'logPrice'], axis=1)
list(df_train.columns)

In [None]:
# First five rows of the working frame.
df_train.head(n=5)

In [None]:
# Features that are one-hot encoded later on.
cat_cols = [
    'Manufacturer', 'Category', 'Leather interior', 'Fuel type', 'Gear box type',
    'Drive wheels', 'Doors', 'Wheel', 'Color',
]

# Features that are used as numbers (some still need parsing from strings).
num_cols = ['Levy', 'Prod. year', 'Engine volume', 'Airbags', 'Cylinders', 'Mileage']

# Column count of df_train next to the size of each feature group.
print(len(df_train.columns), len(cat_cols), len(num_cols))

In [None]:
# Distinct raw labels of the Doors column, in sorted order.
sorted(train['Doors'].unique())

In [None]:
# The 20 most frequent manufacturers; all other makes are folded into 'Other'
# before one-hot encoding.
# The labels are taken straight from the index of value_counts(): the column
# name produced by reset_index() depends on the pandas version ('index' in
# older releases, the Series name in newer ones), so selecting ['index'] is
# not portable.
top_manufacturers = df_train.Manufacturer.value_counts().head(20).index.to_list()

## Levy and Mileage

In [None]:
# Levy uses '-' as a placeholder; store 0 instead so the column can be cast to
# int in a later cell.
df_train.loc[df_train.Levy == '-', 'Levy'] = 0

# Mileage is parsed from the untouched `train` frame, so this cell can be
# re-run safely. The last three characters of every raw value are dropped
# before parsing (presumably a unit suffix such as ' km' -- confirm against
# the data). Work in float so missing readings can be represented as NaN.
mileage = train.Mileage.apply(lambda x: int(x[:-3])).astype(float)

# A reading of exactly 0 is treated as missing.
mileage = mileage.mask(mileage == 0)

# Impute with the median mileage of cars from the same production year, then
# fall back to the overall mean for years without any valid reading.
mileage = mileage.fillna(mileage.groupby(df_train['Prod. year']).transform('median'))
mileage = mileage.fillna(mileage.mean())

# Write the result back with an explicit column assignment; an in-place fill
# on `df_train.Mileage` is not guaranteed to modify the frame itself.
df_train['Mileage'] = np.log(mileage)

In [None]:
# Inspect the numeric feature columns after cleaning.
df_train.loc[:, num_cols].head(n=5)

In [None]:
# Levy still holds the raw (string) values plus the 0 placeholder; make it integer.
df_train['Levy'] = df_train['Levy'].astype(int)

In [None]:
# Keep only the leading numeric token of the raw engine-volume string; any
# text after the first space is discarded.
# The raw strings are read from the untouched `train` frame (as the Mileage
# cell does), so re-running this cell does not try to split already-parsed
# floats. The result is float, so no further cast is needed.
df_train['Engine volume'] = train['Engine volume'].str.split(' ').str[0].astype(float)

In [None]:
# Fold every manufacturer outside the top-20 list into a single 'Other' level.
is_rare_make = ~df_train['Manufacturer'].isin(top_manufacturers)
df_train.loc[is_rare_make, 'Manufacturer'] = 'Other'

# Number of distinct values per categorical feature after the fold.
for col in cat_cols:
    n_levels = len(df_train[col].unique())
    print(col, ': ', n_levels)

In [None]:
# One-hot encode the categorical features; the first level of each feature is
# dropped and acts as the implicit baseline.
df_dummies = pd.get_dummies(data=df_train[cat_cols], drop_first=True)
df_dummies.tail(n=5)

# Modelling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Design matrix: numeric features followed by the one-hot columns.
# Target: the capped log price.
X = pd.concat((df_train[num_cols], df_dummies), axis='columns')
y = df_train['cappedLogPrice']
X_cols = list(X.columns)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training split only, then apply
# the same transform to the hold-out split. The scaled frames get a fresh
# default index.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_cols)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_cols)

In [None]:
# First five rows of the (unscaled) training split.
X_train.head(n=5)

In [None]:
# Log-mileage distribution per production year on the training split.
fig, ax = plt.subplots(figsize=(16, 4))
sns.boxplot(data=X_train, x='Prod. year', y='Mileage', ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Missing-value count per feature; the tail of the ascending sort shows the
# features with the most missing entries.
df_null = (
    X_train.isna()
    .sum()
    .rename_axis('Feature')
    .reset_index(name='isNull')
)
df_null.sort_values(by='isNull').tail()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Baseline forest (out-of-bag scoring, fixed seed), currently disabled;
# `rf_model` is therefore not defined unless the two lines below are
# uncommented.
#rf_model = RandomForestRegressor(oob_score=True, random_state=42).fit(X_train_scaled, y_train)
#rf_model.score(X_test_scaled, y_test)

# Tuning Random Forest

In [None]:
# Disabled cell: search space for the randomised random-forest tuning.
# NOTE(review): n_estimators runs from 200 down to 100 (start > stop), i.e. ten
# values between 100 and 200 -- confirm this range is intended before
# re-enabling.
# NOTE(review): check that every max_features option is still accepted by the
# installed scikit-learn version before re-enabling.
#from pprint import pprint
#from sklearn.model_selection import RandomizedSearchCV
#
## Number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 100, num = 10)]
## Number of features to consider at every split
#max_features = ['auto', 'sqrt']
## Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#max_depth.append(None)
## Minimum number of samples required to split a node
#min_samples_split = [2, 5, 10]
## Minimum number of samples required at each leaf node
#min_samples_leaf = [1, 2, 4]
## Method of selecting samples for training each tree
#bootstrap = [True, False]# Create the random grid
#random_grid = {'n_estimators': n_estimators,
#               'max_features': max_features,
#               'max_depth': max_depth,
#               'min_samples_split': min_samples_split,
#               'min_samples_leaf': min_samples_leaf,
#               'bootstrap': bootstrap}
#pprint(random_grid)
#

In [None]:
#rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(), 
#                               param_distributions = random_grid, 
#                               n_iter = 100, cv = 3, verbose=2, random_state=42, 
#                               n_jobs = -1)# Fit the random search model
#
#rf_random.fit(X_train_scaled, y_train)

In [None]:
#rf_random.best_estimator_

In [None]:
#rf_random.best_estimator_.score(X_test_scaled, y_test)

In [None]:
# Disabled together with the randomised search above, so `rf_tuned` is not
# defined unless both are re-enabled.
#rf_tuned = rf_random.best_estimator_

In [None]:
# Random forest with fixed hyperparameters, so the expensive randomised search
# does not have to be re-run. The seed is pinned (same value as the train/test
# split) because feature sub-sampling with max_features='sqrt' is random and
# the score would otherwise change from run to run.
rf_best = RandomForestRegressor(
    bootstrap=False,
    max_depth=80,
    max_features='sqrt',
    min_samples_split=5,
    n_estimators=133,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score on the hold-out split.
rf_best.score(X_test_scaled, y_test)

In [None]:
from sklearn.metrics import mean_squared_log_error
# Root mean squared log error on the price scale: targets and predictions are
# both on the log scale, so they are exponentiated before scoring.
actual_price = np.exp(y_test)
predicted_price = np.exp(rf_best.predict(X_test_scaled))
np.sqrt(mean_squared_log_error(actual_price, predicted_price))

In [None]:
# Pair each feature name with its importance in the fitted forest and rank
# from most to least important.
df_importance = (
    pd.DataFrame({'Feature': X_cols, 'Importance': rf_best.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Bar chart of the 20 most important features.
sns.barplot(data=df_importance.head(20), x='Importance', y='Feature')

---


# Stacked ensemble (CatBoost, KNN, LightGBM, SVR)

In [None]:
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from mlxtend.regressor import StackingRegressor

# Stacked ensemble: four base regressors whose predictions feed a CatBoost
# meta-regressor. With use_features_in_secondary=True the meta-regressor is
# given the original features in addition to the base predictions.
# Both CatBoost instances run with verbose=0 so that fitting the stack does
# not print a per-iteration training log.
stack_gen = StackingRegressor(
    regressors=(
        CatBoostRegressor(verbose=0),
        KNeighborsRegressor(),
        LGBMRegressor(),
        SVR(),
    ),
    meta_regressor=CatBoostRegressor(verbose=0),
    use_features_in_secondary=True,
)

In [None]:
# Fit the stacked ensemble on the scaled training split and report its score
# on the scaled hold-out split.
stack_model = stack_gen.fit(X_train_scaled, y_train)
stack_model.score(X_test_scaled, y_test)

In [None]:
# Hold-out score of the fitted stack (same expression as in the fitting cell).
stack_model.score(X_test_scaled, y_test)

In [None]:
from catboost import Pool

# Fit a stand-alone CatBoost model and ask it for pairwise feature-interaction
# strengths on the training data.
cat_model = CatBoostRegressor(verbose=0).fit(X_train_scaled, y_train)
train_data = Pool(X_train_scaled)
interaction = pd.DataFrame(
    cat_model.get_feature_importance(train_data, type="Interaction"),
    columns=["feature1", "feature2", "importance"],
)

# The two feature columns hold positional indices; translate them to names.
column_names = X_train_scaled.columns.values
for side in ("feature1", "feature2"):
    interaction[side] = interaction[side].apply(lambda idx: column_names[int(idx)])

interaction.head(20)

In [None]:
# First five rows of the raw test table.
test.head(n=5)

In [None]:
# Apply the training-time preprocessing to the test table. `df_test` is
# rebuilt from `test` at the top, so the whole cell can be re-run.
df_test = test.drop(columns='ID')

# Same '-' placeholder handling as for the training Levy column.
df_test.loc[df_test.Levy == '-', 'Levy'] = 0

# Rewrite the zero-padded door labels to the unpadded spelling
# (presumably the spelling used in the training data -- confirm).
df_test.loc[df_test.Doors == '04-May', 'Doors'] = '4-May'
df_test.loc[df_test.Doors == '02-Mar', 'Doors'] = '2-Mar'

# Mileage: drop the 3-character suffix, treat 0 as missing, impute with the
# per-production-year median (overall mean as fallback), then take the log.
# The result is written back with an explicit column assignment; an in-place
# fill on `df_test.Mileage` is not guaranteed to modify the frame itself.
test_mileage = df_test.Mileage.apply(lambda x: int(x[:-3])).astype(float)
test_mileage = test_mileage.mask(test_mileage == 0)
test_mileage = test_mileage.fillna(
    test_mileage.groupby(df_test['Prod. year']).transform('median')
)
test_mileage = test_mileage.fillna(test_mileage.mean())
df_test['Mileage'] = np.log(test_mileage)

# Keep only the leading numeric token of the engine-volume string.
df_test['Engine volume'] = df_test['Engine volume'].apply(lambda x: float(x.split(' ')[0]))

df_test['Levy'] = df_test['Levy'].astype(int)

# NOTE(review): Cylinders is capped at 15 here but not in the training
# preprocessing -- confirm this asymmetry is intended.
df_test.loc[df_test.Cylinders > 15, 'Cylinders'] = 15

df_test.loc[~df_test.Manufacturer.isin(top_manufacturers), 'Manufacturer'] = 'Other'

# Encode every level (no drop_first) and let the reindex below pick exactly
# the columns the model was trained on. Dropping the first level here could
# remove a different level than was dropped for the training data.
df_dummies_test = pd.get_dummies(df_test[cat_cols])

#############################################

# Align to the training design matrix: same columns, same order. Levels that
# never occur in the test data become all-zero columns; levels unseen during
# training are discarded.
X_submit = pd.concat([df_test[num_cols], df_dummies_test], axis=1)
X_submit = X_submit.reindex(columns=X_cols, fill_value=0)
X_submit_scaled = pd.DataFrame(scaler.transform(X_submit), columns=X_cols)
X_submit_scaled.head()

In [None]:
# Airbags distribution: training data (left) next to test data (right).
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(8, 2))
sns.histplot(df_train.Airbags, ax=ax_left)
sns.histplot(df_test.Airbags, ax=ax_right)
plt.show()

In [None]:
# Log-mileage spread: training split (left) next to the submission set (right).
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(8, 2))
sns.boxplot(X_train.Mileage, ax=ax_left)
sns.boxplot(X_submit.Mileage, ax=ax_right)
plt.show()

In [None]:
# Levy distribution: training split (left) next to the submission set (right).
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(16, 2))
sns.histplot(X_train.Levy, ax=ax_left)
sns.histplot(X_submit.Levy, ax=ax_right)
plt.show()

In [None]:
# First five rows of the (unscaled) submission design matrix.
X_submit.head(n=5)

In [None]:
# One box plot per numeric feature of the test data, laid out in a single row.
plt.figure(figsize=(16, 4))
n_panels = len(num_cols)
for position, col in enumerate(num_cols, start=1):
    plt.subplot(1, n_panels, position)
    sns.boxplot(df_test[col])

In [None]:
# Predict with `rf_best`, the forest that is actually fitted in this notebook;
# the baseline `rf_model` only exists in a commented-out cell.
# Predictions are on the log scale, so exponentiate to get prices and round to
# two decimals before writing the submission file.
predicted_price = np.exp(rf_best.predict(X_submit_scaled))
y_submit = pd.DataFrame(np.round(predicted_price, 2), columns=['Price'])
y_submit.to_csv('submission.csv', index=False)

In [None]:
# Predict with `rf_best`, which is fitted with fixed hyperparameters; the
# search result `rf_tuned` only exists in a commented-out cell.
# Predictions are on the log scale, so exponentiate to get prices and round to
# two decimals. This overwrites any earlier submission.csv.
tuned_price = np.exp(rf_best.predict(X_submit_scaled))
y_submit = pd.DataFrame(np.round(tuned_price, 2), columns=['Price'])
y_submit.to_csv('submission.csv', index=False)