# Imports

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Directory that holds the three dataset CSV files; change it in one place
# if the data lives somewhere else.
DATA_DIR = '../input/brilliantdiamondsregressiontask'

# Training data, test features, and test targets.
train_path = f'{DATA_DIR}/train.csv'
test_X_path = f'{DATA_DIR}/test_X.csv'
test_y_path = f'{DATA_DIR}/test_y.csv'

# Preprocessing

In [None]:
# Load the training set; the first CSV column is used as the row index.
df = pd.read_csv(train_path, index_col=0)
# Preview only the first rows instead of rendering the whole frame.
df.head()

Drop `url`, which is an arbitrary identifier, and `date_fetched`, which is the same for every row.

In [None]:
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['url', 'date_fetched'], errors='ignore')

# Exploratory Plots

In [None]:
# Distribution of the target, drawn on a log-scaled x axis.
fig, ax = plt.subplots()
ax.hist(df['price'], bins='auto')
ax.set_title('Histogram of sold prices')
ax.set_xlabel('Price')
ax.set_xscale('log')
ax.set_ylabel('Frequency')
fig.tight_layout()

The histogram shows a very long tail, which could indicate anomalies (outliers).

In [None]:
# Boxplot of the target to make extreme values visible.
fig, ax = plt.subplots()
ax.boxplot(df['price'])
ax.set_title('Boxplot of sold prices')
ax.set_ylabel('Price')
fig.tight_layout()

The boxplot confirms our suspicion: there are many anomalies (outliers). This will inform how we scale the data later.

In [None]:
# Histogram of carat on a log-scaled x axis.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(df['carat'], bins='auto')
hist_ax.set_title('Histogram of Carats')
hist_ax.set_xlabel('Carat')
hist_ax.set_xscale('log')
hist_ax.set_ylabel('Frequency')
hist_fig.tight_layout()

# Boxplot of carat in a separate figure.
box_fig, box_ax = plt.subplots()
box_ax.boxplot(df['carat'])
box_ax.set_title('Boxplot of Carats')
box_ax.set_ylabel('Carat')
box_fig.tight_layout()

Carat values also show a wide distribution.

In [None]:
# Bar chart of how often each shape occurs.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=df['shape'], ax=ax)
fig.tight_layout()

# Share of each shape, as a percentage of all counted rows.
shape_counts = df['shape'].value_counts()
shape_counts / shape_counts.sum() * 100

In [None]:
# Bar chart of how often each cut occurs.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=df['cut'], ax=ax)
fig.tight_layout()

# Share of each cut, as a percentage of all counted rows.
cut_counts = df['cut'].value_counts()
cut_counts / cut_counts.sum() * 100

In [None]:
# Bar chart of how often each clarity grade occurs.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=df['clarity'], ax=ax)
fig.tight_layout()

# Share of each clarity grade, as a percentage of all counted rows.
clarity_counts = df['clarity'].value_counts()
clarity_counts / clarity_counts.sum() * 100

In [None]:
# Bar chart of how often each color grade occurs.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=df['color'], ax=ax)
fig.tight_layout()

# Share of each color grade, as a percentage of all counted rows.
color_counts = df['color'].value_counts()
color_counts / color_counts.sum() * 100

Each categorical variable is severely imbalanced across its classes.

# Modelling

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Remove the target column from the feature frame and keep its values as y.
# Note that pop() mutates df, so this cell can only be run once per load.
price = df.pop('price')
y = price.values

In [None]:
# Preview the remaining feature columns ('price' was popped out of df above).
df.head()

In [None]:
# Categorical features are the object-dtype columns left in df.
cat_cols = df.columns[df.dtypes == 'object']

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, so try the new keyword first and fall back to the old one.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising when the fitted encoder is applied to other data.
try:
    oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    oh = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Dense one-hot matrix for the categorical columns.
X = oh.fit_transform(df[cat_cols])

In [None]:
# Append carat as one extra numeric column to the right of the one-hot block.
# Re-running this cell appends the column again, so run it once per encoding.
carat_column = df['carat'].values[:, None]
X = np.concatenate([X, carat_column], axis=1)

In [None]:
# Features and target must describe the same rows before any model is fitted.
assert X.shape[0] == y.shape[0], "X and y have different numbers of rows"
X.shape, y.shape

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Wrap the metric as a scorer. With greater_is_better=False scikit-learn negates
# the value, so the scores returned by cross_val_score are negative RMSE.
# NOTE: this rebinds the name `rmse` from the plain function to the scorer object.
rmse = make_scorer(rmse, greater_is_better=False)

# A fixed random_state makes the shuffled folds reproducible and identical on
# every cv.split call, so every model is evaluated on the same splits.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

## LGBMRegressor

In [None]:
%%time
from lightgbm import LGBMRegressor
model = LGBMRegressor(objective='rmse', metric='rmse')

scores = cross_val_score(
    model,
    X, y,
    scoring=rmse,
    cv=cv
)

plt.plot(scores)
plt.title(f"Cross Validation Score for {model}")
plt.xlabel("Fold")
plt.ylabel("RMSE")
plt.tight_layout()
print(f"Mean RMSE: {round(np.mean(scores*-1), 5)}")

## XGBRegressor

In [None]:
%%time
from xgboost import XGBRegressor
model = XGBRegressor()

scores = cross_val_score(
    model,
    X, y,
    scoring=rmse,
    cv=cv
)

plt.plot(scores)
plt.title(f"Cross Validation Score for {model}")
plt.xlabel("Fold")
plt.ylabel("RMSE")
plt.tight_layout()
print(f"Mean RMSE: {round(np.mean(scores*-1), 5)}")

## CatBoostRegressor

In [None]:
%%time
from catboost import CatBoostRegressor
model = CatBoostRegressor(verbose=0)

scores = cross_val_score(
    model,
    X, y,
    scoring=rmse,
    cv=cv
)

plt.plot(scores)
plt.title(f"Cross Validation Score for {model}")
plt.xlabel("Fold")
plt.ylabel("RMSE")
plt.tight_layout()
print(f"Mean RMSE: {round(np.mean(scores*-1), 5)}")

# Training Best Model

In [None]:
# Load the test features; the first CSV column is used as the row index.
test_X_df = pd.read_csv(test_X_path, index_col=0)
# Preview the first rows of the test features.
test_X_df.head()

In [None]:
# Drop the same columns that were removed from the training frame. Reassigning
# with errors='ignore' keeps this cell safe to re-run (no KeyError on a second run).
test_X_df = test_X_df.drop(columns=['url', 'date_fetched'], errors='ignore')

In [None]:
# Encode the test categoricals with the encoder fitted on the training data,
# so the one-hot columns line up with those of X.
test_categoricals = test_X_df.loc[:, cat_cols]
test_X = oh.transform(test_categoricals)

In [None]:
# Append carat as the last column, mirroring how X was built for training.
# Re-running this cell appends the column again, so run it once per encoding.
test_carat_column = test_X_df['carat'].values[:, None]
test_X = np.concatenate([test_X, test_carat_column], axis=1)

In [None]:
# Load the test targets; the first CSV column is used as the row index.
test_y = pd.read_csv(test_y_path, index_col=0)
# Preview the first rows of the test targets.
test_y.head()

In [None]:
# The test matrix must have one row per test target and the same feature
# width as the training matrix before it is passed to the fitted models.
assert test_X.shape[0] == test_y.shape[0], "test_X and test_y have different numbers of rows"
assert test_X.shape[1] == X.shape[1], "test_X and X have different numbers of features"
test_X.shape, test_y.shape

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error of y_pred against y_true.

    Redefined here because the name `rmse` was rebound to a scorer object
    earlier in the notebook.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Fit one CatBoost model per CV fold and average the models' predictions on
# the held-out test set.
n_folds = cv.get_n_splits()  # taken from cv so the average stays correct if n_splits changes
preds = np.zeros(test_X.shape[0])

for fold, (train_index, val_index) in enumerate(cv.split(X), start=1):
    # "val" is the in-training validation fold, not the final test set (test_X).
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    model = CatBoostRegressor(verbose=0)
    # The validation fold is also the early-stopping set, so the per-fold RMSE
    # printed below is measured on data that was used to choose the stopping point.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=5)

    loss = rmse(y_val, model.predict(X_val))
    print(f"RMSE for Fold {fold}: {round(loss, 5)}")

    # Accumulate this fold's test predictions; divided by n_folds after the loop.
    preds += model.predict(test_X)

preds /= n_folds

In [None]:
# Final hold-out score: RMSE of the fold-averaged predictions against test_y.
# NOTE(review): test_y is a DataFrame while preds is a 1-D array; this presumably
# relies on test_y holding a single target column in the same row order as
# test_X_df — confirm.
rmse(test_y, preds)