# Imports

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.express as px
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import LocalOutlierFactor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances

%matplotlib inline

In [None]:
# Global matplotlib look-and-feel: ggplot theme with a larger default canvas.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Data Summary

In [None]:
# Read the competition training set and preview the first rows.
train_csv = "/kaggle/input/playground-series-s3e8/train.csv"
data = pd.read_csv(train_csv)
data.head()

In [None]:
# Summary statistics, one row per numeric column.
data.describe().transpose()

In [None]:
# Column dtypes and non-null counts of the training frame.
data.info()

In [None]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

In [None]:
# Drop the row identifier without mutating in place; errors="ignore" keeps
# this cell re-runnable after `id` has already been removed.
data=data.drop(columns=['id'],errors='ignore')
cols=['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y','z']

# Boolean mask: True for every row whose feature values also occur in
# at least one other row (keep=False marks all members of a duplicate group).
d=data.duplicated(subset=cols,keep=False)

# Rows whose feature combination is unique. Boolean-mask selection is used
# so the result does not depend on the frame having a default RangeIndex.
d1=data.loc[~d]

# Rows that share identical features but carry different prices are collapsed
# to a single row per feature combination, priced at the group mean.
d2=data.loc[d].groupby(cols)["price"].mean().reset_index()

# Recombine both parts under a fresh 0..n-1 index.
data=pd.concat([d1,d2],axis=0).reset_index(drop=True)

In [None]:
# Preview the last rows of the frame produced by the de-duplication cell.
data.tail()

# Data Exploration

### Numerical Plots

In [None]:
# Distribution of the target variable.
price_hist = px.histogram(data, x="price")
py.iplot(price_hist)

In [None]:
# Pairwise scatter plots of the numeric features together with the target.
pair_cols = ['carat', 'x', 'y', 'z', 'depth', 'table', 'price']
sns.pairplot(data[pair_cols]);

In [None]:
# Price as a function of carat weight.
carat_scatter = px.scatter(data, x="carat", y="price")
py.iplot(carat_scatter)

In [None]:
# Price as a function of the x dimension.
x_scatter = px.scatter(data, x="x", y="price")
py.iplot(x_scatter)

**The target variable (price) follows a non-Gaussian distribution, so we will transform it.
The table variable could be grouped and converted into a categorical variable. The other variables have trough-and-crest distributions, so a transformation may not be effective.**

**Carat and the X, Y and Z dimensions are strongly correlated with each other and with the target variable, price.**

### Categorical Plots

In [None]:
# Share of each cut grade in the data.
cut_counts = data["cut"].value_counts()
cut_pie = px.pie(
    values=cut_counts.to_numpy(),
    names=cut_counts.index,
    title='Diamond cut distribution',
)
py.iplot(cut_pie)

In [None]:
# Mean price per cut grade.
mean_price_by_cut = data.groupby("cut", as_index=False)["price"].mean()
cut_bar = px.bar(mean_price_by_cut, x="cut", y="price")
py.iplot(cut_bar)

In [None]:
# Price spread within each cut grade.
cut_box = px.box(data, x="cut", y="price")
py.iplot(cut_box)

In [None]:
# Share of each color grade in the data.
color_counts = data["color"].value_counts()
color_pie = px.pie(
    values=color_counts.to_numpy(),
    names=color_counts.index,
    title='Diamond color distribution',
)
py.iplot(color_pie)

In [None]:
# Mean price per color grade.
mean_price_by_color = data.groupby("color", as_index=False)["price"].mean()
color_bar = px.bar(mean_price_by_color, x="color", y="price")
py.iplot(color_bar)

In [None]:
# Price spread within each color grade.
color_box = px.box(data, x="color", y="price")
py.iplot(color_box)

In [None]:
# Share of each clarity grade in the data.
clarity_counts = data["clarity"].value_counts()
clarity_pie = px.pie(
    values=clarity_counts.to_numpy(),
    names=clarity_counts.index,
    title='Diamond clarity distribution',
)
py.iplot(clarity_pie)

In [None]:
# Mean price per clarity grade.
mean_price_by_clarity = data.groupby("clarity", as_index=False)["price"].mean()
clarity_bar = px.bar(mean_price_by_clarity, x="clarity", y="price")
py.iplot(clarity_bar)

In [None]:
# Price spread within each clarity grade.
clarity_box = px.box(data, x="clarity", y="price")
py.iplot(clarity_box)

**The bar plots of the categorical variables (different mean prices for different categories) indicate that these variables influence gemstone prices, but the spread within each category also shows that the price depends on a complex relationship between multiple factors.**

# Feature Engineering

In [None]:
# Derived size features built from the x / y / z dimensions.
data = data.assign(
    area=data["x"] * data["y"],
    volume=data["x"] * data["y"] * data["z"],
    perimeter=2 * (data["x"] + data["y"]),
)

# Data Preprocessing

In [None]:
# Separate the predictors from the target; the target is shaped (n, 1)
# because the transformer applied to it later expects a 2-D input.
X = data.drop(columns=['price'])
y = data['price'].to_numpy().reshape(-1, 1)

# Column names split by dtype: object columns are treated as categorical.
numeric_col = X.select_dtypes(exclude="object").columns
cat_col = X.select_dtypes(include="object").columns

print(f"Numeric columns:\n{numeric_col}")
print(f"categorical columns:\n{cat_col}")

In [None]:
# Box-Cox power transform of the target; `target` is kept so predictions can
# be mapped back to the price scale with inverse_transform later on.
target = PowerTransformer(method='box-cox')
y_transformed_2d = target.fit_transform(y)
y_ = y_transformed_2d.flatten()

In [None]:
# One-hot encode the categorical columns; categories unseen at fit time are
# ignored at transform time rather than raising.
encoder = OneHotEncoder(handle_unknown='ignore')
cat_df = encoder.fit_transform(X[cat_col])

# Power-transform the numeric columns with the transformer's default method.
transform = PowerTransformer()
num_df = transform.fit_transform(X[numeric_col])

# Final design matrix: numeric block first, then the densified one-hot block.
# NOTE: `X` is rebound from a DataFrame to an ndarray here, so this cell
# cannot be re-run without first re-running the cell that builds `X`.
X = np.hstack((num_df, cat_df.toarray()))

# Algorithm Spot Checking

**Algorithm spot checking means evaluating multiple models on the data with minimal hyperparameter tuning, selecting the models that perform best, and then performing hyperparameter tuning on those models.**

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

In [None]:
def func(y,ypred):
    """Return the mean squared error between targets `y` and predictions `ypred`."""
    mse = mean_squared_error(y, ypred)
    return mse

def evaluate(model):
    """Cross-validate `model` on the notebook-level `X` / `y_` arrays.

    The scorer wraps `func`, and `cross_val_score` is called with its default
    splitter. Returns the array of per-fold scores.
    """
    mse_scorer = make_scorer(func)
    return cross_val_score(model, X, y_, scoring=mse_scorer)

In [None]:
def get_models():
    """Build the candidate regressors for the spot check.

    Returns:
        (names, models): two parallel lists holding a display name and an
        unfitted estimator for each candidate, in the same order.

    The ensemble models are seeded with random_state=42 (the seed used
    elsewhere in this notebook) so that repeated runs of the spot check
    yield the same scores. Ridge is left at its defaults.
    """
    seed = 42
    candidates = [
        ("Ridge", Ridge()),
        ("RandomForestRegressor", RandomForestRegressor(random_state=seed)),
        ("XGBRegressor", XGBRegressor(random_state=seed)),
        ("LGBMRegressor", LGBMRegressor(random_state=seed)),
    ]

    names = [name for name, _ in candidates]
    models = [estimator for _, estimator in candidates]
    return names, models

In [None]:
# Cross-validate every candidate; `results` holds one score array per model,
# in the same order as `names`.
names, models = get_models()
results = [evaluate(model=candidate) for candidate in models]

In [None]:
# Per-fold score distribution of each candidate; the marker shows the mean.
box_kwargs = dict(labels=names, showmeans=True)
plt.boxplot(results, **box_kwargs);

In [None]:
# Progress marker printed once the spot-check cells above have run.
print("Algorithm Spot Checking Done")

**We will use LightGBM as our model, as it has the lowest cross-validated mean squared error while being significantly faster than the XGBoost model.**

# Feature Selection

### Feature Importance

In [None]:
# Fit a default LightGBM model on every feature and plot its importances,
# indexed by column position in `X`.
model = LGBMRegressor()
model.fit(X, y_)

importances = model.feature_importances_
feature_positions = list(range(X.shape[1]))
sns.barplot(x=feature_positions, y=importances, color="red");

In [None]:
# Map column position -> importance, then rank the (position, importance)
# pairs from most to least important.
f_imp = {pos: imp for pos, imp in zip(range(X.shape[1]), model.feature_importances_)}
f_sel = sorted(f_imp.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Cross-validate LightGBM on the top-k features for k = 18..28; `results`
# collects one array of per-fold scores for each k.
scoring = make_scorer(score_func=func)
results = []
for n_top in range(18, 29):
    top_positions = [position for position, _ in f_sel[:n_top]]
    fold_scores = cross_val_score(LGBMRegressor(), X[:, top_positions], y_, scoring=scoring)
    results.append(fold_scores)

In [None]:
# Mean cross-validation score per feature count, with a band of one standard
# deviation on either side drawn in black.
x = range(18, 29)
cv_mean = np.mean(results, axis=1)
cv_std = np.std(results, axis=1)

sns.lineplot(x=x, y=cv_mean)
sns.lineplot(x=x, y=cv_mean - cv_std, color="black")
sns.lineplot(x=x, y=cv_mean + cv_std, color="black")

In [None]:
# Progress marker printed once the feature-selection cells above have run.
print("Feature Selection Done")

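**The cross-validated mean squared error plateaus at 25 features, therefore we will use 25 features for training our model.** _Note: the cells that follow still fit on the full feature matrix `X`; the 25-feature subset is not applied there._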

# Hyperparameter Optimization

In [None]:
# Hold out 10% of the rows as a validation set for hyperparameter search.
holdout = train_test_split(X, y_, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = holdout

In [None]:
def objective(trial):
    """Optuna objective: validation MSE of a LightGBM model.

    Samples n_estimators, learning_rate (log scale), reg_lambda and reg_alpha
    from `trial`, fits a seeded LGBMRegressor on the notebook-level
    `X_train` / `y_train`, and returns the mean squared error on
    `X_val` / `y_val`.
    """
    # Suggestion order is kept fixed so seeded sampling stays unchanged.
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 400),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
    }

    regressor = LGBMRegressor(random_state=42, **search_space)
    regressor.fit(X_train, y_train)

    return mean_squared_error(y_val, regressor.predict(X_val))

In [None]:
# Seeded TPE search that minimises the value returned by `objective`.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
# NOTE(review): `objective` never calls trial.report / trial.should_prune,
# so this pruner appears to have no effect on the search - confirm.
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)

study = optuna.create_study(
    study_name="Hyperparameter optimization",
    direction="minimize",
    sampler=tpe_sampler,
    pruner=median_pruner,
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

In [None]:
# Report the best objective value and the hyperparameters that produced it.
best_trial = study.best_trial
best_params_json = json.dumps(best_trial.params, indent=2)
print(f"Best value: {best_trial.value}")
print(f"Best hyperparameters:\n {best_params_json}")

# Hyperparameter Visualization

In [None]:
# Objective value across the trials of the study.
plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters and their objective values.
plot_parallel_coordinate(study)

In [None]:
# Estimated importance of each hyperparameter for the objective value.
plot_param_importances(study)

# Final Model

**After using the validation set to determine the best hyperparameters, we train the model with those hyperparameters on the entire dataset (train + validation) to obtain a better model.**

In [None]:
# Refit on all rows of `X` / `y_` using the best hyperparameters found by the study.
params = study.best_trial.params
model = LGBMRegressor(**params, random_state=42)
model.fit(X, y_)

# Submission

In [None]:
# Load the test set, keep its ids for the submission, and rebuild the same
# derived size features that were added to the training frame.
test_csv = "/kaggle/input/playground-series-s3e8/test.csv"
data = pd.read_csv(test_csv)
id = data["id"]

X = data.drop(columns=["id"])
X = X.assign(
    area=X["x"] * X["y"],
    volume=X["x"] * X["y"] * X["z"],
    perimeter=2 * (X["x"] + X["y"]),
)

# Column names split by dtype, as was done for the training frame.
numeric_col = X.select_dtypes(exclude="object").columns
cat_col = X.select_dtypes(include="object").columns

In [None]:
# Apply the already-fitted encoder and power transformer (transform only,
# no refit) and assemble the test matrix: numeric block, then one-hot block.
cat_df = encoder.transform(X[cat_col])
num_df = transform.transform(X[numeric_col])

X_test = np.hstack((num_df, cat_df.toarray()))

In [None]:
# Predict in the transformed target space, then map the predictions back to
# the original price scale with the fitted target transformer.
# The intermediate result is deliberately NOT stored in `y_`: that name holds
# the transformed training target used by the modelling cells, and overwriting
# it would break any later re-run of those cells.
y_pred_transformed=model.predict(X_test)
y_pred=target.inverse_transform(y_pred_transformed.reshape(-1,1)).flatten()

In [None]:
# Two-column submission frame: test ids alongside the predicted prices.
submission_columns = {"id": id, "price": y_pred}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv("submission.csv",index=False)