# Diamonds price prediction

*Task : Predict diamonds price*

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [None]:
#import data
data = pd.read_csv('../input/diamonds/diamonds.csv')
data.head()

# EDA (Exploratory Data Analysis)

In [None]:
# 'Unnamed: 0' is the name pandas gives a header-less CSV column (presumably the
# exported row index - confirm); it carries no information for the model.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
data.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [None]:
data.shape

*notre dataset contient 53940 lignes et 10 colonnes*

In [None]:
data.info()

In [None]:
data.dtypes

In [None]:
data.isnull().sum()/data.shape[0]

Le dataset ne contient pas de valeurs manquantes

In [None]:
data.describe()

In [None]:
data.groupby('color')['price'].describe()

In [None]:
target = 'price'

In [None]:
data.price.hist(bins=10)

## Variables Numeriques

In [None]:
# One figure per numeric feature: scatter plus fitted regression line against the target.
# The target column itself is skipped, since price vs. price is a trivial diagonal.
numeric_features = data.select_dtypes(include=[np.number]).columns.drop(target, errors='ignore')
for col in numeric_features:
    plt.figure(figsize=(12, 8))
    sns.regplot(data=data, x=col, y=target)

*Remarque*
- nos variables numériques contiennent beaucoup d'outliers (carat, depth, x, y, z)
- on peut voir que la variable cible (price) est très corrélée avec les variables carat, x, y et z

# suppression des outliers

In [None]:
def detect_outliers(data):
    """Return only the rows of ``data`` whose measurements lie inside fixed bounds.

    Despite its name, the function filters rather than flags: a row is kept
    when every condition below holds. All bounds are strict (exclusive), and
    rows with a missing value in any tested column are dropped because the
    comparison evaluates to False.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns x, y, z, table, depth and carat.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index labels. ``data`` itself
        is not modified.
    """
    in_range = (
        (data['x'] > 0)
        & (data['y'] < 20)
        & (data['z'] < 10)
        & (data["z"] > 1.8)
        & (data['table'] > 50)
        & (data['table'] < 70)
        & (data['depth'] > 54)
        & (data['depth'] < 70)
        & (data['carat'] < 3.3)
    )
    return data[in_range]

In [None]:
data = detect_outliers(data=data)

In [None]:
# Same per-feature regression plots as before, now on the filtered data.
# The target column itself is skipped, since price vs. price is a trivial diagonal.
numeric_features = data.select_dtypes(include=[np.number]).columns.drop(target, errors='ignore')
for col in numeric_features:
    plt.figure(figsize=(12, 8))
    sns.regplot(data=data, x=col, y=target)

*on peut mieux observer les différentes corrélations*
- le prix du diamant est proportionnel au nombre de carats, ainsi qu'à x, y et z

## Exploration supplementaire des variables numerique
  ### Discretisation des variables

### 1- carat

In [None]:
# Cut carat into 5 equal-frequency (quantile) bins and draw one price bar per bin.
carat_ranges = pd.qcut(data['carat'], q=5)
plt.figure(figsize=(15, 5))
sns.barplot(data=data, x=carat_ranges.values, y=target)

### le prix du diamant est proportionnel à son nombre de carats
- plus le nombre de carats est élevé, plus le prix du diamant augmente

### 2- x (y, z)

In [None]:
# Cut x into 6 equal-frequency (quantile) bins and draw one price bar per bin.
x_ranges = pd.qcut(data['x'], q=6)
plt.figure(figsize=(15, 5))
sns.barplot(data=data, x=x_ranges.values, y=target)

* tout comme pour le carat, le prix du diamant est aussi proportionnel à x, y et z

### 3- Depth

In [None]:
# Cut depth into 4 equal-frequency (quantile) bins and draw one price bar per bin.
depth_ranges = pd.qcut(data['depth'], q=4)
plt.figure(figsize=(15, 5))
sns.barplot(data=data, x=depth_ranges.values, y=target)

on constate que la profondeur du diamant n'a pas d'impact sur son prix

## Categorical Variables

In [None]:
# One figure per non-numeric column showing how many rows fall in each category.
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
for col in categorical_columns:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=col, data=data)

## 1- Cut

In [None]:
# Side-by-side views of price per cut grade: bar plot on the left, box plot on the right.
# Both panels share their x and y axes so they can be compared directly.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharex=True, sharey=True)
bar_ax, box_ax = axes
sns.barplot(data=data, x='cut', y=target, ax=bar_ax)
bar_ax.set_title('cut')
sns.boxplot(data=data, x='cut', y=target, ax=box_ax)
box_ax.set_title('cut')

- on remarque que la qualité de la taille (cut) n'a pas une très grande influence sur le prix du diamant
- on remarque aussi la présence de beaucoup d'outliers

## 2- Clarity

In [None]:
# Side-by-side views of price per clarity grade: bar plot on the left, box plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharex=True, sharey=True)
bar_ax, box_ax = axes
sns.barplot(data=data, x='clarity', y=target, ax=bar_ax)
sns.boxplot(data=data, x='clarity', y=target, ax=box_ax)

- 1. le prix du diamant est relativement bas lorsque clarity vaut IF, VVS1 ou VVS2
- 2. la moyenne du prix du diamant augmente légèrement pour les autres valeurs de clarity
- 3. ceci ne donne pas beaucoup d'information sur le prix du diamant

## 3- Color

In [None]:
# Side-by-side views of price per color grade: bar plot on the left, box plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharex=True, sharey=True)
bar_ax, box_ax = axes
sns.barplot(data=data, x='color', y=target, ax=bar_ax)
sns.boxplot(data=data, x='color', y=target, ax=box_ax)

- tout comme les variables clarity et cut, la variable color n'est pas assez informative quant au prix du diamant
- on note tout de même que le prix du diamant est un peu plus élevé lorsque la couleur est I, J ou H

## distribution de la variable cible en fonction des variables catégorielles

In [None]:
def dist_plot(data):
    """Overlay the price distribution of each category, one subplot per categorical column.

    For every non-numeric column of ``data`` a subplot is created, and the
    distribution of ``data['price']`` is drawn once per distinct category
    value, labelled with that value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price`` column. Non-numeric columns are treated as
        categorical.

    Returns
    -------
    None
        The function only draws; it does nothing when ``data`` has no
        non-numeric column.
    """
    categorical_cols = data.select_dtypes(exclude=[np.number]).columns
    if len(categorical_cols) == 0:
        return

    # Size the grid from the data instead of assuming exactly three categorical columns.
    # squeeze=False keeps `axes` two-dimensional even when there is a single column.
    fig, axes = plt.subplots(1, len(categorical_cols), figsize=(20, 6),
                             sharex=True, squeeze=False)
    for ax, col in zip(axes[0], categorical_cols):
        # dropna(): a NaN category never equals itself, so it would select no rows.
        for item in data[col].dropna().unique():
            # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
            # here so the rendered figure stays the same.
            sns.distplot(data.loc[data[col] == item, 'price'], label=item, ax=ax)
        ax.set_title(col)
        # Attach the legend to this subplot; plt.legend() would only target the
        # current (last-created) axes and leave the other subplots without one.
        ax.legend()

In [None]:
dist_plot(data)

In [None]:
df = data.copy()

In [None]:
# Correlate the numeric columns only: cut, color and clarity are still strings at
# this point, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = df.select_dtypes(include=[np.number]).corr()

In [None]:
plt.figure(figsize=(14,10))
sns.heatmap(corr_matrix, cmap='RdBu_r', annot=True, linewidths=0.5, center=0)

- on retrouve déjà les interprétations faites pendant l'EDA

# feature engineering

In [None]:
def features_eng(data):
    """Add ``volume`` and ``carat_per_x`` to ``data`` and remove x, y and z.

    ``volume`` is the product x * y * z and ``carat_per_x`` is carat / x.
    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns carat, x, y and z.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the two new columns appended and the
        three dimension columns dropped.
    """
    dimension_columns = ['x', 'y', 'z']
    data['volume'] = data['x'].mul(data['y']).mul(data['z'])
    data['carat_per_x'] = data['carat'].div(data['x'])
    # The raw dimensions are removed once the derived features exist.
    data.drop(columns=dimension_columns, inplace=True)
    return data

In [None]:
#df = features_eng(data=df)

In [None]:
#df.head()

## Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
def preprocessing(data):
    """Label-encode every object-dtype column of ``data`` in place and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns are replaced by the output of
        ``LabelEncoder.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after encoding.
    """
    encoder = LabelEncoder()
    object_columns = data.select_dtypes('object').columns
    for name in object_columns:
        # fit_transform refits the shared encoder, so every column gets its own codes.
        data[name] = encoder.fit_transform(data[name])
    return data
def encodage(dfp):
    """Replace the cut, color and clarity labels of ``dfp`` with integer codes.

    The frame is modified in place and returned. Every column is read from
    ``dfp`` itself (not from a module-level frame), so the function works on
    any DataFrame that has these three columns. A label that is missing from
    a mapping becomes NaN.

    Parameters
    ----------
    dfp : pandas.DataFrame
        Must contain the string columns cut, color and clarity.

    Returns
    -------
    pandas.DataFrame
        The same ``dfp`` object with the three columns encoded.
    """
    cut_codes = {
        'Ideal': 5,
        'Premium': 4,
        'Very Good': 3,
        'Good': 2,
        'Fair': 1,
    }
    # NOTE(review): the color and clarity codes do not follow the usual grading
    # order (D..J, IF..I1); presumably they were chosen from the EDA - confirm.
    color_codes = {'E': 1, 'D': 2, 'F': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7}
    clarity_codes = {
        'VVS1': 1, 'IF': 2, 'VVS2': 3, 'VS1': 4,
        'I1': 5, 'VS2': 6, 'SI1': 7, 'SI2': 8,
    }

    # Plain column assignment (rather than .loc[:, col] = ...) lets each column take
    # the numeric dtype of the mapped values; writing through .loc can keep the
    # original object dtype in recent pandas versions, which would hide the column
    # from numeric-only selections later on.
    dfp['cut'] = dfp['cut'].map(cut_codes)
    dfp['color'] = dfp['color'].map(color_codes)
    dfp['clarity'] = dfp['clarity'].map(clarity_codes)
    return dfp
def binarisation(data):
    """Return a one-hot encoded copy of ``data`` using ``pd.get_dummies`` defaults.

    ``data`` itself is left untouched.
    """
    one_hot = pd.get_dummies(data)
    return one_hot

In [None]:
df = encodage(df)
df.head()

# Model

In [None]:
df.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, LabelBinarizer, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
#df.drop(['table', 'depth'], axis=1, inplace=True)
X = df.drop(["price"], axis=1)
y = df.price

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## fonction pour calculer le MSE et le RMSE

In [None]:
def mse(y_true, y_pred):
    """Print the mean squared error of the predictions and its square root.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    None
        Both metrics are printed, not returned.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_error = np.sqrt(squared_error)
    print(f'MSE = {squared_error}')
    print(f'RMSE = {root_error}')

## premier model avec une regression lineaire

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression

In [None]:
# Baseline: standardise the features, fit a linear regression on the training
# split, then report MSE / RMSE on the held-out split.
model1 = make_pipeline(StandardScaler(), LinearRegression())
y_pred = model1.fit(X_train, y_train).predict(X_test)
mse(y_true=y_test, y_pred=y_pred)

# utilisation des methodes d'ensemble

In [None]:
print(f' Xtrain shape : {X_train.shape}')
print(f' Xtest shape : {X_test.shape}')

In [None]:
numerical_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns

In [None]:
numerical_pipeline = make_pipeline(StandardScaler())
# categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

In [None]:
transformer = make_column_transformer((numerical_pipeline, numerical_features))

In [None]:
rf_reg = RandomForestRegressor(random_state=0, max_depth=8)
xgb_reg = XGBRegressor(random_state=0)
svm = SVR()

In [None]:
models = {
    "Random Forest": rf_reg,
    "XGBRegressor": xgb_reg
}

In [None]:
# Learning curve of each candidate model (5-fold CV, scored with negated MSE).
# The loop variable must stay named `model`: a later cell reads it after the loop.
for name, model in models.items():
    m = make_pipeline(transformer, model)
    N, train_score, val_score = learning_curve(
        m, X_train, y_train,
        cv=5, scoring='neg_mean_squared_error', error_score='raise',
    )
    # Scores are negated MSEs: average over the folds, flip the sign, then take
    # the square root (root of the mean MSE, not the mean of per-fold RMSEs).
    train_rmse = np.sqrt(-train_score.mean(axis=1))
    val_rmse = np.sqrt(-val_score.mean(axis=1))
    print(f'{name} : {val_rmse}')
    plt.figure(figsize=(10,8))
    plt.plot(N, train_rmse, label="train_score")
    plt.plot(N, val_rmse, label= "val_score")
    plt.legend()

le XGBRegressor est beaucoup plus prometteur

# optimisation de XGB Regressor avec GridSearchCV

In [None]:
# Hyper-parameter grid for the XGBoost step. The 'xgbregressor__' prefix is the step
# name make_pipeline derives from the estimator's class name (lower-cased).
param_grid = {
        'xgbregressor__n_estimators': [100, 300, 500, 600, 800, 1000],
        'xgbregressor__learning_rate': [0.01, 0.02, 0.05, 0.1],
        'xgbregressor__colsample_bytree': [0.4, 0.45, 0.5],
        'xgbregressor__reg_lambda': [0.6, 0.8, 0.85, 0.9]
}

# Search over the full pipeline, not the bare regressor left in `model` by the
# learning-curve loop: the prefixed parameter names above only resolve against a
# pipeline that contains an 'xgbregressor' step.
grid = GridSearchCV(make_pipeline(transformer, xgb_reg), param_grid, cv=4,
                    return_train_score=True, scoring="neg_mean_squared_error")

In [None]:
grid.fit(X_train, y_train)

### Après optimisation avec GridSearchCV

In [None]:
# Final model: scaled features -> degree-2 polynomial expansion -> XGBoost.
# NOTE(review): several of these values (colsample_bytree=0.4603, max_depth,
# min_child_weight, reg_alpha) are not part of the grid searched above, so they
# presumably come from elsewhere - confirm against grid.best_params_.
xgb_reg = XGBRegressor(
    random_state=0,
    n_estimators=600,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1.7817,
    colsample_bytree=0.4603,
    reg_alpha=0.6640,
    reg_lambda=0.8571,
)
model = make_pipeline(transformer, PolynomialFeatures(degree=2), xgb_reg)
y_pred = model.fit(X_train, y_train).predict(X_test)
mse(y_test, y_pred)