In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path below the Kaggle input directory, then print them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
# Global plotting defaults, applied in the same order as before:
# figure size, then the matplotlib style sheet, then the seaborn settings.
plt.rcParams.update({'figure.figsize': (12, 6)})
plt.style.use('fivethirtyeight')
sns.set(style="whitegrid", context="notebook", palette='dark')

In [None]:
# Read the diamonds CSV and preview the first rows.
DIAMONDS_CSV = '../input/diamonds/diamonds.csv'
df = pd.read_csv(DIAMONDS_CSV)
df.head()

In [None]:
# shape of the data
df.shape

In [None]:
# Drop the leading "Unnamed" column (presumably the row index that was saved
# into the CSV - verify against df.head()).
# The name check makes this cell idempotent: a plain positional, in-place drop of
# df.columns[0] would remove the next real column every time the cell is re-run.
if str(df.columns[0]).startswith('Unnamed'):
    df = df.drop(columns=df.columns[0])

In [None]:
# checking data types
df.info()

# Checking null values

In [None]:
df.isnull().sum()

No Null values in the dataset

In [None]:
df.shape

In [None]:
#visualizing missing numbers
msno.matrix(df)

In [None]:
df.describe()

The minimum values of x, y and z are 0, which is not realistic for a physical dimension. Let's look at those rows.


In [None]:
df[(df['x']==0) | (df['y']==0) | (df['z']==0)]

Only a few rows are affected, so we will drop them.

In [None]:
# Keep only the rows in which none of the three dimensions is zero.
# The explicit .copy() makes the result an independent frame, so assigning a new
# column to it later (e.g. 'volume') does not trigger SettingWithCopyWarning.
has_zero_dim = (df['x'] == 0) | (df['y'] == 0) | (df['z'] == 0)
df = df[~has_zero_dim].copy()

Confirm that these rows were removed (the result below should be empty).

In [None]:
df[(df['x']==0) | (df['y']==0) | (df['z']==0)]

In [None]:
# Check the correlation between the numeric features.
# corr() is computed on the numeric columns only: the non-numeric columns are
# still present at this point (they are encoded later with get_dummies), and
# pandas >= 2.0 raises on them instead of silently skipping them.
plt.figure(figsize=(12, 6))
numeric_corr = df.select_dtypes(include=np.number).corr()
sns.heatmap(numeric_corr, annot=True, cmap='viridis', cbar=True)
plt.show()

Let's check the plots

In [None]:
# Split the column names by dtype: numeric columns vs. everything else.
numerical_cols = list(df.select_dtypes(include=np.number).columns)
categorical_cols = list(df.select_dtypes(exclude=np.number).columns)

In [None]:
numerical_cols

In [None]:
categorical_cols

### Checking categorical columns


**Feature "CUT" EDA**

In [None]:
# Count of diamonds per cut grade.
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally,
# so a positional column name collides with the data= keyword and raises TypeError.
sns.catplot(x='cut', data=df, kind='count', aspect=2.5)

In [None]:
sns.catplot(x='cut', y='price', kind='box', data=df, aspect=2.5)

**Feature "COLOR" EDA**

In [None]:
# Count of diamonds per color grade.
# The column is passed as x=...: seaborn >= 0.12 only accepts `data` positionally,
# so a positional column name collides with the data= keyword and raises TypeError.
sns.catplot(x='color', data=df, kind='count', aspect=2.5)

In [None]:
sns.catplot(x='color', y='price', data=df, aspect =2.5, kind='box')

#### Numerical columns EDA

In [None]:
numerical_cols

In [None]:
sns.pairplot(df[numerical_cols], kind='reg')

In [None]:
# Create a new 'volume' column as the product of the three dimensions.
# assign() returns a new frame instead of writing into `df`, which was produced
# by a boolean filter; this avoids SettingWithCopyWarning on the assignment.
df = df.assign(volume=df['x'] * df['y'] * df['z'])

In [None]:
df.head()

In [None]:
# Remove the raw dimension columns now that 'volume' has been derived from them.
df = df.drop(columns=['x', 'y', 'z'])

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Convert to feature matrix X and target vector y
target_col = 'price'
y = df[target_col]
X = df.drop(columns=[target_col])

Now split X and y into train and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train_tx = sc.transform(X_train)
X_test_tx = sc.transform(X_test)

##### Let's wrap up the dataset in a tuple so that if required we can create a new feature engineered dataset to run the models again

In [None]:
# Bundle the train/test splits and a label into one tuple, in the order
# (X_train, X_test, y_train, y_test, name).
# NOTE(review): this uses the unscaled X_train / X_test; the scaled arrays
# X_train_tx / X_test_tx computed earlier are not referenced here - confirm
# whether the scaled features were meant to be used instead.
dataset_1=(X_train, X_test, y_train, y_test, 'dataset_1')

In [None]:
# Empty accumulators, one independent list per reported quantity.
(model_, cv_score_test, cv_score_train,
 mse_, mae_, rmse_, r2_) = ([] for _ in range(7))



In [None]:
def run_model(model, dataset, modelname):
    """Fit ``model``, evaluate it on the held-out split, and record the metrics.

    Parameters
    ----------
    model : estimator
        Object providing ``fit``, ``predict`` and ``score``.
    dataset : tuple
        ``(X_train, X_test, y_train, y_test, ...)``; only the first four
        items are read.
    modelname : str
        Label used in the printed report and stored in ``model_``.

    Side effects
    ------------
    Prints a report and appends one entry to each of the module-level lists
    ``model_``, ``cv_score_test``, ``cv_score_train``, ``mse_``, ``mae_``,
    ``rmse_`` and ``r2_``. Despite its name, ``cv_score_test`` receives
    ``model.score`` on the test split; ``cv_score_train`` receives the mean of
    the 5-fold ``cross_val_score`` results on the training split.
    """
    # Name the pieces once instead of indexing the tuple with magic numbers.
    X_train, X_test, y_train, y_test = dataset[:4]

    model.fit(X_train, y_train)
    accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5, verbose=1)
    y_pred = model.predict(X_test)
    print('')
    score_1 = model.score(X_test, y_test)
    print(f'#### {modelname} ####')
    print("score :%.4f" % score_1)
    print(accuracies)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mse ** 0.5  # reuse the MSE instead of computing it a second time
    r2 = r2_score(y_test, y_pred)

    print('')
    print('MSE    : %0.2f ' % mse)
    print('MAE    : %0.2f ' % mae)
    print('RMSE   : %0.2f ' % rmse)
    print('R2     : %0.2f ' % r2)

    # Record this run in the shared result lists (one entry per list).
    model_.append(modelname)
    cv_score_test.append(score_1)
    cv_score_train.append(np.mean(accuracies))
    mse_.append(mse)
    mae_.append(mae)
    rmse_.append(rmse)
    r2_.append(r2)

In [None]:
# Candidate regressors keyed by the name shown in the results.
#
# Lasso and Ridge no longer accept `normalize=True` (removed in scikit-learn 1.2,
# where it raises TypeError), so scaling is done by an explicit StandardScaler
# step in a pipeline. The effective regularisation strength is not the same as
# with the old `normalize=True`, so their scores are not numerically comparable
# with results produced by the old option.
#
# The ensemble models get a fixed random_state so repeated runs give the same numbers.
from sklearn.pipeline import make_pipeline

model_dict = {
    'LinearRegression': LinearRegression(),
    'LassoRegression': make_pipeline(StandardScaler(), Lasso()),
    'AdaBoostRegressor': AdaBoostRegressor(n_estimators=1000, random_state=1),
    'RidgeRegression': make_pipeline(StandardScaler(), Ridge()),
    'GradientBoostingRegressor': GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=1, verbose=1, random_state=1
    ),
    'RandomForestRegressor': RandomForestRegressor(random_state=1),
    'KNeighborsRegressor': KNeighborsRegressor(),
}

In [None]:
# Run a single model through run_model.
run_model(
    model=model_dict['LinearRegression'],
    dataset=dataset_1,
    modelname="LinearRegression",
)

In [None]:
# Start from empty result lists: run_model() appends on every call, so without
# this reset an earlier single-model run or a re-run of this cell would leave
# duplicate rows in the summary table.
for results in (model_, cv_score_test, cv_score_train, mse_, mae_, rmse_, r2_):
    results.clear()

# Fit and evaluate every candidate model on dataset_1.
for name, estimator in model_dict.items():
    run_model(estimator, dataset_1, name)

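**RandomForest reaches an R² score of about 0.98 on the test set (R² is a goodness-of-fit measure for regression, not a classification accuracy). The model is giving excellent results.**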

In [None]:
# Collect the per-model results into one summary table (one row per run_model call).
# 'CV Test score' holds model.score on the test split and 'CV Train score (mean)'
# the mean cross-validation score on the training split.
# The third metric label was garbled ('%%SVGean Squared error'); it is the MSE column.
summary_columns = [
    'Model',
    'CV Test score',
    'CV Train score (mean)',
    'Mean Squared error',
    'Mean Absolute error',
    'Root Mean Squared error',
    'R2 Score',
]
accuracy_data = pd.DataFrame(
    list(zip(model_, cv_score_test, cv_score_train, mse_, mae_, rmse_, r2_)),
    columns=summary_columns,
)

In [None]:
accuracy_data