In [None]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor,RandomForestRegressor

In [None]:
#Import dataset
# Load the diamonds table from the working directory.
# NOTE(review): if the CSV was saved together with its row index, that column is
# read as an ordinary feature -- confirm against the file and drop it if present.
diamond = pd.read_csv("diamonds.csv")
# `shape` is a plain (rows, columns) tuple; it cannot be subscripted with a string.
print(diamond.shape)
# Preview goes last: a notebook cell renders only its final expression.
diamond.head()

In [None]:
#Find missing values
# Maps column name -> number of missing entries; filled in by detect_missing.
missing_col = {}


def detect_missing(df, col):
    """Record in ``missing_col`` how many NaN entries ``df[col]`` has.

    Nothing is stored for a column without missing values, and the function
    returns None either way; its only effect is the update of the
    module-level ``missing_col`` dictionary.
    """
    n_missing = df[col].isna().sum()
    if n_missing == 0:
        return
    missing_col[col] = n_missing
        
# Iterating a DataFrame yields its column labels, so this visits every column
# once; detect_missing stores the ones with gaps in missing_col.
for col in diamond:
    detect_missing(diamond, col)

# An empty dict here means no column has missing values.
print(missing_col)

In [None]:
#Find datatypes of columns
# Shows the dtype pandas inferred for each column of the loaded frame; the
# non-numeric columns listed here are the ones that get encoded further down.
diamond.dtypes

In [None]:
#Working with non-numeric col(Clarity,color,cut) ---- One-hot encoding
# Category frequencies per column. These are evaluated but not rendered,
# because a cell only displays its final expression.
diamond['clarity'].value_counts()
diamond['color'].value_counts()
diamond['cut'].value_counts()

# One indicator frame per categorical column; the new columns are named after
# the category values themselves (no prefix is added).
Onehot1, Onehot2, Onehot3 = (
    pd.get_dummies(diamond[name]) for name in ('clarity', 'color', 'cut')
)

# Append the indicator columns to the right of the original table. The source
# categorical columns are still present at this point.
df_diamond = pd.concat((diamond, Onehot1, Onehot2, Onehot3), axis='columns')
df_diamond.head()
df_diamond.columns

In [None]:
#Removing the original categorical columns
# Their information now lives in the one-hot indicator columns, so the text
# versions are dropped. Re-running this cell on the already-trimmed frame
# raises KeyError because the columns are gone.
df_diamond = df_diamond.drop(columns=['clarity', 'color', 'cut'])
df_diamond.head()
df_diamond.columns

In [None]:
#Checking correlation between features
# Pairwise correlation of every column in the encoded frame, drawn as a heatmap.
sns.heatmap(df_diamond.corr())

In [None]:
#Splitting the dataset into training and test sets
# Target is the price column; every remaining column is used as a feature.
y = df_diamond['price']
X = df_diamond.drop(columns=['price'])
print(X.shape, y.shape)

# Fix the shuffle seed so the split, and every score computed from it, is the
# same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# The scaler is fitted on the training rows only and then applied unchanged to
# the test rows, so no test-set statistics enter the fit.
sc = StandardScaler()
scaledX_train = sc.fit_transform(X_train)
scaledX_test = sc.transform(X_test)
scaledX_train.shape, scaledX_test.shape

In [None]:
#Building model using Linear Regression
# Least-squares fit on the standardized training features; the value of the
# cell is the R^2 of its predictions on the held-out split.
lm = LinearRegression().fit(scaledX_train, y_train)
y_pred = lm.predict(scaledX_test)
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
#Building model using Polynomial Features
# Expand the standardized features to all degree-2 terms, using the expansion
# fitted on the training rows for both splits.
poly = PolynomialFeatures(degree=2)
polyX_train = poly.fit_transform(scaledX_train)
polyX_test = poly.transform(scaledX_test)

# The existing `lm` estimator is refitted here, so from this point on it holds
# the polynomial model rather than the plain linear one.
y_pred4 = lm.fit(polyX_train, y_train).predict(polyX_test)
r2_score(y_true=y_test, y_pred=y_pred4)

In [None]:
#Building model using Decision Tree
# The split criterion is left at its default, which is squared error. The
# explicit 'mse' spelling is rejected by current scikit-learn releases, while
# the default works on old and new versions alike.
dt = DecisionTreeRegressor(max_depth = 13)
dt.fit(scaledX_train,y_train)
y_pred1 = dt.predict(scaledX_test)
# R^2 on the held-out split is the value of the cell.
r2_score(y_test,y_pred1)

In [None]:
#Building model using Bagging Regressor
# Ensemble of 35 base estimators fitted on the standardized training features;
# the value of the cell is the R^2 on the held-out split.
bg = BaggingRegressor(n_estimators=35).fit(scaledX_train, y_train)
y_pred2 = bg.predict(scaledX_test)
r2_score(y_true=y_test, y_pred=y_pred2)

In [None]:
#Building model using Random Forest Regressor
# Forest of 10 trees fitted on the standardized training features; the value
# of the cell is the R^2 on the held-out split.
rf = RandomForestRegressor(n_estimators=10).fit(scaledX_train, y_train)
y_pred3 = rf.predict(scaledX_test)
r2_score(y_true=y_test, y_pred=y_pred3)

In [None]:
#Choosing the best polynomial feature-based model
# Fit one linear model per polynomial degree and keep its held-out R^2.
# The number of expanded features grows combinatorially with the degree, so the
# sweep is capped at a small maximum; raise it only if memory and time allow.
# NOTE(review): the degree is picked on the test split, so the winning score is
# optimistic -- consider a validation split or cross-validation instead.
max_poly_degree = 3
l1 = {}
for i in range(1, max_poly_degree + 1):
    # The loop variable must drive the degree; with a fixed degree every
    # iteration would fit the very same model.
    # Sweep-local names keep the notebook-level `lm`, `poly`, `polyX_train`
    # and `polyX_test` untouched.
    poly_i = PolynomialFeatures(degree = i)
    polyX_train_i = poly_i.fit_transform(scaledX_train)
    polyX_test_i = poly_i.transform(scaledX_test)
    lm_i = LinearRegression().fit(polyX_train_i, y_train)
    y_pred_temp = lm_i.predict(polyX_test_i)
    l1[i] = r2_score(y_test, y_pred_temp)

# Report every degree that reaches the best score (ties are all printed).
k = max(l1.values())
for key, score in l1.items():
    if score == k:
        print(key, k)
        

In [None]:
#Choosing the best decision_tree model
# Fit one tree per maximum depth (1..99) and keep its held-out R^2.
# The split criterion is left at its default (squared error): the explicit
# 'mse' spelling is rejected by current scikit-learn releases.
# NOTE(review): the depth is picked on the test split, so the winning score is
# optimistic -- consider a validation split or cross-validation instead.
l1 = {}
for i in range(1, 100):
    model = DecisionTreeRegressor(max_depth = i)
    model.fit(scaledX_train, y_train)
    y_pred_temp = model.predict(scaledX_test)
    l1[i] = r2_score(y_test, y_pred_temp)

# Report every depth that reaches the best score (ties are all printed).
k = max(l1.values())
for key, score in l1.items():
    if score == k:
        print(key, k)
        

In [None]:
#Choosing the best bagging_regressor model
# One BaggingRegressor per ensemble size (1..50), each scored by R^2 on the
# held-out split; l1 maps ensemble size -> score.
l1 = {}
for i in range(1, 51):
    model = BaggingRegressor(n_estimators=i).fit(scaledX_train, y_train)
    y_pred_temp = model.predict(scaledX_test)
    l1[i] = r2_score(y_test, y_pred_temp)

# Print each ensemble size whose score equals the maximum.
k = max(l1.values())
for key, score in l1.items():
    if score == k:
        print(key, k)
        

In [None]:
#Choosing the best random_forest_regressor model
# Fit one forest per size (1..50 trees) and keep its held-out R^2.
# NOTE(review): the size is picked on the test split, so the winning score is
# optimistic -- consider a validation split or cross-validation instead.
l1 = {}
for i in range(1, 51):
    rf = RandomForestRegressor(n_estimators = i)
    rf.fit(scaledX_train, y_train)
    # Score this forest's own predictions; scoring a prediction array left over
    # from another cell would give every forest size the same value.
    y_pred_temp = rf.predict(scaledX_test)
    l1[i] = r2_score(y_test, y_pred_temp)

# Report every forest size that reaches the best score (ties are all printed).
k = max(l1.values())
for key, score in l1.items():
    if score == k:
        print(key, k)
        

In [None]:
# Placeholder cell: intentionally contains no code.