In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%notebook inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# reading file and checking head 

In [None]:
# Load the diamonds CSV, print its (rows, columns) shape, and preview the first rows.
DATA_PATH = "/kaggle/input/diamonds/diamonds.csv"
df = pd.read_csv(DATA_PATH)

print(df.shape)
df.head()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for the columns pandas' describe() includes by default.
df.describe()

In [None]:
# Print pandas' structural summary of the frame (columns, dtypes, non-null counts).
df.info()

# EDA

In [None]:
# Use seaborn's "darkgrid" style for the plots that follow.
sns.set_style("darkgrid")

In [None]:
# Number of diamonds in each cut grade.
plt.figure(figsize=(10, 8))
# Pass the column as a keyword: seaborn >= 0.12 accepts only `data` positionally,
# so countplot("cut", data=df) raises "got multiple values for argument 'data'".
sns.countplot(x="cut", data=df)
plt.show()

In [None]:
# Price against carat, coloured by cut; marker size also scales with price.
plt.figure(figsize=(15, 10))
# x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(
    x="carat",
    y="price",
    hue="cut",
    size="price",
    sizes=(40, 400),
    data=df,
)
plt.show()

In [None]:
# Price against clarity grade, coloured by clarity; marker size scales with price.
plt.figure(figsize=(15, 10))
# x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.scatterplot(
    x="clarity",
    y="price",
    hue="clarity",
    size="price",
    sizes=(40, 400),
    data=df,
)
plt.show()

In [None]:
# Mean price per clarity grade (barplot aggregates with the mean by default).
plt.figure(figsize=(10, 6))
plt.title("Price and Clarity")
# x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.barplot(x="clarity", y="price", data=df)
plt.show()

In [None]:
# Mean price per colour grade (barplot aggregates with the mean by default).
plt.figure(figsize=(10, 6))
# x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.barplot(x="color", y="price", data=df)
plt.show()

In [None]:
# Price trend across colour grades.
# x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(x="color", y="price", data=df)
plt.show()

In [None]:
# Price trend across clarity grades.
# x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(x="clarity", y="price", data=df)
plt.show()

# checking categorical data

In [None]:
# Drop the "Unnamed: 0" column; the raw frame `df` is left untouched.
df1 = df.drop(columns="Unnamed: 0")

In [None]:
# For every object-dtype column print its name, the number of distinct
# values, and the distinct values themselves, followed by a blank line.
categorical = df1.select_dtypes("object")
for column_name in categorical.columns:
    levels = categorical[column_name].unique()
    print(column_name, len(levels), levels)
    print("")

# encoding data

**cut**: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

**color**: diamond colour, from J (worst) to D (best)

**clarity**: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

In [None]:
# Work on a copy so that later changes to df2 do not modify df1.
df2 = df1.copy()

In [None]:
# Ordinal-encode the two graded categorical features.
# The original assigned the *whole* replaced DataFrame to a single column
# (df2["cut"] = df2.replace(...)), which raises a ValueError; the mapping has
# to be applied to the one column being encoded.
CUT_ORDER = {"Fair": 0, "Good": 1, "Very Good": 2, "Premium": 3, "Ideal": 4}
CLARITY_ORDER = {
    "I1": 0, "SI2": 1, "SI1": 2, "VS2": 3,
    "VS1": 4, "VVS2": 5, "VVS1": 6, "IF": 7,
}

# Read from df1 (the unencoded frame) so re-running this cell is idempotent:
# mapping an already-encoded column would turn every value into NaN.
df2["cut"] = df1["cut"].map(CUT_ORDER)
df2["clarity"] = df1["clarity"].map(CLARITY_ORDER)

In [None]:
# Preview the first rows of df2.
df2.head()

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
# Restrict to numeric dtypes explicitly: df2 still holds the string-valued
# "color" column, and pandas >= 2.0 raises on corr() instead of silently
# skipping non-numeric columns.
corr = df2.select_dtypes("number").corr()
plt.figure(figsize=(10, 7))
sns.heatmap(corr, annot=True)
plt.show()

# initializing models

In [None]:
# Candidate regressors, keyed by a left-padded display name (the padding
# right-aligns the names when they are printed).
# NOTE(review): n_neighbors=49 is presumably taken from the K-vs-error sweep
# further down - confirm it is still the preferred value.
models = {
    "                    Linear Regression": LinearRegression(),
    "                                Ridge": Ridge(),
    "                                Lasso": Lasso(),
    "                  K Nearest Neighbors": KNeighborsRegressor(n_neighbors=49),
    # Seeded so the forest (and its reported scores) are reproducible across
    # runs; 101 matches the seed used for the split and the bagging models.
    "                        Random Forest": RandomForestRegressor(max_depth=5, random_state=101)
}

# splitting data

In [None]:
# Features / target for the first (ordinal-encoded) approach.
X = df2.drop(columns=["price", "color", "table", "depth"])
y = df2["price"]

# Split BEFORE scaling. Fitting the scaler on the full data set lets the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), which makes the test scores optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Sweep K from 1 to 49 and record the test-set RMSE of a KNN regressor for each K.
# The original used np.mean(pred != y_test), a classification error rate:
# for continuous predictions it is ~1.0 for every K and carries no information.
error = []

for i in range(1, 50):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    # Root-mean-squared error, in the same units as the target.
    error.append(np.sqrt(np.mean((y_test - pred) ** 2)))

In [None]:
# Plot the recorded error for each K, with an x tick at every second K.
k_values = range(1, 50)

plt.figure(figsize=(10, 7))
plt.plot(k_values, error, marker="*")
plt.title("K vs Error")
plt.xlabel("K")
plt.ylabel("Error")
plt.xticks(k_values[::2])
plt.show()


# training 

In [None]:
# Fit each candidate model on the training split and report when it is done.
for label in models:
    models[label].fit(X_train, y_train)
    print(f"{label} trained.")

# checking accuracy (R² score on the test and training sets)

In [None]:
# Print each model's .score() on the test split and then on the training
# split, framed by a 60-character rule and followed by blank lines.
rule = "--------------------"*3
for name, model in models.items():
    print(name)
    print(rule)
    for template, features, target in (
        ("Testing Accuracy: {:.5f}", X_test, y_test),
        ("Training Accuracy: {:.5f}", X_train, y_train),
    ):
        print(template.format(model.score(features, target)))
    print(rule)
    print('\n')
                     

# bagging

In [None]:
# Bagging ensemble: 20 KNN regressors (K=25), each fit on a random 50%
# sample of the training rows; seeded for reproducibility.
knn_base = KNeighborsRegressor(n_neighbors=25)
bg_model = BaggingRegressor(
    knn_base,
    n_estimators=20,
    max_samples=0.5,
    random_state=101,
)

In [None]:
# Fit the bagging ensemble on the training split.
bg_model.fit(X_train, y_train)

In [None]:
# Score of the bagging ensemble on the held-out test split.
bg_model.score(X_test, y_test)

In [None]:
# Score on the training split, for comparison with the test score above.
bg_model.score(X_train, y_train)

# 2nd approach

In [None]:
# Second feature set: start again from the raw frame and remove the
# "Unnamed: 0", "color", "table" and "cut" columns (df itself is unchanged).
df3 = df.drop(columns=['Unnamed: 0', 'color', 'table', 'cut'])

# generating dummies

In [None]:
# One-hot encode the non-numeric columns of df3; numeric columns pass through unchanged.
dummies = pd.get_dummies(df3)

In [None]:
# Preview the first rows of the one-hot encoded frame.
dummies.head()

In [None]:
# Drop the LAST indicator column so the dummy columns are not perfectly
# collinear (the dropped level is implied when all the others are 0).
# The original `dummies.drop(dummies.iloc[:,-1], inplace=True)` passed that
# column's *values* as row labels, so it removed rows (or raised KeyError)
# instead of removing the column.
# Rebuilt from df3 rather than mutated in place so that re-running this cell
# does not strip a further column each time.
dummies = pd.get_dummies(df3).iloc[:, :-1]

# splitting data

In [None]:
# Features / target for the second (one-hot encoded) approach.
X = dummies.drop(columns="price")
y = dummies["price"]

# Split BEFORE scaling. Fitting the scaler on the full data set lets the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), which makes the test scores optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Re-fit every candidate model on the new training split and report when it is done.
for label in models:
    models[label].fit(X_train, y_train)
    print(f"{label} trained.")

In [None]:
# Print each model's .score() on the test split and then on the training
# split, framed by a 60-character rule and followed by blank lines.
rule = "--------------------"*3
for name, model in models.items():
    print(name)
    print(rule)
    for template, features, target in (
        ("Testing Accuracy: {:.5f}", X_test, y_test),
        ("Training Accuracy: {:.5f}", X_train, y_train),
    ):
        print(template.format(model.score(features, target)))
    print(rule)
    print('\n')
                     

# bagging

In [None]:
# Bagging ensemble for the second approach: 30 KNN regressors (K=25), each
# fit on a random 30% sample of the training rows; seeded for reproducibility.
knn_base = KNeighborsRegressor(n_neighbors=25)
bg_model = BaggingRegressor(
    knn_base,
    n_estimators=30,
    max_samples=0.3,
    random_state=101,
)

In [None]:
# Fit the bagging ensemble on the training split.
bg_model.fit(X_train, y_train)

In [None]:

# Score of the bagging ensemble on the held-out test split.
bg_model.score(X_test, y_test)

In [None]:
# Score on the training split, for comparison with the test score above.
bg_model.score(X_train, y_train)