In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diamonds CSV, using its first column as the row index.
df = pd.read_csv(
    '../input/diamonds/diamonds.csv',
    index_col=0,
)

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Peek at the last five rows.
df.tail(n=5)

In [None]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Distinct labels present in the 'cut' column.
df['cut'].unique()

In [None]:
# Distinct labels present in the 'color' column.
df['color'].unique()

In [None]:
# Distinct labels present in the 'clarity' column.
df['clarity'].unique()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Histogram of carat. histplot is the supported replacement for
# distplot(..., kde=False), which seaborn has deprecated.
sns.histplot(data=df, x='carat')
plt.show()

In [None]:
# Histogram of depth. histplot is the supported replacement for
# distplot(..., kde=False), which seaborn has deprecated.
sns.histplot(data=df, x='depth')
plt.show()

In [None]:
# Histogram of table. histplot is the supported replacement for
# distplot(..., kde=False), which seaborn has deprecated.
sns.histplot(data=df, x='table')
plt.show()

In [None]:
# Histogram of price. histplot is the supported replacement for
# distplot(..., kde=False), which seaborn has deprecated.
sns.histplot(data=df, x='price')
plt.show()

In [None]:
# Histogram of the x dimension. histplot is the supported replacement for
# distplot(..., kde=False), which seaborn has deprecated.
sns.histplot(data=df, x='x')
plt.show()

In [None]:
# Histogram of the y dimension. histplot is the supported replacement for
# distplot(..., kde=False), which seaborn has deprecated.
sns.histplot(data=df, x='y')
plt.show()

In [None]:
# Histogram of the z dimension. histplot is the supported replacement for
# distplot(..., kde=False), which seaborn has deprecated.
sns.histplot(data=df, x='z')
plt.show()

In [None]:
# Box plot of carat to expose outliers. The Series is passed as x= because the
# meaning of the first positional argument differs between seaborn releases.
sns.boxplot(x=df['carat'])
plt.show()

In [None]:
# Box plot of depth to expose outliers. The Series is passed as x= because the
# meaning of the first positional argument differs between seaborn releases.
sns.boxplot(x=df['depth'])
plt.show()

In [None]:
# Box plot of table to expose outliers. The Series is passed as x= because the
# meaning of the first positional argument differs between seaborn releases.
sns.boxplot(x=df['table'])
plt.show()

In [None]:
# Box plot of price to expose outliers. The Series is passed as x= because the
# meaning of the first positional argument differs between seaborn releases.
sns.boxplot(x=df['price'])
plt.show()

In [None]:
# Box plot of the x dimension to expose outliers. The Series is passed as x=
# because the meaning of the first positional argument differs between
# seaborn releases.
sns.boxplot(x=df['x'])
plt.show()

In [None]:
# Box plot of the y dimension to expose outliers. The Series is passed as x=
# because the meaning of the first positional argument differs between
# seaborn releases.
sns.boxplot(x=df['y'])
plt.show()

In [None]:
# Box plot of the z dimension to expose outliers. The Series is passed as x=
# because the meaning of the first positional argument differs between
# seaborn releases.
sns.boxplot(x=df['z'])
plt.show()

In [None]:
# Summary statistics for every column (include='all' also covers the
# non-numeric ones); stored in `statistik` and displayed as the cell output.
statistik = df.describe(include='all')
statistik

In [None]:
# How many rows have an x dimension of exactly zero.
(df['x'] == 0).sum()

In [None]:
# How many rows have a y dimension of exactly zero.
(df['y'] == 0).sum()

In [None]:
# How many rows have a z dimension of exactly zero.
(df['z'] == 0).sum()

In [None]:
# Treat zero-valued dimensions as missing and fill each with the mean of that
# column's remaining (non-zero) values.
for dim in ('x', 'y', 'z'):
    without_zeros = df[dim].replace(0, np.nan)
    df[dim] = without_zeros.fillna(without_zeros.mean())

In [None]:
# Recompute the summary statistics on the current state of df and display
# them; this rebinds `statistik`.
statistik = df.describe(include='all')
statistik

In [None]:
# Correlation heatmap of the numeric columns.
# cut/color/clarity are only encoded as numbers in later cells, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so select
# the numeric columns explicitly (works on older pandas as well).
plt.figure(figsize=(7,6))
correlation = df.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True)
plt.show()

In [None]:
# Remove depth and table from the feature set.
# Rebinding with errors='ignore' keeps the cell re-runnable: the in-place drop
# raised KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['depth', 'table'], errors='ignore')
df.head()

In [None]:
# Encode cut as ordinal integer codes (Fair=0 ... Ideal=4).
cut = {'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, 'Ideal':4}
# Only map while the column still holds labels: re-running the cell on the
# already-encoded integers would otherwise turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['cut']):
    cut_codes = df['cut'].map(cut)
    # Series.map yields NaN for labels missing from the dict; fail loudly
    # instead of passing NaN features on to the models.
    unmapped_cut = df.loc[cut_codes.isna(), 'cut'].unique()
    assert len(unmapped_cut) == 0, f'unmapped cut values: {unmapped_cut}'
    df['cut'] = cut_codes

In [None]:
# Encode clarity as ordinal integer codes (I1=0 ... IF=7).
clarity = {'I1':0, 'SI2':1, 'SI1':2, 'VS2':3, 'VS1':4, 'VVS2':5, 'VVS1':6, 'IF':7}
# Only map while the column still holds labels: re-running the cell on the
# already-encoded integers would otherwise turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['clarity']):
    clarity_codes = df['clarity'].map(clarity)
    # Series.map yields NaN for labels missing from the dict; fail loudly
    # instead of passing NaN features on to the models.
    unmapped_clarity = df.loc[clarity_codes.isna(), 'clarity'].unique()
    assert len(unmapped_clarity) == 0, f'unmapped clarity values: {unmapped_clarity}'
    df['clarity'] = clarity_codes

In [None]:
# Encode color as ordinal integer codes (J=0 ... D=6).
color = {'J':0, 'I':1, 'H':2, 'G':3, 'F':4, 'E':5, 'D':6}
# Only map while the column still holds labels: re-running the cell on the
# already-encoded integers would otherwise turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['color']):
    color_codes = df['color'].map(color)
    # Series.map yields NaN for labels missing from the dict; fail loudly
    # instead of passing NaN features on to the models.
    unmapped_color = df.loc[color_codes.isna(), 'color'].unique()
    assert len(unmapped_color) == 0, f'unmapped color values: {unmapped_color}'
    df['color'] = color_codes
df.head()

In [None]:
# Feature matrix: every column except the price target.
X = df.drop(columns=['price'])
X.head()

In [None]:
# Target vector: the price column that the regressors below are fit against.
y = df['price']
y.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column and wrap the result back into a DataFrame
# that keeps X's column names and row index.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_scaled.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Fix random_state so the same rows land in train/test on every run; without
# it every RMSE/R^2 figure and the grid-search winners below change per run.
# NOTE(review): X_scaled was standardized using all rows before this split, so
# test rows contributed to the scaling statistics -- consider fitting the
# scaler on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=42)

In [None]:
# Fit one decision tree per max_depth in 1..20 and record train/test RMSE to
# see where extra depth stops helping the test error.
rmse_train_tree = []
rmse_test_tree = []
max_depth = np.arange(1,21)

for depth in max_depth:
    # random_state pins the tree's internal feature shuffling so the curve is
    # identical from run to run.
    dtr = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dtr.fit(X_train, y_train)
    predict_train_tree = dtr.predict(X_train)
    predict_test_tree = dtr.predict(X_test)

    # sklearn metrics take (y_true, y_pred); keep that order so this matches
    # the r2_score calls used elsewhere in the notebook.
    train = np.sqrt(mean_squared_error(y_train, predict_train_tree))
    test = np.sqrt(mean_squared_error(y_test, predict_test_tree))

    rmse_train_tree.append(train)
    rmse_test_tree.append(test)

plt.scatter(max_depth, rmse_train_tree, label='RMSE train', marker='+')
plt.scatter(max_depth, rmse_test_tree, label='RMSE test', marker='+')
plt.legend()
plt.title('RMSE vs max_depth')
plt.xlabel('max_depth')
plt.ylabel('RMSE')
plt.xticks(np.arange(1,21,step=2))
plt.show()

In [None]:
# Fit one KNN regressor per n_neighbors in 1..50 and record train/test RMSE.
neighbors = np.arange(1,51)
rmse_train_knr = []
rmse_test_knr = []

for k in neighbors:
    knr = KNeighborsRegressor(n_neighbors=k)
    knr.fit(X_train, y_train)

    # Arguments are given as (y_true, y_pred); squared error is symmetric, so
    # the values are the same as with the reversed order.
    mse_train = mean_squared_error(y_train, knr.predict(X_train))
    mse_test = mean_squared_error(y_test, knr.predict(X_test))

    rmse_train_knr.append(np.sqrt(mse_train))
    rmse_test_knr.append(np.sqrt(mse_test))

plt.scatter(neighbors, rmse_train_knr, label='RMSE train', marker='+')
plt.scatter(neighbors, rmse_test_knr, label='RMSE test', marker='+')
plt.title('RMSE vs n_neighbors')
plt.xlabel('n_neighbors')
plt.ylabel('RMSE')
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over tree depth and splitter, scored by negative MSE.
# random_state is fixed on the estimator: the grid includes splitter='random',
# so an unseeded tree would give different CV scores (and possibly a different
# winner) on every run.
tree = DecisionTreeRegressor(random_state=42)
max_depth = np.arange(1,21)
param_grid = [{'max_depth':max_depth, 'splitter':['best', 'random']}]
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(X_train,y_train)

In [None]:
# Best parameter combination from the most recently fitted grid_search
# (the decision-tree search when the cells are run in order).
best_tree_params = grid_search.best_params_
best_tree_params

In [None]:
# Print the cross-validated RMSE for each parameter combination; scores are
# negative MSE, hence the sign flip before the square root.
cvres = grid_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-neg_mse)
    print(rmse, candidate)

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over n_neighbors (1..35) and the weighting scheme,
# scored by negative MSE. This rebinds grid_search, replacing the search
# object from the decision-tree cell.
neighbors = np.arange(1,36)
param_grid = [{'n_neighbors':neighbors, 'weights':['uniform', 'distance']}]
knr = KNeighborsRegressor()
grid_search = GridSearchCV(
    estimator=knr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

grid_search.fit(X_train, y_train)

In [None]:
# Best parameter combination from the most recently fitted grid_search
# (the KNN search when the cells are run in order); reused below to build the
# final model.
best_knr_params = grid_search.best_params_
best_knr_params

In [None]:
# Print the cross-validated RMSE for each parameter combination; scores are
# negative MSE, hence the sign flip before the square root.
cvres = grid_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    rmse = np.sqrt(-neg_mse)
    print(rmse, candidate)

In [None]:
# Refit KNN with the grid-search winner and report RMSE on both splits.
knr = KNeighborsRegressor(**best_knr_params)
knr.fit(X_train, y_train)

predict_train_knr = knr.predict(X_train)
predict_test_knr = knr.predict(X_test)

# Arguments are given as (y_true, y_pred); squared error is symmetric, so the
# values are the same as with the reversed order.
rmse_train = np.sqrt(mean_squared_error(y_train, predict_train_knr))
rmse_test = np.sqrt(mean_squared_error(y_test, predict_test_knr))
print('RMSE train: ', rmse_train)
print('RMSE test: ', rmse_test)

In [None]:
# Coefficient of determination of the final KNN model on both splits.
r2_train = r2_score(y_train, predict_train_knr)
r2_test = r2_score(y_test, predict_test_knr)
print('R-Square train: ', r2_train)
print('R-Square test: ', r2_test)

In [None]:
# Actual test prices side by side with the KNN predictions, indexed like y_test.
summary = y_test.to_frame(name='y_test').assign(knr=predict_test_knr)
summary.head(15)

In [None]:
# Overlay the density of predicted and actual test prices.
# Each curve gets an explicit label and a legend is drawn; otherwise the two
# lines cannot be told apart and the x-axis is named after whichever Series
# was plotted last.
plt.title('Price Prediction Distribution')
sns.kdeplot(summary['knr'], label='knr (predicted)')
sns.kdeplot(summary['y_test'], label='y_test (actual)')
plt.xlabel('price')
plt.legend()
plt.show()