In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from pathlib import Path
%matplotlib inline 
sns.set(color_codes=True)
enc = LabelEncoder()
model = LinearRegression()



In [None]:
# Load the diamonds dataset.
# The location is relative to the notebook's working directory; edit DATA_PATH
# (rather than changing directory) if the CSV lives somewhere else.
DATA_PATH = Path("../input/diamonds/diamonds.csv")
df = pd.read_csv(DATA_PATH)

# Peek at the first rows to confirm the file parsed as expected.
df.head(5)

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
# (`info()` prints its report directly rather than returning a value.)
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# used here to check the value ranges of x, y and z in particular.
df.describe()

In [None]:
# Convert the categorical columns to integer codes and keep only the modelling columns.
#
# This cell rebinds `df` to a new frame instead of mutating it in place, so it can be
# re-run safely: the previous version dropped columns with inplace=True and raised a
# KeyError ("Unnamed: 0" not found) on a second execution.
#
# NOTE: `cat.codes` numbers the categories in sorted label order, so the integers do
# not encode any quality ranking of cut / color / clarity.
CATEGORICAL_COLS = ["cut", "color", "clarity"]

# Final column order: features first, target (price) last.
# Every other column (the CSV index column, depth, table, x, y, z) is discarded.
MODEL_COLS = ["carat", "cut", "color", "clarity", "price"]

encoded_cols = {
    col: df[col].astype("category").cat.codes
    for col in CATEGORICAL_COLS
}
df = df.assign(**encoded_cols)[MODEL_COLS]

df.head(5)

In [None]:
# One histogram per column, drawn on a single large figure.
df.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Pairwise scatter plots of every column pair; the diagonal histograms use 10 bins.
sns.pairplot(data=df, diag_kws=dict(bins=10))

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(20, 10))
c = df.corr()
sns.heatmap(data=c, annot=True, cmap="BrBG", ax=ax)

In [None]:
# Feature matrix: the four predictor columns, selected by name.
feature_cols = ["carat", "cut", "color", "clarity"]
x = df.loc[:, feature_cols]
x.head(5)

In [None]:
# Target: the price column, selected by name rather than by position so the
# selection does not silently change if the column order of `df` ever does.
# The double brackets keep `y` a one-column DataFrame, the same shape the
# positional slice produced.
y = df[["price"]]
y.head(5)

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Shapes of the four partitions, in the order they were unpacked above.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# 10-fold cross-validation of the (still unfitted) model on the training split only.
# No `scoring` is passed, so each fold is scored with the estimator's own `score`
# method, which is R^2 for LinearRegression. The name `accuracies` is kept for
# compatibility, but the values are R^2 scores, not classification accuracies.
kfold = KFold(n_splits=10, random_state=66, shuffle=True)
accuracies = cross_val_score(estimator=model, X=x_train, y=y_train, cv=kfold)

# Report the result; previously the scores were computed but never inspected.
print("CV R2 per fold =", np.round(accuracies, 3))
print("CV R2 mean =", round(accuracies.mean(), 3), "| std =", round(accuracies.std(), 3))

In [None]:
# Train the linear regression on the training split.
model.fit(X=x_train, y=y_train)

In [None]:
# Predictions for the held-out rows and for the rows the model was trained on.
y_test_pred = model.predict(x_test)
y_train_pred = model.predict(x_train)

# R^2 of the fitted model on the test split.
model.score(X=x_test, y=y_test)

In [None]:
# Regression metrics on the test split (these are error / goodness-of-fit
# measures, not classification accuracy). Each entry pairs the printed label
# with the sklearn.metrics function that computes it.
regression_metrics = [
    ("Mean absolute error", sm.mean_absolute_error),
    ("Mean squared error", sm.mean_squared_error),
    ("Median absolute error", sm.median_absolute_error),
    ("Explained variance score", sm.explained_variance_score),
    ("R2 score", sm.r2_score),
]

for label, metric_fn in regression_metrics:
    # Every metric takes (y_true, y_pred); values are rounded to 2 decimals for display.
    print(f"{label} =", round(metric_fn(y_test, y_test_pred), 2))

In [None]:
# Compare the distribution of predicted prices with the actual prices on the test split.
#
# `sns.distplot` is deprecated, so the density curves are drawn with `sns.kdeplot`.
# np.ravel flattens the inputs to 1-D so this works whether the target/predictions
# are a one-column frame, a (n, 1) array or already 1-D.
fig, ax = plt.subplots()
sns.kdeplot(np.ravel(y_test_pred), color="g", label="Predictions", ax=ax)
sns.kdeplot(np.ravel(y_test), color="r", label="actual", ax=ax)

ax.set(
    xlabel="Price",
    ylabel="Density",
    title="Predicted vs. actual price distribution (test set)",
)
# Without an explicit legend the `label=` values above are never shown.
ax.legend()
plt.show()