In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score 

In [23]:
diamonds = sns.load_dataset('diamonds')

In [24]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat      53940 non-null float64
cut        53940 non-null object
color      53940 non-null object
clarity    53940 non-null object
depth      53940 non-null float64
table      53940 non-null float64
price      53940 non-null int64
x          53940 non-null float64
y          53940 non-null float64
z          53940 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [25]:
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [26]:
X = diamonds[['cut', 'color', 'clarity', 'carat', 'depth', 'table', 'x', 'y', 'z']].values
y = diamonds['price'].values

In [28]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [29]:
lblEncoder = LabelEncoder()
X[:, 0] = lblEncoder.fit_transform(X[:, 0])
X[:, 1] = lblEncoder.fit_transform(X[:, 1])
X[:, 2] = lblEncoder.fit_transform(X[:, 2])

In [30]:
ohEncoder = OneHotEncoder(categorical_features=[0,1,2])
X = ohEncoder.fit_transform(X)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=100)



---

### Linear Regression

In [32]:
regr = LinearRegression()
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [33]:
y_pred = regr.predict(X_test)

In [34]:
y_pred

array([2551.01342429, 7844.96941731, 1176.2342957 , ..., 3403.42697576,
       9864.87910212, 4754.59811181])

In [35]:
mean_squared_error(y_test, y_pred)

1176141.8381122155

In [36]:
r2_score(y_test, y_pred)

0.9231608915939546

---

### Lasso

In [37]:
lasso = Lasso(alpha = 1.0)
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [38]:
y_pred = lasso.predict(X_test)

In [39]:
y_pred

array([2556.60121156, 7840.40668833, 1180.26849596, ..., 3402.85090339,
       9845.86846222, 4768.25428174])

In [40]:
mean_squared_error(y_test, y_pred)

1176306.3013607431

In [41]:
r2_score(y_test, y_pred)

0.9231501469635258

---

### Ridge

In [42]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [43]:
y_pred = lasso.predict(X_test)

In [44]:
y_pred

array([2556.60121156, 7840.40668833, 1180.26849596, ..., 3402.85090339,
       9845.86846222, 4768.25428174])

In [45]:
mean_squared_error(y_test, y_pred)

1176306.3013607431

In [46]:
r2_score(y_test, y_pred)

0.9231501469635258