In [1]:
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
diamonds = sns.load_dataset('diamonds')

In [3]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat      53940 non-null float64
cut        53940 non-null object
color      53940 non-null object
clarity    53940 non-null object
depth      53940 non-null float64
table      53940 non-null float64
price      53940 non-null int64
x          53940 non-null float64
y          53940 non-null float64
z          53940 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [4]:
diamonds['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [10]:
diamonds['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [11]:
diamonds['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [12]:
# Feature matrix: the three categorical columns come first (positions 0-2),
# followed by the numeric measurements.
feature_columns = ['cut', 'color', 'clarity', 'carat', 'depth', 'table', 'x', 'y', 'z']
X = diamonds[feature_columns].values
# Regression target.
y = diamonds['price'].values

In [13]:
X.shape

(53940, 9)

In [14]:
y.shape

(53940,)

In [15]:
X

array([['Ideal', 'E', 'SI2', ..., 3.95, 3.98, 2.43],
       ['Premium', 'E', 'SI1', ..., 3.89, 3.84, 2.31],
       ['Good', 'E', 'VS1', ..., 4.05, 4.07, 2.31],
       ...,
       ['Very Good', 'D', 'SI1', ..., 5.66, 5.68, 3.56],
       ['Premium', 'H', 'SI2', ..., 6.15, 6.12, 3.74],
       ['Ideal', 'D', 'SI2', ..., 5.83, 5.87, 3.64]], dtype=object)

In [16]:
# Replace the string categories in columns 0-2 (cut, color, clarity) with
# integer codes, writing the result back into X in place.
# The same encoder is re-fitted for every column, so after the loop it only
# holds the mapping of the last column it saw.
lblEncoder = LabelEncoder()
for col_idx in (0, 1, 2):
    X[:, col_idx] = lblEncoder.fit_transform(X[:, col_idx])

In [17]:
X

array([[2, 1, 3, ..., 3.95, 3.98, 2.43],
       [3, 1, 2, ..., 3.89, 3.84, 2.31],
       [1, 1, 4, ..., 4.05, 4.07, 2.31],
       ...,
       [4, 0, 2, ..., 5.66, 5.68, 3.56],
       [3, 4, 3, ..., 6.15, 6.12, 3.74],
       [2, 0, 3, ..., 5.83, 5.87, 3.64]], dtype=object)

In [18]:
X.shape

(53940, 9)

In [19]:
# One-hot encode the three categorical columns (positions 0-2: cut, color,
# clarity) and pass the remaining numeric columns through unchanged.
#
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and later removed, so selecting the columns is delegated
# to ColumnTransformer, the documented replacement. The encoded columns are
# emitted first and the passthrough columns after them.
# `categories='auto'` requests the non-legacy handling of integer-coded input.
ohEncoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(categories='auto'), [0, 1, 2])],
    remainder='passthrough',
)
# X is an object-dtype array at this point; cast the combined result to float
# so the model receives a purely numeric matrix.
X = ohEncoder.fit_transform(X).astype(float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [20]:
X.shape

(53940, 26)

In [21]:
# Keep 90% of the rows for training and hold out the rest for evaluation;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    random_state=100,
)



In [22]:
# Fit a linear regression on the training split. The fit call is left as the
# cell's last expression so the fitted estimator's repr is displayed.
regr = LinearRegression()
regr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [23]:
regr.coef_

array([ 8.07441111e+01,  6.66104166e+02,  9.20315582e+02,  8.42934636e+02,
        8.07369288e+02,  1.29708080e+03,  1.08812633e+03,  1.03553038e+03,
        8.16248289e+02,  3.16574703e+02, -1.66804650e+02, -1.06928808e+03,
       -3.41294640e+03,  1.95134418e+03,  2.66100066e+02, -6.97323297e+02,
        1.18060619e+03,  8.65099494e+02,  1.61463444e+03,  1.54995311e+03,
        1.12473420e+04, -6.13122732e+01, -2.49860743e+01, -9.88486712e+02,
        9.80851196e+00, -7.45055124e+01])

In [24]:
regr.intercept_

3942.0057059283977

In [25]:
# Predict prices for the held-out rows.
y_pred = regr.predict(X=X_test)

In [26]:
y_test.shape

(5394,)

In [27]:
y_pred.shape

(5394,)

In [28]:
# Mean squared error on the held-out rows (in squared price units).
mean_squared_error(y_true=y_test, y_pred=y_pred)

1176142.0110231712

In [29]:
# Coefficient of determination (R^2) on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.9231608802974228