<a href="https://colab.research.google.com/github/prof-rossetti/data-analytics-in-python/blob/main/units/4-predictive-analytics/Diamond_Pricing_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os

from pandas import read_csv

csv_filename = "diamonds.csv"

if not os.path.isfile(csv_filename):
    print("DOWNLOADING CSV...")
    !wget -q https://raw.githubusercontent.com/prof-rossetti/data-analytics-in-python/main/units/4-predictive-analytics/data/diamonds.csv

df = read_csv(csv_filename)
print("ROWS:", len(df))
print("COLS:", df.columns.tolist())
print(df.head())

# looks like:
#  "Diamond" is the row id
#  "Price" is the target variable
#  "Carat", "Color", "Clarity", "Cut" are features



DOWNLOADING CSV...
ROWS: 1225
COLS: ['Diamond', 'Price', 'Carat', 'Color', 'Clarity', 'Cut']
   Diamond  Price  Carat Color Clarity    Cut
0        1   6003   1.82     H      I1   Fair
1        2   3987   0.91     G     SI2  Ideal
2        3   2456   0.74     H     VS2   Good
3        4  15718   1.12     E    VVS1  Ideal
4        5   2526   0.70     D     SI1   Good


In [4]:

for col in df.columns.tolist():
    print("")
    print("------------")
    print(col.upper())
    print("------------")
    print(sorted(list(set(df[col].tolist()))))
    print(df[col].describe())
    #print(df[col].value_counts(normalize=True))

# in terms of describing the variables, looks like:
#   "Price" is continuous integer from 366 to 19,452, with a mean of 4,061
#   "Carat" is continuous decimal from 0.23 to 2.72, with a mean of 0.8
#   "Color" is categorical string, one of: ['D', 'E', 'F', 'G', 'H', 'I', 'J']
#   "Clarity" categorical string, one of: ['I1', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']
#   "Cut" is categorical string, one of: ['D', 'E', 'F', 'G', 'H', 'I', 'J']

# we should obtain or create a data dictionary to describe and explain what all these values mean

# doesn't look like there are any null values in any of the columns. that's good. so we don't need to impute any missing values


------------
COL:  DIAMOND
------------
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 2

In [40]:
import plotly.express as px
from plotly.subplots import make_subplots

carat_fig = px.scatter(df, x='Carat', y='Price', trendline="ols")
color_fig = px.histogram(df, x='Color', y='Price', histfunc="avg") #color_discrete_sequence=px.colors.colorbrewer.Pastel1
clarity_fig = px.histogram(df, x='Clarity', y='Price', histfunc="avg")
cut_fig = px.histogram(df, x='Cut', y='Price', histfunc="avg", category_orders={"Cut":["Fair","Good","Very Good", "Premium", "Ideal"]})

carat_fig.show()
color_fig.show()
clarity_fig.show()
cut_fig.show()

#fig = make_subplots(rows=4, cols=1, shared_xaxes=False)
#fig.add_trace(carat_fig['data'][0], row=1, col=1)
#fig.add_trace(color_fig['data'][0], row=2, col=1)
#fig.add_trace(clarity_fig['data'][0], row=3, col=1)
#fig.add_trace(cut_fig['data'][0],   row=4, col=1)
#fig.update_layout(height=1000, width=800, title_text="Relationship Between Features and Price")
#fig.show()




### "One-Hot" Encoding 

Before:

| cut |
|---|
 fair |
 good     |      
 very good |        
 premium   |     
 ideal |
 ideal   |


After:

cut_fair | cut_good | cut_very_good | cut_premium | cut_ideal
----|---|---|---|---
1|0|0|0|0
0|1|0|0|0
0|0|1|0|0
0|0|0|1|0
0|0|0|0|1
0|0|0|0|1



In [59]:
#!pip install category_encoders #==2.*

#from sklearn.preprocessing import OneHotEncoder
from category_encoders import  OneHotEncoder

# ENCODING
#
# we have categorical data but our models need numerical data
# so we're going to do a trick called encoding, where we convert categories to numbers
# specifically the "one-hot" encoder, which converts the original feature column to many columns (see above)
# docs: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder

example_encoder = OneHotEncoder(use_cat_names=True)

# for example
cut_encoded = example_encoder.fit_transform(df["Cut"])
print(type(cut_encoded))

print(cut_encoded)






<class 'pandas.core.frame.DataFrame'>
      Cut_Fair  Cut_Ideal  Cut_Good  Cut_Premium  Cut_Very Good
0            1          0         0            0              0
1            0          1         0            0              0
2            0          0         1            0              0
3            0          1         0            0              0
4            0          0         1            0              0
...        ...        ...       ...          ...            ...
1220         0          1         0            0              0
1221         0          0         0            0              1
1222         0          0         0            0              1
1223         0          0         1            0              0
1224         0          0         1            0              0

[1225 rows x 5 columns]



is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [64]:

from pprint import pprint

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#
# TEST/TRAIN SPLIT
#

df_train, df_test = train_test_split(df, test_size=0.2, random_state=99)
print("TEST TRAIN SPLIT:", len(df_train), len(df_test))

features = ["Carat","Clarity", "Color", "Cut"]
target = "Price"

x_train = df_train[features] 
y_train = df_train[target] 
print("TRAIN:", x_train.shape, y_train.shape)
print("FEATURES", x_train.columns.to_list())

x_test = df_test[features] 
y_test = df_test[target] 
#print("TEST:", x_test.shape, y_test.shape)
#print("FEATURES", x_test.columns.to_list())

#
# ENCODING
#

encoder = OneHotEncoder(use_cat_names=True)

x_train = encoder.fit_transform(x_train) # encodes all categorical vars!
print("TRAIN (ENCODED):", x_train.shape, y_train.shape)
print("FEATURES (ENCODED)", x_train.columns.to_list())

x_test = encoder.fit_transform(x_test) # need to encode test data the exact same way!
#print("TEST (ENCODED):", x_test.shape, y_test.shape) 
#print("FEATURES (ENCODED)", x_test.columns.to_list())


TEST TRAIN SPLIT: 980 245
TRAIN: (980, 4) (980,)
FEATURES ['Carat', 'Clarity', 'Color', 'Cut']
TRAIN (ENCODED): (980, 20) (980,)
FEATURES (ENCODED) ['Carat', 'Clarity_VS1', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_SI1', 'Clarity_VVS2', 'Clarity_VVS1', 'Clarity_I1', 'Color_G', 'Color_F', 'Color_E', 'Color_I', 'Color_H', 'Color_D', 'Color_J', 'Cut_Ideal', 'Cut_Good', 'Cut_Premium', 'Cut_Very Good', 'Cut_Fair']



is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



In [65]:

#
# MODEL TRAINING
#

model = LinearRegression()
model.fit(x_train, y_train) 

print("COEF", model.coef_) #> [0.40214657]
print("INTERCEPT", model.intercept_) #> 65.0282004078413

#
# MODEL EVALUATION
#

print("------------------")
print("TRAINING SCORES...")
print("------------------")
y_train_pred = model.predict(x_train)
print("R^2 SCORE:", r2_score(y_train, y_train_pred))
print("MEAN ABS ERR:", mean_absolute_error(y_train, y_train_pred))
print("MEAN SQ ERR:", mean_squared_error(y_train, y_train_pred))

print("------------------")
print("TESTING SCORES...")
print("------------------")
y_test_pred = model.predict(x_test)
print("R^2 SCORE:", r2_score(y_test, y_test_pred))
print("MEAN ABS ERR:", mean_absolute_error(y_test, y_test_pred))
print("MEAN SQ ERR:", mean_squared_error(y_test, y_test_pred))


COEF [ 9.20693653e+03 -2.20491126e+16 -2.20491126e+16 -2.20491126e+16
 -2.20491126e+16 -2.20491126e+16 -2.20491126e+16 -2.20491126e+16
  4.76898348e+16  4.76898348e+16  4.76898348e+16  4.76898348e+16
  4.76898348e+16  4.76898348e+16  4.76898348e+16  3.94117931e+16
  3.94117931e+16  3.94117931e+16  3.94117931e+16  3.94117931e+16]
INTERCEPT -6.5052515177065e+16
------------------
TRAINING SCORES...
------------------
R^2 SCORE: 0.9101900377677428
MEAN ABS ERR: 878.0683673469388
MEAN SQ ERR: 1545710.3091836735
------------------
TESTING SCORES...
------------------
R^2 SCORE: 0.6859636609053118
MEAN ABS ERR: 1833.2163265306122
MEAN SQ ERR: 5034014.375510204


In [67]:

print("------------------")
print("MODEL PREDICTIONS...")
print("------------------")


pred = model.predict([x_test.iloc[0]])
print(pred) 

preds = model.predict(x_test.iloc[0:3])
print(preds) 



------------------
MODEL PREDICTIONS...
------------------
[16368.]
[16368. -1528.  7192.]


In [89]:



from sklearn.feature_selection import SelectKBest, f_regression

selector = SelectKBest(score_func=f_regression, k=15) 
# use fit_transform() on train and transform() on test
x_train_selected = selector.fit_transform(x_train, y_train)
x_test_selected = selector.transform(x_test)

features = x_train.columns
print(type(features), features)
selections = selector.get_support()
print(selections)
print(features[selections])
print(features[~selections]) # this is a tilde

<class 'pandas.core.indexes.base.Index'> Index(['Carat', 'Clarity_VS1', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_SI1',
       'Clarity_VVS2', 'Clarity_VVS1', 'Clarity_I1', 'Color_G', 'Color_F',
       'Color_E', 'Color_I', 'Color_H', 'Color_D', 'Color_J', 'Cut_Ideal',
       'Cut_Good', 'Cut_Premium', 'Cut_Very Good', 'Cut_Fair'],
      dtype='object')
[ True  True  True  True False  True  True  True False False  True  True
 False  True  True  True False  True  True  True]
Index(['Carat', 'Clarity_VS1', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_VVS2',
       'Clarity_VVS1', 'Clarity_I1', 'Color_E', 'Color_I', 'Color_D',
       'Color_J', 'Cut_Ideal', 'Cut_Premium', 'Cut_Very Good', 'Cut_Fair'],
      dtype='object')
Index(['Clarity_SI1', 'Color_G', 'Color_F', 'Color_H', 'Cut_Good'], dtype='object')


In [90]:

#
# MODEL TRAINING
#

model = LinearRegression()
model.fit(x_train_selected, y_train) 

print("COEF", model.coef_) #> [0.40214657]
print("INTERCEPT", model.intercept_) #> 65.0282004078413

#
# MODEL EVALUATION
#

print("------------------")
print("TRAINING SCORES...")
print("------------------")
y_train_pred = model.predict(x_train_selected)
print("R^2 SCORE:", r2_score(y_train, y_train_pred))
print("MEAN ABS ERR:", mean_absolute_error(y_train, y_train_pred))
print("MEAN SQ ERR:", mean_squared_error(y_train, y_train_pred))

print("------------------")
print("TESTING SCORES...")
print("------------------")
y_test_pred = model.predict(x_test_selected)
print("R^2 SCORE:", r2_score(y_test, y_test_pred))
print("MEAN ABS ERR:", mean_absolute_error(y_test, y_test_pred))
print("MEAN SQ ERR:", mean_squared_error(y_test, y_test_pred))


COEF [ 9158.72616725  1211.41393447 -1142.68865082   727.61099433
  1495.56951737  1802.48678126 -4371.57685603   469.39944449
  -954.81175808   587.15008726 -1813.9682589    345.42650406
   205.12218284   106.90674658  -839.46845318]
INTERCEPT -3811.162686612234
------------------
TRAINING SCORES...
------------------
R^2 SCORE: 0.9061104511003697
MEAN ABS ERR: 883.5329874088328
MEAN SQ ERR: 1615923.6687290121
------------------
TESTING SCORES...
------------------
R^2 SCORE: 0.6960950270784307
MEAN ABS ERR: 1812.7149403047804
MEAN SQ ERR: 4871608.193136321


In [98]:
for k in range(1, len(x_train.columns)+1):    
    selector = SelectKBest(score_func=f_regression, k=k)
    x_train_selected = selector.fit_transform(x_train, y_train)
    x_test_selected = selector.transform(x_test)

    selections = selector.get_support()
    print("-----------------------")
    print(f"FEATURES ({k}):")
    print(features[selections].tolist())

    # TRAINING
    model = LinearRegression()
    model.fit(x_train_selected, y_train)

    # EVALUATION
    y_pred = model.predict(x_test_selected)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"TEST MAE: ${mae:,.0f}")

-----------------------
FEATURES (1):
['Carat']
TEST MAE: $1,057
-----------------------
FEATURES (2):
['Carat', 'Clarity_SI2']
TEST MAE: $1,282
-----------------------
FEATURES (3):
['Carat', 'Clarity_SI2', 'Clarity_VVS1']
TEST MAE: $1,407
-----------------------
FEATURES (4):
['Carat', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_VVS1']
TEST MAE: $1,395
-----------------------
FEATURES (5):
['Carat', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_VVS1', 'Color_E']
TEST MAE: $1,401
-----------------------
FEATURES (6):
['Carat', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_VVS1', 'Color_E', 'Color_J']
TEST MAE: $1,367
-----------------------
FEATURES (7):
['Carat', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_VVS2', 'Clarity_VVS1', 'Color_E', 'Color_J']
TEST MAE: $1,540
-----------------------
FEATURES (8):
['Carat', 'Clarity_SI2', 'Clarity_VS2', 'Clarity_VVS2', 'Clarity_VVS1', 'Color_E', 'Color_J', 'Cut_Premium']
TEST MAE: $1,550
-----------------------
FEATURES (9):
['Carat', 'Clarity_SI2', 'Clarity_VS2', 'C