In [2]:
import os
import pandas as pd
from scipy.stats import sigmaclip
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [3]:
df = pd.read_csv("data/diamonds.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
df.shape

(53940, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


# Prepare

In [6]:
NUMBER_COLS = [
    "carat",
    "depth",
    "table",
    "price",
    "x",
    "y",
    "z",
]
CATEGORY_COLS = [
    "cut",
    "color",
    "clarity",
]
CATEGORY_ENCODED_COLS = [
]

In [7]:
def format_data(data, n_c, c_c):
    for col in n_c:
        _, l, u = sigmaclip(data[col])
        vals_to_drop = data[col][(data[col] < l) | (data[col] > u)]
        data.drop(vals_to_drop.index)

    data[n_c] = MinMaxScaler().fit_transform(data[n_c])

    data[c_c] = data[c_c].apply(LabelEncoder().fit_transform)

    return data

In [8]:
x = format_data(df, NUMBER_COLS, CATEGORY_COLS)
x

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.006237,2,1,3,0.513889,0.230769,0.000000,0.367784,0.067572,0.076415
1,0.002079,3,1,2,0.466667,0.346154,0.000000,0.362197,0.065195,0.072642
2,0.006237,1,1,4,0.386111,0.423077,0.000054,0.377095,0.069100,0.072642
3,0.018711,3,5,5,0.538889,0.288462,0.000433,0.391061,0.071817,0.082704
4,0.022869,1,6,3,0.563889,0.288462,0.000487,0.404097,0.073854,0.086478
...,...,...,...,...,...,...,...,...,...,...
53935,0.108108,2,0,2,0.494444,0.269231,0.131427,0.535382,0.097793,0.110063
53936,0.108108,1,0,2,0.558333,0.230769,0.131427,0.529795,0.097623,0.113522
53937,0.103950,4,0,2,0.550000,0.326923,0.131427,0.527002,0.096435,0.111950
53938,0.137214,3,4,3,0.500000,0.288462,0.131427,0.572626,0.103905,0.117610


# Another prepare

In [13]:
import pandas as pd
df = pd.read_csv("data/diamonds.csv")
from sklearn.preprocessing import LabelEncoder
CATEGORY_COLS = [
    "cut",
    "color",
    "clarity",
]
df[CATEGORY_COLS] = df[CATEGORY_COLS].apply(LabelEncoder().fit_transform)

# Train

In [14]:
x = df.drop(["price"], axis=1)
y = df["price"]

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LinearRegression": LinearRegression(),
}
models = [
    KNeighborsRegressor(),
    LinearRegression(),
    RandomForestRegressor(),
]
trained_models = []
for m in models:
    print(f"model: {type(m).__name__}")
    trained_models.append(m.fit(x_train, y_train))

    y_train_pred = m.predict(x_train)
    y_test_pred = m.predict(x_test)

    r2_train = m.score(x_train, y_train)
    r2_test = m.score(x_test, y_test)

    print(f"model: {type(m).__name__}")
    print(f"r2 train: {r2_train}")
    print(f"r2 test: {r2_test}")

model: KNeighborsClassifier
model: KNeighborsClassifier
r2 train: 0.1983155887494041
r2 test: 0.022308738104066247
model: KNeighborsRegressor
model: KNeighborsRegressor
r2 train: 0.9645578632369484
r2 test: 0.9462747939913778
model: RandomForestRegressor
model: RandomForestRegressor
r2 train: 0.9972350298933147
r2 test: 0.9812150823304108
model: LinearRegression
model: LinearRegression
r2 train: 0.8845117910043304
r2 test: 0.8863641648837631
