In [23]:
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle

In [24]:
# Load the training and test CSVs from the local ./data directory.
df = pd.read_csv(filepath_or_buffer='./data/train.csv')
test = pd.read_csv(filepath_or_buffer='./data/test.csv')

In [25]:
# Drop the first column of the raw training file.
# The frame is rebuilt from disk rather than sliced from the existing `df`,
# so re-running this cell always yields the same columns instead of
# stripping one more column on every execution.
df = pd.read_csv('./data/train.csv').iloc[:, 1:]
df.head()

Unnamed: 0,carat,cut,cut_ord,color,clarity,clarity_ord,price
0,0.51,Premium,4,F,VS1,4,1749
1,2.25,Fair,1,G,I1,1,7069
2,0.7,Very Good,3,E,VS2,5,2757
3,0.47,Good,2,F,VS1,4,1243
4,0.3,Ideal,5,G,VVS1,7,789


In [26]:
# Print column names, dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   carat        50000 non-null  float64
 1   cut          50000 non-null  object 
 2   cut_ord      50000 non-null  int64  
 3   color        50000 non-null  object 
 4   clarity      50000 non-null  object 
 5   clarity_ord  50000 non-null  int64  
 6   price        50000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 2.7+ MB


In [27]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,carat,cut_ord,clarity_ord,price
count,50000.0,50000.0,50000.0,50000.0
mean,0.798597,3.90398,4.1267,3939.1035
std,0.474651,1.117043,1.665564,3995.879832
min,0.2,1.0,1.0,326.0
25%,0.4,3.0,3.0,948.0
50%,0.7,4.0,4.0,2402.5
75%,1.04,5.0,5.0,5331.0
max,5.01,5.0,8.0,18823.0


In [28]:
# Scatter of price against carat, one trace per `color` value.
# `category_orders` pins the color categories to D..J order; a title is set
# so the figure is self-explanatory when the notebook is skimmed.
fig1 = px.scatter(
    df,
    x='carat',
    y='price',
    color='color',
    title='Price vs. carat, by color',
    template='plotly_dark',
    category_orders={'color':['D','E','F','G','H','I','J']},
    opacity=0.5)
fig1.show()

In [29]:
# Scatter of price against the ordinal cut code, one trace per `color` value,
# with marker size scaled by carat.
# `category_orders` pins the color categories to D..J order; a title is set
# so the figure is self-explanatory when the notebook is skimmed.
fig2 = px.scatter(
    df,
    x='cut_ord',
    y='price',
    color='color',
    title='Price vs. cut (ordinal), by color; marker size = carat',
    template='plotly_dark',
    size='carat',
    category_orders={'color':['D','E','F','G','H','I','J']},
    opacity=0.5)
fig2.show()

In [30]:
# Scatter of price against the ordinal clarity code, one trace per `color`
# value, with marker size scaled by carat.
# `category_orders` pins the color categories to D..J order; a title is set
# so the figure is self-explanatory when the notebook is skimmed.
fig3 = px.scatter(
    df,
    x='clarity_ord',
    y='price',
    color='color',
    title='Price vs. clarity (ordinal), by color; marker size = carat',
    size='carat',
    template='plotly_dark',
    category_orders={'color':['D','E','F','G','H','I','J']},
    opacity=0.5)
fig3.show()

In [31]:
# Pearson correlation of every numeric column with `price`.
# `df` also holds object columns (cut, color, clarity); restricting to numeric
# dtypes explicitly keeps this working on pandas >= 2.0, where `corr()` no
# longer drops non-numeric columns silently and raises instead.
df.select_dtypes(include='number').corr()['price']

carat          0.921777
cut_ord       -0.053804
clarity_ord   -0.142159
price          1.000000
Name: price, dtype: float64

In [32]:
# Features: carat plus the two ordinal encodings; target: price.
X = df.loc[:, ['carat', 'cut_ord', 'clarity_ord']]
y = df.loc[:, 'price']

# Fixed seed so the split -- and therefore the RMSE, the coefficients and the
# pickled model -- are reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Root-mean-squared error on the held-out split (same units as `price`).
y_pred = model.predict(X_test)
score = sqrt(mean_squared_error(y_test, y_pred))
print(score)

1333.3959953933474


In [33]:
# R^2 of the fitted model on the *training* split (X_train, y_train);
# this is not a held-out score.
model.score(X_train, y_train)

0.8860569004767882

In [34]:
# Show the estimator's constructor parameters (all defaults here, since
# LinearRegression() was created without arguments).
model.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [36]:
# Fitted coefficients, in the feature order used for X:
# ['carat', 'cut_ord', 'clarity_ord'].
model.coef_

array([8370.60302391,  160.27214751,  458.89486464])

In [37]:
# Fitted intercept term of the linear model.
model.intercept_

-5259.934983236607

In [35]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open('diamond_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)