In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [2]:
# Load the diamonds table from disk.
data = pd.read_csv('./data/diamonds.csv')
# Work on a deep copy so `data` keeps the values exactly as they were read.
data_base = data.copy(deep=True)
data_base.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
1,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
2,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
4,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48


In [3]:
# Distinct clarity labels, in order of first appearance in the data.
data_base['clarity'].unique()

array(['SI1', 'VS1', 'VS2', 'SI2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [4]:
# Cut labels; a label's position in this list is what gets used as its
# numeric rank when the categorical columns are encoded further down.
list_cut = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
list_cut.index('Good')  # list.index returns the position of the first match

1

In [5]:
def create_new_columns_log(dataframe, k):
    '''
    Add natural-log feature columns to ``dataframe`` and return it.

    The frame is modified IN PLACE and the very same object is returned;
    other cells of this notebook rely on that, so it is kept on purpose.

    Columns added, in this order:
        a. ``cut_log``, ``color_log``, ``clarity_log``: each category is
           replaced by its 1-based position in the ordered lists below and
           then logged. Starting at 1 (not 0) avoids ``log(0)``.
        b. ``price_log``: only when ``k == 1``.
        c. ``carat_log``, ``depth_log``, ``table_log``: plain ``log(value)``.
        d. ``x_log``, ``y_log``, ``z_log``: ``log(value + 1)``, because these
           columns may contain 0.

    All logs are natural logs (``np.log``), not base 10.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``carat, cut, color, clarity, depth, table, x, y, z``
        (and ``price`` when ``k == 1``).
    k : int
        ``1`` to also create ``price_log``; any other value skips it
        (use it for data that has no ``price`` column).

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the new columns.

    Raises
    ------
    ValueError
        If ``cut``, ``color`` or ``clarity`` holds a value (or a missing
        value) that is not in the ordered lists. Raised before any column
        is written, so the frame is never left half-updated.
    '''
    # The position in each list defines the rank used for the encoding.
    ordered_levels = {
        'cut': ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
        'color': ['J', 'I', 'H', 'G', 'F', 'E', 'D'],
        'clarity': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],
    }

    def log_rank(column):
        # Natural log of the 1-based rank of every value in `column`.
        rank_of = {
            level: position
            for position, level in enumerate(ordered_levels[column], start=1)
        }
        ranks = dataframe[column].map(rank_of)  # vectorised lookup; unknown -> NaN
        unknown = ranks.isna()
        if unknown.any():
            bad_values = dataframe.loc[unknown, column].unique().tolist()
            raise ValueError(
                f"unexpected value(s) in column '{column}': {bad_values}"
            )
        return np.log(ranks.astype(float))

    # Encode (and validate) all categorical columns before touching the frame.
    encoded = {f'{column}_log': log_rank(column) for column in ordered_levels}
    for new_name, values in encoded.items():
        dataframe[new_name] = values

    if k == 1:
        dataframe['price_log'] = np.log(dataframe['price'])

    for column in ('carat', 'depth', 'table'):
        dataframe[f'{column}_log'] = np.log(dataframe[column])

    # x, y, z can be 0, hence the +1 shift before taking the log.
    for column in ('x', 'y', 'z'):
        dataframe[f'{column}_log'] = np.log(dataframe[column] + 1)

    return dataframe

In [6]:
# Add the log-feature columns (k=1 also creates `price_log`). The function
# returns the frame it was given, so `diamonds` and `data_base` are one object.
diamonds = create_new_columns_log(dataframe=data_base, k=1)
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_log,color_log,clarity_log,price_log,carat_log,depth_log,table_log,x_log,y_log,z_log
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1.386294,1.791759,1.098612,5.786897,-1.560648,4.091006,4.110874,1.587192,1.576915,1.196948
1,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.693147,1.791759,1.609438,5.78996,-1.469676,4.041295,4.174387,1.619388,1.623341,1.196948
2,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1.386294,0.693147,1.386294,5.811141,-1.237874,4.133565,4.060443,1.648659,1.654411,1.289233
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.693147,0.0,0.693147,5.814131,-1.171183,4.147885,4.060443,1.675226,1.677097,1.321756
4,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48,1.098612,0.0,1.791759,5.817111,-1.427116,4.139955,4.043051,1.597365,1.601406,1.247032


In [7]:
# Build one regression model per clarity subgroup

In [8]:
# Clarity labels in a fixed order; a label's position here selects the
# matching dataframe and model in the lists built below.
list_clarity = [
    'I1', 'SI2', 'SI1', 'VS2',
    'VS1', 'VVS2', 'VVS1', 'IF',
]

In [9]:
# One dataframe per clarity label, in the same order as `list_clarity`.
list_dfs = [diamonds[diamonds['clarity'] == clarity] for clarity in list_clarity]

# Sanity check: the number of dataframes matches the number of clarity labels.
(len(list_dfs), len(list_clarity))

(8, 8)

In [10]:
# First element of the list: the dataframe whose `clarity` column is 'I1'
# ('I1' is the first entry of `list_clarity`).
list_dfs[0]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_log,color_log,clarity_log,price_log,carat_log,depth_log,table_log,x_log,y_log,z_log
13,0.32,Premium,E,I1,60.9,58.0,345,4.38,4.42,2.68,1.386294,1.791759,0.0,5.843544,-1.139434,4.109233,4.060443,1.682688,1.690096,1.302913
162,1.17,Very Good,J,I1,60.2,61.0,2774,6.83,6.90,4.13,1.098612,0.000000,0.0,7.928046,0.157004,4.097672,4.110874,2.057963,2.066863,1.635106
203,1.01,Premium,F,I1,61.8,60.0,2781,6.39,6.36,3.94,1.386294,1.609438,0.0,7.930566,0.009950,4.123903,4.094345,2.000128,1.996060,1.597365
227,1.01,Fair,E,I1,64.5,58.0,2788,6.29,6.21,4.03,0.000000,1.791759,0.0,7.933080,0.009950,4.166665,4.060443,1.986504,1.975469,1.615420
294,0.96,Ideal,F,I1,60.7,55.0,2801,6.37,6.41,3.88,1.609438,1.609438,0.0,7.937732,-0.040822,4.105944,4.007333,1.997418,2.002830,1.585145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48625,1.22,Premium,G,I1,59.2,60.0,2699,6.97,6.90,4.10,1.386294,1.386294,0.0,7.900637,0.198851,4.080922,4.094345,2.075684,2.066863,1.629241
48671,1.05,Very Good,J,I1,59.6,63.0,2705,6.61,6.55,3.92,1.098612,0.000000,0.0,7.902857,0.048790,4.087656,4.143135,2.029463,2.021548,1.593309
48761,1.00,Fair,G,I1,61.7,57.0,2723,6.39,6.28,3.91,0.000000,1.386294,0.0,7.909489,0.000000,4.122284,4.043051,2.000128,1.985131,1.591274
48880,1.04,Very Good,I,I1,61.6,61.0,2745,6.45,6.47,3.98,1.098612,0.693147,0.0,7.917536,0.039221,4.120662,4.110874,2.008214,2.010895,1.605430


In [11]:
def create_model(dataframe):
    '''
    Fit and return a LinearRegression that predicts ``price_log``.

    The predictors are the eight log features listed below, read from
    ``dataframe``; ``clarity_log`` is not among them.
    '''
    feature_columns = [
        'carat_log', 'cut_log', 'color_log', 'depth_log',
        'table_log', 'x_log', 'y_log', 'z_log',
    ]
    features = dataframe[feature_columns]
    target = dataframe['price_log']

    # fit() returns the estimator itself, so the fitted model is returned.
    return LinearRegression().fit(features, target)

In [12]:
# Fit one model per clarity dataframe; each_model[i] belongs to list_clarity[i].
each_model = list(map(create_model, list_dfs))
each_model

[LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)]

In [23]:
# Coefficients of a single model fitted on the whole `data_base` frame (all
# clarities together), in the order of the feature list inside create_model.
# This works because create_new_columns_log added the *_log columns to
# `data_base` in place.
# NOTE(review): this cell's execution count is out of sequence; re-run the
# notebook top to bottom to confirm the displayed values.
create_model(data_base).coef_

array([5.53882204, 1.16061746, 1.26239959, 0.49615981, 0.69391387,
       0.67985036, 1.39415559, 1.22000463])

In [25]:
# Intercept of the same whole-frame model; note that this call fits the
# model again rather than reusing the one from the previous cell.
# NOTE(review): this cell's execution count is out of sequence as well.
create_model(data_base).intercept_

12.118796633087062

In [15]:
def make_predict(row):
    '''
    Predict the price of one diamond described by ``row``.

    ``row`` must provide the eight log features and ``clarity``. The model is
    picked from the global ``each_model`` using the position of the row's
    clarity in the global ``list_clarity``. Returns a one-element array with
    the price on the original scale.
    '''
    feature_names = (
        'carat_log', 'cut_log', 'color_log', 'depth_log',
        'table_log', 'x_log', 'y_log', 'z_log',
    )
    # Build a single-sample input (one row, eight features) from the row.
    features = [[row[name] for name in feature_names]]

    # Look up the model that matches this row's clarity.
    model = each_model[list_clarity.index(row['clarity'])]

    # The model predicts log(price); exponentiate to undo the log transform.
    return np.exp(model.predict(features))

In [16]:
# Predict a price for every row; make_predict returns a one-element array,
# so item 0 is taken to get a plain number per row.
y_predict = diamonds.apply(lambda record: make_predict(record)[0], axis='columns')
y_predict

0         269.331604
1         369.305197
2         410.681474
3         331.049042
4         246.604549
            ...     
48935    2757.891198
48936    2816.749188
48937    2637.473665
48938    2572.489880
48939    2560.467980
Length: 48940, dtype: float64

In [17]:
# Display the frame together with the *_log columns added earlier.
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_log,color_log,clarity_log,price_log,carat_log,depth_log,table_log,x_log,y_log,z_log
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1.386294,1.791759,1.098612,5.786897,-1.560648,4.091006,4.110874,1.587192,1.576915,1.196948
1,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.693147,1.791759,1.609438,5.789960,-1.469676,4.041295,4.174387,1.619388,1.623341,1.196948
2,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,1.386294,0.693147,1.386294,5.811141,-1.237874,4.133565,4.060443,1.648659,1.654411,1.289233
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.693147,0.000000,0.693147,5.814131,-1.171183,4.147885,4.060443,1.675226,1.677097,1.321756
4,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48,1.098612,0.000000,1.791759,5.817111,-1.427116,4.139955,4.043051,1.597365,1.601406,1.247032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48935,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,1.386294,1.945910,1.098612,7.921898,-0.328504,4.138361,4.077537,1.900614,1.906575,1.521699
48936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1.609438,1.945910,1.098612,7.921898,-0.328504,4.107590,4.043051,1.909543,1.911023,1.504077
48937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,0.693147,1.945910,1.098612,7.921898,-0.328504,4.144721,4.007333,1.900614,1.909543,1.528228
48938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,1.098612,1.945910,1.098612,7.921898,-0.356675,4.139955,4.094345,1.896119,1.899118,1.517323


In [18]:
# Actual prices (original scale, not logged) to compare against y_predict.
y = diamonds.loc[:, 'price']

In [19]:
from sklearn.metrics import mean_squared_error, r2_score

# In-sample error: the predictions come from models fitted on these same rows.
mse = mean_squared_error(y_true=y, y_pred=y_predict)
rmse = np.sqrt(mse)  # square root brings the error back to price units
rmse

740.9133966289618

In [20]:
# Load the second dataset; `rick_diamonds` keeps the values as read from
# file, while `diam_rick` is the deep copy that will receive new columns.
rick_diamonds = pd.read_csv('./data/rick_diamonds.csv')
diam_rick = rick_diamonds.copy(deep=True)


In [21]:

# Add the log-feature columns; k=0 skips `price_log` (no price column here).
diam_rick = create_new_columns_log(dataframe=diam_rick, k=0)

# Predict a price for each row with make_predict (this overwrites the
# `y_predict` computed earlier for the `diamonds` frame).
y_predict = diam_rick.apply(lambda record: make_predict(record)[0], axis='columns')
y_predict

0        3018.650992
1         850.358886
2        1463.264087
3        7044.695870
4       18024.236560
            ...     
4995      450.635509
4996     1884.278552
4997     4671.502204
4998      843.357430
4999      755.856967
Length: 5000, dtype: float64

In [22]:
# Attach the predicted prices as a new column of the frame read from file.
rick_diamonds['price_predicted'] = y_predict

# Save the result; the row index is written too (the to_csv default).
rick_diamonds.to_csv('./data/rick_diam_log.csv', index=True)