In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
df= pd.read_csv('./data/diamonds.csv')
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
1,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
2,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
4,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
48935,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58
48936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
48937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
48938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56


In [3]:
cat_price=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
cat_price.index('I1')

0

In [4]:
#cria função para tratar data frame nas colunas categoricas e novas colunas com log
def create_log_columns(df,param):
    #cria colunas númericas para colunas categoricas com base na ordem de maior valor
    
    #na coluna clarity é atribuiudo valor de 1 a 8 conforme posição na lista
    cat_price=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
    df['clarity_log'] = df['clarity'].apply(lambda x: 1+ (cat_price.index(x)))
    
    #na coluna color criado atribuido valor conforme ordem alfabetica
    color_list = sorted(list(df.color.unique()),reverse=True)
    df['color_log'] = df['color'].apply(lambda x: 1+ (color_list.index(x)))
    
    #na coluna cut dado valor conforme posição na lista
    cut_price = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
    df['cut_log'] = df['cut'].apply(lambda x: 1+ (cut_price.index(x)))
    
    #criação das colunas como log conforme os valores atuais
    #como na coluna rick não temos a coluna price, foi criado parametro que define se ela sera criada ou não
    if param == 1:
        df['price_log'] = np.log(df['price'])
    #para as demais foi feito o log sendo que nas colunas que possam ter zeros é feito log(x+1)
    df['carat_log']=np.log(df['carat'])
    df['cut_log']=np.log(df['cut_log'])
    df['color_log']=np.log(df['color_log'])
    df['clarity_log']=np.log(df['clarity_log'])
    df['depth_log']=np.log(df['depth'])
    df['table_log']=np.log(df['table'])
    df['x_log']=np.log(df['x']+1)
    df['y_log']=np.log(df['y']+1)
    df['z_log']=np.log(df['z']+1)
    return df

In [5]:
#aplica a função que trata o dataframe
df = create_log_columns(df,1)
#cria uma lista com todos dados unicos na coluna clarity
df


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,clarity_log,color_log,cut_log,price_log,carat_log,depth_log,table_log,x_log,y_log,z_log
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1.098612,1.791759,1.386294,5.786897,-1.560648,4.091006,4.110874,1.587192,1.576915,1.196948
1,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1.609438,1.791759,0.693147,5.789960,-1.469676,4.041295,4.174387,1.619388,1.623341,1.196948
2,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,1.386294,0.693147,1.386294,5.811141,-1.237874,4.133565,4.060443,1.648659,1.654411,1.289233
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.693147,0.000000,0.693147,5.814131,-1.171183,4.147885,4.060443,1.675226,1.677097,1.321756
4,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48,1.791759,0.000000,1.098612,5.817111,-1.427116,4.139955,4.043051,1.597365,1.601406,1.247032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48935,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,1.098612,1.945910,1.386294,7.921898,-0.328504,4.138361,4.077537,1.900614,1.906575,1.521699
48936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1.098612,1.945910,1.609438,7.921898,-0.328504,4.107590,4.043051,1.909543,1.911023,1.504077
48937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,1.098612,1.945910,0.693147,7.921898,-0.328504,4.144721,4.007333,1.900614,1.909543,1.528228
48938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,1.098612,1.945910,1.098612,7.921898,-0.356675,4.139955,4.094345,1.896119,1.899118,1.517323


In [8]:
clar_lst = df.clarity.unique()
clar_lst


array(['SI1', 'VS1', 'VS2', 'SI2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [10]:
#cria uma lista com dataframes para cada tipo de clarity
dfs = [df.query(f'(clarity =="{x}")') for x in clar_lst]
dfs[0]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,clarity_log,color_log,cut_log,price_log,carat_log,depth_log,table_log,x_log,y_log,z_log
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1.098612,1.791759,1.386294,5.786897,-1.560648,4.091006,4.110874,1.587192,1.576915,1.196948
6,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53,1.098612,1.098612,1.098612,5.820083,-1.347074,4.125520,4.007333,1.623341,1.631199,1.261298
8,0.30,Good,J,SI1,64.0,55.0,339,4.25,4.28,2.73,1.098612,0.000000,0.693147,5.826000,-1.203973,4.158883,4.007333,1.658228,1.663926,1.316408
10,0.22,Premium,F,SI1,60.4,61.0,342,3.88,3.84,2.33,1.098612,1.609438,1.386294,5.834811,-1.514128,4.100989,4.110874,1.585145,1.576915,1.202972
15,0.30,Good,J,SI1,63.4,54.0,351,4.23,4.29,2.70,1.098612,0.000000,0.693147,5.860786,-1.203973,4.149464,3.988984,1.654411,1.665818,1.308333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48933,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43,1.098612,1.609438,1.386294,7.921536,-0.342490,4.091006,4.127134,1.908060,1.906575,1.488400
48935,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,1.098612,1.945910,1.386294,7.921898,-0.328504,4.138361,4.077537,1.900614,1.906575,1.521699
48936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1.098612,1.945910,1.609438,7.921898,-0.328504,4.107590,4.043051,1.909543,1.911023,1.504077
48937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,1.098612,1.945910,0.693147,7.921898,-0.328504,4.144721,4.007333,1.900614,1.909543,1.528228


In [5]:
def ger_model(df):
    #cria uma função que gera um modelo com base no dataframe inputa
    y= df['price_log']
    X= df[['carat_log','cut_log','color_log','depth_log','table_log','x_log','y_log','z_log']]
    modelo = LinearRegression()
    modelo.fit(X, y)
    return modelo

In [6]:

#para cada dataframe é gerado um modelo
modelos = [ger_model(df) for df in dfs]

In [7]:
def do_predict(row):
    #para cada linha é buscado o modelo correspondente e feito o predict
    #busca qual o clarity na linha
    
    clar = row['clarity']
   
    #define o input conforme os dados da linha
    X= [[row['carat_log'],row['cut_log'],row['color_log'],row['depth_log'],row['table_log'],row['x_log'],row['y_log'],row['z_log']]]
    
    #busca o modelo correspondente conforme a clarity
    
    index = list(clar_lst).index(clar)
    model = modelos[index] 
   
    #realiza a previsão
    y_predict = model.predict(X)
    
    
    
    
    #faz o exponencial para voltar a conversão feita no log
    y_predict=np.exp(y_predict)
    #retorna y_predict
    
    return y_predict

In [8]:
#cria um array com as previsões conforme cada linha do dataframe
y_predict = df.apply(lambda row : do_predict(row), axis=1)

#seleciona a coluna real para comparar com o previsto
y_real = df['price']

#calcula r2 do model
#metrics.r2_score(y_real, y_predict).round(3)

NameError: name 'metrics' is not defined

In [None]:
from sklearn.metrics import mean_squared_error
#calcula rmse
#np.sqrt(metrics.mean_squared_error(y_real,y_predict))

In [None]:

#busca o df do rick e faz uma cópia
rick_diamonds = pd.read_csv('./data/rick_diamonds.csv')
rick_copy = rick_diamonds.copy()

#trata a copia para criar as colunas com log
rick_copy = create_log_columns(rick_copy,0)

#faz a previsão com a função criada para cada linha
y_predict = rick_copy.apply(lambda row : do_predict(row)[0], axis=1)

#cria uma coluna no dataframe principal com preco predito
rick_diamonds['price_predicted'] = y_predict

#salva o df
rick_diamonds.to_csv('./data/rick_test.csv')