In [1]:
import pandas as pd

# Load the training data from a path relative to the notebook's working directory.
diamonds_df = pd.read_csv("input/train.csv")

# Análisis preliminar

In [2]:
# How many rows and columns the dataframe has, printed as (rows, columns)
print(diamonds_df.shape)

(40455, 11)


In [3]:
# Show the first entries of the dataframe
display(diamonds_df.head())

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.12,Premium,H,SI1,61.6,59.0,6.67,6.63,4.1,5363
1,1,1.14,Very Good,E,SI2,60.0,54.0,6.74,6.97,4.11,5593
2,2,0.9,Very Good,D,SI2,60.3,63.0,6.12,6.22,3.72,3534
3,3,0.71,Ideal,E,VS1,61.9,54.0,5.74,5.76,3.56,3212
4,4,0.34,Very Good,F,SI2,60.0,62.0,4.51,4.55,2.72,447


In [4]:
# Are there missing values? Count them per column (apparently there are none).
diamonds_df.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [5]:
# Data type of each column
diamonds_df.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [6]:
# "cut", "color" and "clarity" hold non-numeric data; list how often each
# distinct value occurs in every one of them.
for position, column_name in enumerate(["cut", "color", "clarity"]):
    # Every heading after the first one is preceded by a blank line.
    heading = f"Columna '{column_name}':"
    print(heading if position == 0 else "\n" + heading)
    print(diamonds_df[column_name].value_counts())


Columna 'cut':
Ideal        16250
Premium      10342
Very Good     9026
Good          3639
Fair          1198
Name: cut, dtype: int64

Columna 'color':
G    8565
E    7344
F    7162
H    6147
D    5063
I    4052
J    2122
Name: color, dtype: int64

Columna 'clarity':
SI1     9806
VS2     9204
SI2     6865
VS1     6099
VVS2    3778
VVS1    2776
IF      1365
I1       562
Name: clarity, dtype: int64


# Definir Features y Groundtruth

In [7]:
# Number of rows, taken from the "id" column.
print(diamonds_df["id"].size)

# Features: every column except the target ("price") and "id", which is
# only the unique identifier of each diamond.
columns = [name for name in diamonds_df.columns if name not in ("price", "id")]
X = diamonds_df[columns]
print(X.shape)

# Ground truth: the price to predict.
y = diamonds_df["price"]
print(y.size)

40455
(40455, 9)
40455


In [8]:
# Turn the text columns "cut", "color" and "clarity" into integer codes
# with LabelEncoder.
# NOTE(review): the resulting codes follow the alphabetical order of the
# labels, not their quality ranking — confirm this is acceptable for the
# model trained afterwards.
from sklearn.preprocessing import LabelEncoder

object_cols = ["cut", "color", "clarity"]
label_encoder = LabelEncoder()

# assign() returns a new frame, so the original X is left untouched.
# The single encoder is re-fitted for each column in turn.
X_label_encoded = X.assign(
    **{name: label_encoder.fit_transform(X[name]) for name in object_cols}
)

# "cut", "color" and "clarity" are numeric now:
X_label_encoded.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.12,3,4,2,61.6,59.0,6.67,6.63,4.1
1,1.14,4,1,3,60.0,54.0,6.74,6.97,4.11
2,0.9,4,0,3,60.3,63.0,6.12,6.22,3.72
3,0.71,2,1,4,61.9,54.0,5.74,5.76,3.56
4,0.34,4,2,3,60.0,62.0,4.51,4.55,2.72


In [9]:
# Standardize every feature column so that it has mean 0 and standard
# deviation 1. The scaler is wrapped in a pipeline so that further steps
# can be chained onto it.
# NOTE(review): the scaler is fitted on all rows here, before the
# train/test split is made — confirm that this is intended.

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline

pipeline = [StandardScaler()]
transformer = make_pipeline(*pipeline)

# Wrap the transformed values back into a DataFrame that carries the
# feature column names.
X_normalized = pd.DataFrame(
    transformer.fit_transform(X_label_encoded),
    columns=X.columns,
)
display(X_normalized.head())

# Sanity check on one column: after the transformation its mean and
# standard deviation should be practically 0 and 1 respectively.
carat_scaled = X_normalized["carat"]
print(carat_scaled.mean())
print(carat_scaled.std())


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.677821,0.436318,0.827101,-1.062198,-0.101191,0.695508,0.835699,0.776126,0.789949
1,0.719937,1.411739,-0.93778,-0.482915,-1.216763,-1.550809,0.898013,1.071048,0.804023
2,0.21454,1.411739,-1.526073,-0.482915,-1.007593,2.492562,0.346092,0.420484,0.255168
3,-0.185566,-0.539104,-0.93778,0.096368,0.107979,-1.550809,0.007818,0.021471,0.029996
4,-0.96472,1.411739,-0.349486,-0.482915,-1.216763,2.043298,-1.087121,-1.028107,-1.152153


-8.641379952885163e-17
1.0000123596408295


In [10]:
# Split the normalized data into a training set (80%) and a test set (20%).
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle reproducible: without it every
# "Restart & Run All" yields a different split, so results obtained
# downstream cannot be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y, test_size=0.2, random_state=42
)

In [None]:
# TRAIN
# Price is a continuous quantity, so this is a regression problem (not a
# classification one) and needs a regressor. LogisticRegression is a
# classifier: it would treat every distinct price as a separate class.
from sklearn.linear_model import LinearRegression

# Create the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict prices for the held-out test set
y_pred = model.predict(X_test)

# predict() returns a NumPy array, which has no .head(); slice it instead
# to preview the first predictions.
print(y_pred[:5])