# 1. Incluimos las librerías

In [9]:
# Importar librerías
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Configurar estilo de gráficos
sns.set(style="whitegrid")
plt.style.use("ggplot")

# 2. Cargamos los datos

In [10]:
# Cargar el dataset
df = pd.read_csv('../vehicles_us.csv')

# Ver dimensiones y tipo de datos
df.info()

# Mostrar primeras filas
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,23/06/2018,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,19/10/2018,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,07/02/2019,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,22/03/2019,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,02/04/2019,28


# 3. Identificar valores nulos y preparar limpieza

In [11]:
# Contar valores nulos por columna
df.isna().sum().sort_values(ascending=False)

# Porcentaje de valores nulos por columna
(df.isna().mean() * 100).sort_values(ascending=False)

is_4wd          50.369723
paint_color     17.985444
odometer        15.316836
cylinders       10.208637
model_year       7.023775
price            0.000000
model            0.000000
condition        0.000000
fuel             0.000000
transmission     0.000000
type             0.000000
date_posted      0.000000
days_listed      0.000000
dtype: float64

# 4. Limpieza inicial

In [12]:
# Reemplazar nulos de is_4wd con 0 (asumimos que no tiene 4WD si está vacío)
df['is_4wd'] = df['is_4wd'].fillna(0)

# Reemplazar nulos de paint_color con 'unknown'
df['paint_color'] = df['paint_color'].fillna('unknown')

# Eliminar filas con valores nulos en odometer, cylinders y model_year
df = df.dropna(subset=['odometer', 'cylinders', 'model_year'])

# Confirmar que ya no hay nulos
df.isna().sum()

price           0
model_year      0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
type            0
paint_color     0
is_4wd          0
date_posted     0
days_listed     0
dtype: int64

# 5. Corregir tipos de dato

In [13]:
# Convertir a enteros
df['model_year'] = df['model_year'].astype(int)
df['cylinders'] = df['cylinders'].astype(int)
df['odometer'] = df['odometer'].astype(int)
df['is_4wd'] = df['is_4wd'].astype(int)

# Convertir a tipo fecha
df['date_posted'] = pd.to_datetime(df['date_posted'], dayfirst=True)

# 6. Análisis estadístico general

In [14]:
# Estadísticas descriptivas para variables numéricas
df.describe()

Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,date_posted,days_listed
count,36419.0,36419.0,36419.0,36419.0,36419.0,36419,36419.0
mean,12187.675417,2009.753425,6.132596,115323.632747,0.497433,2018-10-25 03:09:59.324528384,39.649798
min,1.0,1908.0,3.0,0.0,0.0,2018-05-01 00:00:00,0.0
25%,5000.0,2006.0,4.0,69854.0,0.0,2018-07-29 00:00:00,19.0
50%,9000.0,2011.0,6.0,113000.0,0.0,2018-10-25 00:00:00,33.0
75%,16900.0,2014.0,8.0,155000.0,1.0,2019-01-21 00:00:00,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,2019-04-19 00:00:00,271.0
std,10076.73968,6.265305,1.659646,65068.650067,0.5,,28.119391


# 7. Historgrama de distribución del Odómetro

In [15]:

# Crear histograma con Plotly
fig = go.Figure(data=[go.Histogram(x=df['odometer'])])

# Añadir título y etiquetas si quieres
fig.update_layout(
    title='Distribución del Odómetro',
    xaxis_title='Kilometraje (odometer)',
    yaxis_title='Frecuencia'
)

# Mostrar en Jupyter
fig.show()

# 8. Gráfico de disperción Relación entre precio y Kilometraje

In [16]:
# Crear gráfico de dispersión con Plotly
fig = go.Figure(data=go.Scatter(
    x=df['odometer'],
    y=df['price'],
    mode='markers',
    marker=dict(
        color='blue',
        opacity=0.5
    )
))

fig.update_layout(
    title='Relación entre Precio y Kilometraje',
    xaxis_title='Kilometraje (odometer)',
    yaxis_title='Precio (USD)'
)

fig.show()