<h1 align=center><font size=6>Predicción del precio de un vehículo🚗</font></h1>

### Crear el dataframe

In [147]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression

In [148]:
# Load the raw data set into a DataFrame. header=None keeps pandas from
# using the first record as column names; they are assigned in a later cell.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
cars = pd.read_csv(filepath_or_buffer=url, header=None)

In [149]:
# Column names, listed in the same order as the columns of the raw file,
# then attached to the DataFrame.
headers = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]
cars.columns = headers

In [125]:
# Preview the first five rows (head() returns five rows by default).
cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


### Identificar los valores faltantes

Algunas columnas tienen datos sin valor que aparecen como "?". Estos se deben reemplazar por NaN (`np.nan`), que es el marcador que usan NumPy y pandas por defecto para los valores faltantes, por razones de conveniencia y velocidad de cómputo.

In [126]:
# Swap every "?" placeholder for NaN, modifying the frame in place,
# then preview the first five rows to check the result.
cars.replace(to_replace="?", value=np.nan, inplace=True)
cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [127]:
# Boolean frame with the same shape as `cars`: True wherever a value is missing.
missing_data = cars.isna()
missing_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [128]:
# For each column, print its name followed by how many cells are
# missing (True) and how many are present (False).
for column, is_missing in missing_data.items():
    print(column)
    print(is_missing.value_counts())

symboling
False    205
Name: symboling, dtype: int64
normalized-losses
False    164
True      41
Name: normalized-losses, dtype: int64
make
False    205
Name: make, dtype: int64
fuel-type
False    205
Name: fuel-type, dtype: int64
aspiration
False    205
Name: aspiration, dtype: int64
num-of-doors
False    203
True       2
Name: num-of-doors, dtype: int64
body-style
False    205
Name: body-style, dtype: int64
drive-wheels
False    205
Name: drive-wheels, dtype: int64
engine-location
False    205
Name: engine-location, dtype: int64
wheel-base
False    205
Name: wheel-base, dtype: int64
length
False    205
Name: length, dtype: int64
width
False    205
Name: width, dtype: int64
height
False    205
Name: height, dtype: int64
curb-weight
False    205
Name: curb-weight, dtype: int64
engine-type
False    205
Name: engine-type, dtype: int64
num-of-cylinders
False    205
Name: num-of-cylinders, dtype: int64
engine-size
False    205
Name: engine-size, dtype: int64
fuel-system
False    205
Name: 

De acuerdo con lo anterior, todas las columnas tienen 205 filas, y las siguientes tienen datos faltantes:

- "normalized-losses": 41 datos faltantes
- "num-of-doors": 2 datos faltantes
- "bore": 4 datos faltantes
- "stroke" : 4 datos faltantes
- "horsepower": 2 datos faltantes
- "peak-rpm": 2 datos faltantes
- "price": 4 datos faltantes


### Reemplazar o eliminar datos faltantes 
Primero podemos obtener una descripción de los datos y su tipo para ver su comportamiento.

In [129]:
# Summary statistics for every column; include="all" covers the
# non-numeric columns as well as the numeric ones.
cars.describe(include="all")

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,205.0,164.0,205,205,205,203,205,205,205,205.0,...,205.0,205,201.0,201.0,205.0,203.0,203.0,205.0,205.0,201.0
unique,,51.0,22,2,2,2,5,3,2,,...,,8,38.0,36.0,,59.0,23.0,,,186.0
top,,161.0,toyota,gas,std,four,sedan,fwd,front,,...,,mpfi,3.62,3.4,,68.0,5500.0,,,8921.0
freq,,11.0,32,185,168,114,96,120,202,,...,,94,23.0,20.0,,19.0,37.0,,,2.0
mean,0.834146,,,,,,,,,98.756585,...,126.907317,,,,10.142537,,,25.219512,30.75122,
std,1.245307,,,,,,,,,6.021776,...,41.642693,,,,3.97204,,,6.542142,6.886443,
min,-2.0,,,,,,,,,86.6,...,61.0,,,,7.0,,,13.0,16.0,
25%,0.0,,,,,,,,,94.5,...,97.0,,,,8.6,,,19.0,25.0,
50%,1.0,,,,,,,,,97.0,...,120.0,,,,9.0,,,24.0,30.0,
75%,2.0,,,,,,,,,102.4,...,141.0,,,,9.4,,,30.0,34.0,


In [130]:
# Show the current dtype of each column.
cars.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

Podemos entonces decidir lo siguiente:

**Reemplazar con la media:**
- "normalized-losses"
- "bore"
- "stroke"
- "horsepower"
- "peak-rpm"
    
 </p> 
    
**Reemplazar con el valor más frecuente (moda):**
- "num-of-doors": se puede reemplazar con "four", ya que es el dato que más se repite

    
**Eliminar datos:**  
- "price": se eliminan las filas con datos faltantes, ya que es la variable que queremos predecir.

**Nota:** Al calcular la media es importante convertir estas columnas a tipo "float", ya que tienen asignado erróneamente el tipo "object".

In [143]:
def fill_missing_with_mean(frame, column):
    """Replace the NaN entries of ``frame[column]`` with the column mean.

    The column may still be stored as ``object`` (numbers kept as strings),
    so it is cast to float64 only to compute the mean; the stored dtype is
    left for a later cell to correct.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to update; the column is reassigned on this object.
    column : str
        Name of the column to impute.

    Returns
    -------
    float
        The mean that was used as the fill value.
    """
    column_mean = frame[column].astype("float64").mean()
    # Assign the result back instead of calling replace(..., inplace=True) on
    # frame[column]: a chained in-place call operates on a temporary selection
    # and does not update the frame when pandas Copy-on-Write is active.
    frame[column] = frame[column].replace(np.nan, column_mean)
    return column_mean


# Drop the rows without a price: it is the variable to predict, so it is
# removed rather than imputed.
cars.dropna(subset=["price"], inplace=True)

# Impute the numeric columns with their mean. The mean variables are kept
# as separate names so they can still be inspected individually.
avr_norm_loss = fill_missing_with_mean(cars, "normalized-losses")
avr_bore = fill_missing_with_mean(cars, "bore")
avr_stroke = fill_missing_with_mean(cars, "stroke")
avr_horsepower = fill_missing_with_mean(cars, "horsepower")
avr_rpm = fill_missing_with_mean(cars, "peak-rpm")

# Impute the categorical column with its most frequent value.
# The original read from `df`, which is never defined in this notebook;
# the frame is called `cars`.
freq_num_doors = cars["num-of-doors"].value_counts().idxmax()
cars["num-of-doors"] = cars["num-of-doors"].replace(np.nan, freq_num_doors)


###  Corregir el formato de datos:
Como se vio anteriormente, algunas columnas tienen un tipo de dato erróneo, así que se deben corregir:
- "normalized-losses": object -> int64
- "bore": object -> float64
- "stroke": object -> float64
- "horsepower": object -> int64
- "peak-rpm": object -> int64
- "price": object -> int64


In [144]:
# Target dtype for each column that is still stored as "object".
# NOTE(review): the entries filled with a column mean are floats, so the
# int64 casts presumably drop their fractional part — confirm this is intended.
dtype_fixes = {
    "normalized-losses": "int64",
    "bore": "float64",
    "stroke": "float64",
    "horsepower": "int64",
    "peak-rpm": "int64",
    "price": "int64",
}
for column_name, target_dtype in dtype_fixes.items():
    cars[column_name] = cars[column_name].astype(target_dtype)

In [145]:
# Summary statistics again, this time for the cleaned and re-typed frame.
cars.describe(include="all")

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,201.0,201.0,201,201,201,201,201,201,201,201.0,...,201.0,201,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0
unique,,,22,2,2,2,5,3,2,,...,,8,,,,,,,,
top,,,toyota,gas,std,four,sedan,fwd,front,,...,,mpfi,,,,,,,,
freq,,,32,181,165,115,94,118,198,,...,,92,,,,,,,,
mean,0.840796,122.0,,,,,,,,98.797015,...,126.875622,,3.330711,3.256904,10.164279,103.393035,5117.58209,25.179104,30.686567,13207.129353
std,1.254802,31.99625,,,,,,,,6.066366,...,41.546834,,0.268072,0.316048,4.004965,37.365623,478.113182,6.42322,6.81515,7947.066342
min,-2.0,65.0,,,,,,,,86.6,...,61.0,,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,0.0,101.0,,,,,,,,94.5,...,98.0,,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7775.0
50%,1.0,122.0,,,,,,,,97.0,...,120.0,,3.31,3.29,9.0,95.0,5117.0,24.0,30.0,10295.0
75%,2.0,137.0,,,,,,,,102.4,...,141.0,,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16500.0


Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,122,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
5,2,122,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250
6,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
7,1,122,audi,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,18920
8,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
10,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430
