# Librerias

In [1]:
# Trata de Datos
import pandas as pd
import numpy as np
from utils import data_report
import math

# Visualizaciones
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning


# Carga de Datos

In [2]:
# Load the auto-mpg CSV into `df`; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/auto-mpg.csv')

# Primera Exploracion

In [3]:
# Preview the first rows to check that the file parsed into the expected columns.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Preview the last rows as well, to spot truncated or malformed trailing records.
df.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger
397,31.0,4,119.0,82,2720,19.4,82,1,chevy s-10


El dataframe MPG tiene 9 columnas y 398 filas. Las columnas son:

- **mpg**: significa millas por galon
- **cylinders**: cantidad de cilindros que tiene el coche
- **displacement**: cilindrada del motor, en pulgadas cúbicas (no centímetros cúbicos)
- **horsepower**: caballos de fuerza
- **weight**: peso del coche
- **acceleration**: aceleracion
- **model year**: año del coche
- **origin**: continente de origen (1:America, 2:Europa, 3:Asia)
- **car name**: nombre del coche

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(398, 9)

In [6]:
# Column dtypes and non-null counts; an `object` dtype on a column expected to
# be numeric is a sign of non-numeric entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


- Todas las variables son ``numericas`` menos **horsepower** y **car name**
- No hay valores ``nulos``

In [7]:
# Summary statistics for every column; include='all' adds unique/top/freq for
# the object-typed columns alongside the numeric statistics.
df.describe(include='all')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398
unique,,,,94.0,,,,,305
top,,,,150.0,,,,,ford pinto
freq,,,,22.0,,,,,6
mean,23.514573,5.454774,193.425879,,2970.424623,15.56809,76.01005,1.572864,
std,7.815984,1.701004,104.269838,,846.841774,2.757689,3.697627,0.802055,
min,9.0,3.0,68.0,,1613.0,8.0,70.0,1.0,
25%,17.5,4.0,104.25,,2223.75,13.825,73.0,1.0,
50%,23.0,4.0,148.5,,2803.5,15.5,76.0,1.0,
75%,29.0,8.0,262.0,,3608.0,17.175,79.0,2.0,


- **horsepower** tiene 94 valores únicos y el valor ``150`` es el que más se repite, con una frecuencia de 22
- **car name** tiene 305 valores únicos y el que más se repite es ``ford pinto``, con una frecuencia de 6
- **todas las demás variables** aparentemente no siguen una distribución normal

In [8]:
# Per-column dtypes, to identify columns that need a type conversion.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

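- **horsepower** debería ser numérica; que su tipo sea ``object`` indica que contiene algún valor no numérico (posibles nulos codificados como texto)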

In [9]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

No hay duplicados

In [10]:
# Count explicit missing values (NaN/None) per column. Missing values encoded
# as text inside an object column are not detected by this check.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

No hay nulos

In [11]:
# Per-column overview from the project helper `data_report` (imported from
# `utils`); its rendered output lists dtype, % missing, unique-value count and
# cardinality %.
data_report(df)

COL_N,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
DATA_TYPE,float64,int64,float64,object,int64,float64,int64,int64,object
MISSINGS (%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UNIQUE_VALUES,129,5,82,94,351,95,13,3,305
CARDIN (%),32.41,1.26,20.6,23.62,88.19,23.87,3.27,0.75,76.63


|Variable|tipo de variable|metodo a tratar|
|---|---|---|
|**mpg**|continua|comprobar si todos los valores terminan en .0; en ese caso, pasar a int|
|**cylinders**|discreta|ver los unicos y un conteo|
|**displacement**|continua|comprobar si todos los valores terminan en .0; en ese caso, pasar a int|
|**horsepower**|discreta|convertir en int|
|**weight**|continua|nada|
|**acceleration**|continua|nada|
|**model year**|discreta|``agrupar por decada`` o nada|
|**origin**|discreta|nada|
|**car name**|nominal|buscar la marca y eliminar el resto|

# Limpieza

In [12]:
# Show the first rows whose mpg has a fractional part, i.e. evidence that the
# column cannot be cast to int without losing information. A numeric modulo
# test is used instead of matching the string repr against '.0', which depends
# on how floats happen to be formatted (e.g. scientific notation).
df[df['mpg'] % 1 != 0].head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
187,17.5,8,305.0,140,4215,13.0,76,1,chevrolet chevelle malibu classic
189,15.5,8,304.0,120,3962,13.9,76,1,amc matador
190,14.5,8,351.0,152,4215,12.8,76,1,ford gran torino
194,22.5,6,232.0,90,3085,17.6,76,1,amc hornet
196,24.5,4,98.0,60,2164,22.1,76,1,chevrolet woody


In [13]:
# Show the first rows whose displacement has a fractional part, i.e. evidence
# that the column cannot be cast to int without losing information. A numeric
# modulo test is used instead of matching the string repr against '.0', which
# depends on how floats happen to be formatted (e.g. scientific notation).
df[df['displacement'] % 1 != 0].head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
58,25.0,4,97.5,80,2126,17.0,72,1,dodge colt hardtop


In [14]:
# Convert horsepower from text to numbers. errors='coerce' turns every entry
# that cannot be parsed into NaN, so keep a reference to the raw column and
# measure how many values were lost instead of coercing silently.
horsepower_raw = df['horsepower']
df['horsepower'] = pd.to_numeric(horsepower_raw, errors='coerce')

# Number of entries that became NaN during the conversion (0 when the cell is
# re-run on already-numeric data). These rows must be imputed or dropped before
# modelling, and they prevent a plain cast of the column to int.
int(df['horsepower'].isna().sum() - horsepower_raw.isna().sum())

In [23]:
# Derive the brand ("marca") from the first word of the car name and map known
# misspellings / abbreviations to one canonical label per manufacturer.
#
# The result is assigned in a single statement: calling
# `df['marca'].replace(..., inplace=True)` acts on a column selection, which
# raises a FutureWarning in pandas 2.x and leaves `df` unchanged under
# Copy-on-Write. The vectorised `.str` accessor also yields NaN for empty or
# missing names instead of raising like `x.split()[0]` would.
df['marca'] = (
    df['car name']
    .str.split()
    .str[0]
    .replace({
        "chevroelt": "chevrolet",
        "toyouta": "toyota",
        "vw": "volkswagen",
        "vokswagen": "volkswagen",
        "mercedes": "mercedes-benz",
        "maxda": "mazda",
        "capri": "ford",
        "chevy": "chevrolet",
    })
)


# EDA

# Conclusiones EDA

# Feature Engineering

# Baseline

# Train Test Split

# Entrenar el Modelo

# Optimizar

# Inferencia

# Conclusiones ML

# Pipeline