In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the Craigslist listings into a DataFrame, then show its schema summary
# and its first rows.
vehicles_csv_path = '../input/craigslist-carstrucks-data/vehicles.csv'
vehicles = pd.read_csv(vehicles_csv_path)
vehicles.info()
vehicles.head()

# Limpiar datos

In [None]:
# Columns removed from the dataset before modelling (first batch).
unused_columns = [
    'id', 'url', 'region_url', 'VIN', 'drive', 'size',
    'county', 'state', 'paint_color', 'image_url',
    'description', 'posting_date',
]
vehicles.drop(columns=unused_columns, inplace=True)

In [None]:
# Columns removed from the dataset before modelling (second batch).
location_and_model_columns = ['region', 'model', 'lat', 'long']
vehicles.drop(columns=location_and_model_columns, inplace=True)

In [None]:
# Number of missing values in each remaining column.
missing_per_column = vehicles.isna().sum()
print(missing_per_column)

In [None]:
# Rows missing any of these columns are discarded before modelling.
required_columns = [
    'year', 'manufacturer', 'condition', 'cylinders', 'fuel',
    'odometer', 'title_status', 'transmission', 'type',
]

# dropna(subset=...) removes the incomplete rows in a single pass. The explicit
# .copy() makes the result an independent frame, so assigning columns to it in
# later cells does not raise SettingWithCopyWarning.
vehicles_no_null = vehicles.dropna(subset=required_columns).copy()

print(vehicles_no_null.isnull().sum())

# Categorizacion de los datos

In [None]:
# Column dtypes of the null-free frame, shown as the cell output.
vehicles_no_null.dtypes

In [None]:
# Text columns that are replaced by integer category codes.
categoricals = [
    'manufacturer',
    'condition',
    'cylinders',
    'fuel',
    'title_status',
    'transmission',
    'type',
]
# One {code: label} mapping per column, in the same order as `categoricals`.
categories_values = []

for cat in categoricals:
    as_category = vehicles_no_null[cat].astype('category')
    code_to_label = dict(enumerate(as_category.cat.categories))
    categories_values.append(code_to_label)
    vehicles_no_null[cat] = as_category.cat.codes

# Definicion de X y Y

In [None]:
# The target is the price column; every other remaining column is a feature.
target_column = 'price'
y = vehicles_no_null[target_column]
X = vehicles_no_null.drop(columns=[target_column])

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.40, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build and fit the forest in one expression (fit() returns the estimator itself);
# the trailing name displays the fitted model as the cell output.
rfmodel = RandomForestRegressor(random_state=0).fit(X_train, y_train)
rfmodel

In [None]:
# R^2 of the random forest on the training split it was fitted on.
rfmodel.score(X_train, y_train)

In [None]:
# R^2 of the random forest on the held-out test split.
rfmodel.score(X_test, y_test)

# MSE

In [None]:
from sklearn.metrics import mean_squared_error

# Predictions of the random forest for each split.
rf_train_predictions = rfmodel.predict(X_train)
rf_test_predictions = rfmodel.predict(X_test)

# Mean squared error on the training and on the test split.
mse = mean_squared_error(y_train, rf_train_predictions)
mse_test = mean_squared_error(y_test, rf_test_predictions)

print(f"Random Forest Regressor MSE train: {mse}")
print(f"Random Forest Regressor MSE test: {mse_test}")


# MAE

In [None]:
from sklearn.metrics import mean_absolute_error

# Predictions of the random forest for each split.
rf_train_predictions = rfmodel.predict(X_train)
rf_test_predictions = rfmodel.predict(X_test)

# Mean absolute error on the training and on the test split.
mae = mean_absolute_error(y_train, rf_train_predictions)
mae_test = mean_absolute_error(y_test, rf_test_predictions)

print(f"Random Forest Regressor MAE train: {mae}")
print(f"Random Forest Regressor MAE test: {mae_test}")


# Parte 2

# Completar valores vacios

In [None]:
# Missing values per column in the full (unfiltered) frame.
missing_per_column = vehicles.isna().sum()
print(missing_per_column)

In [None]:
import ipywidgets as widgets
from ipywidgets import interact
import plotly.express as px

In [None]:
@interact
def describe(column=list(vehicles.columns)):
    """Print the distinct values of the selected ``vehicles`` column.

    ``column`` is one of the column names of ``vehicles``; ``interact`` builds
    the selection widget from the list given as the default.
    """
    distinct_values = vehicles[column].unique()
    print(distinct_values)

In [None]:
# Random subset of the listings used by the interactive exploration cells.
# The seed makes the sample (and the plots built from it) reproducible, and
# min() avoids a ValueError when the frame has fewer rows than the sample size.
SAMPLE_SIZE = 5000
vehicles_sample = vehicles.sample(n=min(SAMPLE_SIZE, len(vehicles)), random_state=0)

In [None]:
# Text columns of the sample that are replaced by integer category codes.
categoricals = [
    'manufacturer',
    'condition',
    'cylinders',
    'fuel',
    'title_status',
    'transmission',
    'type',
]
# One {code: label} mapping per column, in the same order as `categoricals`.
categories_values = []

for cat in categoricals:
    as_category = vehicles_sample[cat].astype('category')
    code_to_label = dict(enumerate(as_category.cat.categories))
    categories_values.append(code_to_label)
    vehicles_sample[cat] = as_category.cat.codes

In [None]:
# {code: label} mapping of the 'transmission' column (position 5 in `categoricals`),
# looked up by name instead of by a hard-coded index.
categories_values[categoricals.index('transmission')]

In [None]:
@interact
def explore(
    column_x=list(vehicles.columns),
    column_y=list(vehicles.columns),
    column_c=list(vehicles.columns)):
    """Scatter-plot two columns of ``vehicles_sample``, coloured by a third.

    Each argument is one of the column names of ``vehicles``. The figure shows
    a box plot on the x margin, a violin plot on the y margin and an OLS
    trendline.
    """
    print(f"X: {column_x}, Y: {column_y}, Color: {column_c}")

    scatter_options = {
        'x': column_x,
        'y': column_y,
        'color': column_c,
        'marginal_y': "violin",
        'marginal_x': "box",
        'trendline': "ols",
        'template': "simple_white",
    }
    px.scatter(vehicles_sample, **scatter_options).show()

In [None]:
# Text columns of the full frame that are replaced by integer category codes.
categoricals = [
    'manufacturer',
    'condition',
    'cylinders',
    'fuel',
    'title_status',
    'transmission',
    'type',
]
# One {code: label} mapping per column, in the same order as `categoricals`.
categories_values = []

for cat in categoricals:
    as_category = vehicles[cat].astype('category')
    code_to_label = dict(enumerate(as_category.cat.categories))
    categories_values.append(code_to_label)
    # Note: .cat.codes represents a missing value as -1, not as NaN.
    vehicles[cat] = as_category.cat.codes

- price                0  
- year              1205 #- Se completan los nulos con el valor promedio 
- manufacturer     17646 # Random porque los datos están distribuidos entre las categorías
- condition       174104 #- Se llenan todos los valores con la categoría 2 que es good
- cylinders       177678 #- Se llenan los datos con 6 cilindros que es la mayor cantidad de valores
- fuel              3013 # Random porque los datos están muy distribuidos entre las categorías
- odometer          4400 #- Se completan los valores con el valor promedio
- title_status      8242 #- Se completan los datos con clean que es el valor donde se encuentran la mayoría de los datos
- transmission      2556 #- Se completan los datos con manual que es la mayor cantidad de valores
- type             92858 # Random porque los datos están muy distribuidos entre las categorías

In [None]:
# Numeric columns: replace missing values with the column mean.
for numeric_col in ['year', 'odometer']:
    vehicles[numeric_col] = vehicles[numeric_col].fillna(vehicles[numeric_col].mean())

# The categorical columns were already converted to integer codes, where pandas
# marks a missing value as -1 rather than NaN, so fillna() would leave them
# untouched. Replace the -1 sentinel with the chosen category code instead,
# using integers (not strings) so the columns keep a numeric dtype.
default_codes = {
    'condition': 2,
    'cylinders': 5,
    'title_status': 0,
    'transmission': 1,
}
for coded_col, default_code in default_codes.items():
    vehicles[coded_col] = vehicles[coded_col].replace(-1, default_code)

In [None]:
def na_randomfill(series, random_state=0):
    """Fill the nulls of ``series`` with values drawn at random from its non-null values.

    Parameters
    ----------
    series : pd.Series
        Series whose null entries should be filled.
    random_state : int, optional
        Seed passed to ``Series.sample``; the default of 0 matches the
        previously hard-coded seed.

    Returns
    -------
    pd.Series
        ``series`` itself when there is nothing to fill or nothing to draw
        from (no nulls, or no non-null values); otherwise a new Series with
        every null replaced by a sampled value.
    """
    na_mask = pd.isnull(series)   # boolean mask for null values
    n_null = int(na_mask.sum())   # number of nulls in the Series
    observed = series[~na_mask]   # values available for sampling

    # Nothing to fill, or nothing to sample from (sampling an empty Series raises).
    if n_null == 0 or observed.empty:
        return series

    # Draw, with replacement, exactly one observed value per null.
    fill_values = observed.sample(n=n_null, replace=True, random_state=random_state)

    # Re-label the sampled values with the positions of the nulls so that
    # fillna() aligns each one with the entry it replaces.
    fill_values.index = series.index[na_mask]

    return series.fillna(fill_values)

In [None]:
# These columns hold integer category codes in which a missing value is stored
# as -1, so na_randomfill() would find no nulls. Turn the -1 sentinel back into
# NaN, fill at random from the observed codes, then restore an integer dtype.
for coded_col in ['manufacturer', 'fuel', 'type']:
    codes = vehicles[coded_col]
    codes_with_nan = codes.mask(codes == -1)
    vehicles[coded_col] = na_randomfill(codes_with_nan).astype(int)

In [None]:
# Missing values per column after the imputation steps.
missing_per_column = vehicles.isna().sum()
print(missing_per_column)

## Definicion de X y Y

In [None]:
# The target is the price column; every other remaining column is a feature.
target_column = 'price'
y = vehicles[target_column]
X = vehicles.drop(columns=[target_column])

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.40, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## RandomForestRegressor

In [None]:
# Build and fit the forest in one expression (fit() returns the estimator itself);
# the trailing name displays the fitted model as the cell output.
rfmodel = RandomForestRegressor(random_state=0).fit(X_train, y_train)
rfmodel

In [None]:
# R^2 of the random forest on the training split it was fitted on.
rfmodel.score(X_train, y_train)

In [None]:
# R^2 of the random forest on the held-out test split.
rfmodel.score(X_test, y_test)

## MSE

In [None]:
# Predictions of the random forest for each split.
rf_train_predictions = rfmodel.predict(X_train)
rf_test_predictions = rfmodel.predict(X_test)

# Mean squared error on the training and on the test split.
mse = mean_squared_error(y_train, rf_train_predictions)
mse_test = mean_squared_error(y_test, rf_test_predictions)

print(f"Random Forest Regressor MSE train: {mse}")
print(f"Random Forest Regressor MSE test: {mse_test}")


## MAE

In [None]:
# Predictions of the random forest for each split.
rf_train_predictions = rfmodel.predict(X_train)
rf_test_predictions = rfmodel.predict(X_test)

# Mean absolute error on the training and on the test split.
mae = mean_absolute_error(y_train, rf_train_predictions)
mae_test = mean_absolute_error(y_test, rf_test_predictions)

print(f"Random Forest Regressor MAE train: {mae}")
print(f"Random Forest Regressor MAE test: {mae_test}")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# The target (price) is a continuous quantity, so a regressor is required.
# A classifier would treat every distinct price as its own class, and its
# score() would report accuracy instead of R^2.
knnmodel = KNeighborsRegressor(n_neighbors=5)
knnmodel.fit(X_train, y_train)

In [None]:
# Score of the KNN model on the training split (the metric is whatever the
# estimator's score() method defines).
knnmodel.score(X_train, y_train)

In [None]:
# NOTE(review): score() returns accuracy for scikit-learn classifiers and R^2 for
# regressors; confirm both models report the same metric before comparing them.
knnmodel.score(X_test, y_test) # Since the model is not overfitted, the test score is much better than the random forest's

## MSE

In [None]:
# Predict once per split and reuse the result.
knn_train_predictions = knnmodel.predict(X_train)
knn_test_predictions = knnmodel.predict(X_test)

# Mean squared error on the training and on the test split.
mse = mean_squared_error(y_train, knn_train_predictions)
mse_test = mean_squared_error(y_test, knn_test_predictions)

# Output labels corrected from the misspelt "KKN" to "KNN".
print(f"KNN MSE train: {mse}")
print(f"KNN MSE test: {mse_test}")

## MAE

In [None]:
# Predict once per split and reuse the result.
knn_train_predictions = knnmodel.predict(X_train)
knn_test_predictions = knnmodel.predict(X_test)

# Mean absolute error on the training and on the test split.
mae = mean_absolute_error(y_train, knn_train_predictions)
mae_test = mean_absolute_error(y_test, knn_test_predictions)

# Output labels corrected from the misspelt "KKN" to "KNN".
print(f"KNN MAE train: {mae}")
print(f"KNN MAE test: {mae_test}")

## Regresion lineal

In [None]:
from sklearn.linear_model import LinearRegression

# Build and fit the linear model in one expression (fit() returns the estimator);
# the trailing name displays the fitted model as the cell output.
model_lg = LinearRegression().fit(X_train, y_train)
model_lg

In [None]:
# R^2 of the linear regression on the training split.
model_lg.score(X_train, y_train)

In [None]:
# R^2 of the linear regression on the held-out test split.
model_lg.score(X_test, y_test)