# **🗃️ Data Loading**

Useful datasets:
- [Toyota Cars](https://drive.google.com/file/d/1mPrg8J272y9EOOE0GdGjybDCOUmj9Wf6/view?usp=sharing)
- [Houses in Iowa](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from scipy.stats import probplot, norm

import warnings
warnings.filterwarnings('ignore')

## Load data and take a small look

In [None]:
# Read the Toyota Corolla CSV from its fixed absolute path.
csv_path = '/content/ToyotaCorolla1.csv'
data = pd.read_csv(csv_path)

# Quick look: the column names, then the first rows.
print(data.columns)
data.head()

In [None]:
# Print information about data variables
# NOTE(review): the bare `_` below is an unfilled placeholder; presumably it
# should be replaced by a call that summarises the columns — confirm intent.
_ 

In [None]:
# Show descriptive stats about the dataset
# NOTE(review): the bare `_` below is an unfilled placeholder for the
# statistics call.
_

# Histogram of the "Price" column with a KDE curve, drawn on a (4, 3) figure.
_, ax = plt.subplots(figsize=(4,3))
sns.histplot(data, x="Price", kde=True, ax=ax)

In [None]:
# Encode the Model column as integer codes, numbered in the sorted order of
# the distinct values returned by np.unique.
model_names = np.unique(data['Model'])
model2key = {name: code for code, name in enumerate(model_names)}

# Reverse lookup: integer code -> original model name.
key2model = {code: name for name, code in model2key.items()}

data['Model'] = data['Model'].map(model2key)

In [None]:
# Encode the Fuel_Type column as integer codes, numbered in the sorted order
# of the distinct values returned by np.unique.
fuel_names = np.unique(data['Fuel_Type'])
fuel2key = {name: code for code, name in enumerate(fuel_names)}

# Reverse lookup: integer code -> original fuel type.
key2fuel = {code: name for name, code in fuel2key.items()}

data['Fuel_Type'] = data['Fuel_Type'].map(fuel2key)

In [None]:
# Encode the Color column as integer codes, numbered in the sorted order of
# the distinct values returned by np.unique.
color_names = np.unique(data['Color'])
color2key = {name: code for code, name in enumerate(color_names)}

# Reverse lookup: integer code -> original color name.
key2color = {code: name for name, code in color2key.items()}

data['Color'] = data['Color'].map(color2key)

In [None]:
# Correlation plot
# NOTE(review): both `_` expressions are unfilled placeholders — `corr` is
# presumably meant to hold a correlation matrix that is then drawn on `ax`;
# confirm intent before filling in.
corr = _
_, ax = plt.subplots(figsize=(10,10))
_ 

In [None]:
# Box plot of Price for each distinct Age_08_04 value on a wide (10, 4) figure.
_, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(data=data, x='Age_08_04', y='Price', ax=ax)

# Turn the x tick labels vertical; assigning to `_` keeps the returned
# (locations, labels) pair out of the cell output.
_ = plt.xticks(rotation=90)

In [None]:
# Plot categorical values
# A (6, 4) figure is prepared; the bare `_` below is an unfilled placeholder
# for the plotting call that should draw on `ax`.
_, ax = plt.subplots(figsize=(6,4))
_

In [None]:
# Plot continuous values
# A (6, 5) figure is prepared; the bare `_` below is an unfilled placeholder
# for the plotting call that should draw on `ax`.
_, ax = plt.subplots(figsize=(6,5))
_

## Data Cleansing

In [None]:
# Missing data

In [None]:
# Summarise missing data per column: absolute count and fraction of rows.
# The null mask is computed once and reused for both statistics.
null_mask = data.isnull()
num_missing = null_mask.sum().sort_values(ascending=False)
# Mean of a boolean mask == missing rows / total rows, i.e. a 0-1 fraction
# (the column is called 'Percentage' but is not multiplied by 100).
missing_percentage = null_mask.mean().sort_values(ascending=False)

# One row per column of `data`, most-missing first.
missing = pd.concat([num_missing, missing_percentage], axis=1, keys=['Total', 'Percentage'])
missing.head(20)

In [None]:
# Drop every column whose missing fraction exceeds 15%. The cut-off matches
# the 15% figure used by the rest of this section (it was 0.10 here only).
MISSING_THRESHOLD = 0.15
sparse_columns = missing.index[missing['Percentage'] > MISSING_THRESHOLD].tolist()
data_cl = data.drop(columns=sparse_columns)

# Remaining missing-value counts per column, largest first.
data_cl.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# Rows of the `missing` summary whose missing fraction is above 0.15.
above_threshold = missing['Percentage'] > 0.15
drop_columns = missing.loc[above_threshold]
print(drop_columns)

## Outliers

In [None]:
# Display the column index of `data`.
data.columns

In [None]:
# Standardise Price so that extreme values can be read off directly.
scaled_data = StandardScaler().fit_transform(data['Price'].values.reshape(-1,1))

# Inspect both tails as outlier candidates: the lowest standardised price and
# the 20 highest. The column is sorted once and sliced from each end; both
# results keep the original (k, 1) shape.
N_LOWEST = 1
N_HIGHEST = 20
sorted_scores = np.sort(scaled_data, axis=0)
lower_bound = sorted_scores[:N_LOWEST]
upper_bound = sorted_scores[-N_HIGHEST:]
print(lower_bound, upper_bound)

In [None]:
# One histogram bin per 10 rows of `data`, rounded up (ceiling division).
n_bins = -(-len(data) // 10)
sns.histplot(data=data, x='Price', kde=True, bins=n_bins)

## Normality test

In [None]:
data.info()
# Keep only the int64/float64 columns. select_dtypes filters columns; it does
# not convert any values. Start from the cleaned frame `data_cl` so the
# columns dropped for missing data stay dropped, instead of rebuilding from
# the raw `data`.
data_cl = data_cl.select_dtypes(include=['float64', 'int64'])
data_cl.info()

In [None]:
# Price distribution against a fitted normal curve, then a normal Q-Q plot.
# sns.distplot is deprecated, so the histogram + KDE come from sns.histplot
# and the fitted normal density is drawn explicitly on the same axes.
price = data_cl['Price'].dropna()
ax = sns.histplot(price, kde=True, stat='density')
mu, sigma = norm.fit(price)
grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='k')

# New figure so the probability plot does not overwrite the histogram.
fig = plt.figure()
res = probplot(data_cl['Price'], plot=plt)

In [None]:
# KM distribution against a fitted normal curve, then a normal Q-Q plot.
# sns.distplot is deprecated, so the histogram + KDE come from sns.histplot
# and the fitted normal density is drawn explicitly on the same axes.
km = data_cl['KM'].dropna()
ax = sns.histplot(km, kde=True, stat='density')
mu, sigma = norm.fit(km)
grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='k')

# New figure so the probability plot does not overwrite the histogram.
fig = plt.figure()
res = probplot(data_cl['KM'], plot=plt)

In [None]:
# Data transformation: natural log of every non-zero entry; zeros are left
# at 0 so log(0) is never evaluated.
# Cast to float first so the log values can be stored without changing an
# integer column's dtype mid-assignment (astype also returns a fresh copy).
data_cl_tf = data_cl.astype('float64')

for col in data_cl_tf.columns:
    nonzero = data_cl_tf[col] != 0
    # Single .loc[rows, col] assignment: the chained form
    # `frame[col].loc[rows] = ...` may write to a temporary and leave the
    # frame unchanged.
    data_cl_tf.loc[nonzero, col] = np.log(data_cl_tf.loc[nonzero, col])

In [None]:
# Histogram and normal probability plot of the transformed data.
# sns.distplot is deprecated, so the histogram + KDE come from sns.histplot
# and the fitted normal density is drawn explicitly on the same axes.
price_tf = data_cl_tf['Price'].dropna()
ax = sns.histplot(price_tf, kde=True, stat='density')
mu, sigma = norm.fit(price_tf)
grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='k')

# New figure so the probability plot does not overwrite the histogram.
fig = plt.figure()
res = probplot(data_cl_tf['Price'], plot=plt)

In [None]:
# Transformed KM distribution against a fitted normal curve, then a normal
# Q-Q plot. sns.distplot is deprecated, so the histogram + KDE come from
# sns.histplot and the fitted normal density is drawn explicitly.
km_tf = data_cl_tf['KM'].dropna()
ax = sns.histplot(km_tf, kde=True, stat='density')
mu, sigma = norm.fit(km_tf)
grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='k')

# New figure so the probability plot does not overwrite the histogram.
fig = plt.figure()
res = probplot(data_cl_tf['KM'], plot=plt)

In [None]:
# Display the first rows of the transformed frame.
data_cl_tf.head()

In [None]:
# Pairwise plots for the first six columns of the transformed frame;
# corner=True draws only the lower triangle of the grid.
first_six = data_cl_tf.iloc[:, :6]
sns.pairplot(first_six, corner=True)

In [None]:
# data_cl_tf.to_csv("ToyotaCorolla_Curated.csv", index=False)

## Data Modeling

### First product - Price prediction

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Display the column index of the transformed frame.
data_cl_tf.columns

In [None]:
# X = np.array( data_cl_tf.iloc[:, :-1] )
# y = np.array( data_cl_tf.iloc[:, -1] )

# NOTE(review): both `_` arguments are unfilled placeholders for the feature
# matrix and the target vector. The commented-out lines above show one earlier
# choice (all columns but the last vs. the last column); whether the target is
# actually the last column of data_cl_tf cannot be confirmed from here.
X = np.array( _ )
y = np.array( _ )

# Print the shapes of both arrays.
print(X.shape, y.shape)

In [None]:
# NOTE(review): `_` is an unfilled placeholder. The four-way unpacking needs a
# train/test split of X and y — presumably a train_test_split call (the file
# imports it); split ratio and seed are not specified anywhere — confirm.
X_train, X_test, y_train, y_test = _ 

# Print the shapes of the four resulting arrays.
print ( X_train.shape, X_test.shape, y_train.shape, y_test.shape )

In [None]:
# Gradient boosting regressor configured with n_estimators=100.
gbr = GradientBoostingRegressor(n_estimators=100)

# Train
# NOTE(review): the bare `_` below is an unfilled placeholder for the fitting
# call.
_

In [None]:
# Predictions
# NOTE(review): the bare `_` below is an unfilled placeholder. The plotting
# code that follows reads a variable named `predictions`, so presumably it
# should be assigned here — confirm.
_

# Metrics
# NOTE(review): `_` is an unfilled placeholder for the error computation.
mse_score = _

# Report the score with five decimal places.
print ("MSE: {0:.5f}".format(mse_score))

In [None]:
# Actual vs. predicted targets on the test split, plotted against one feature.
_, ax = plt.subplots(figsize=(6,5))
var_idx = 1
# NOTE(review): assumes column `var_idx` of X sits at the same position in
# data_cl_tf.columns — confirm against how X is built.
name_var = data_cl_tf.columns[var_idx]

# Hollow markers: true values; filled blue markers: model predictions.
ax.scatter(X_test[:, var_idx], y_test, facecolors="none", edgecolors='k', label='actual')
ax.scatter(X_test[:, var_idx], predictions, c='b', alpha=0.5, edgecolors='k', label='predicted')

# Label the axes (name_var was previously computed but never shown).
ax.set_xlabel(name_var)
ax.set_ylabel('target')
ax.legend()

In [None]:
# Linear regression model fitted on the training split.
lrm = LinearRegression()
lrm.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and score with mean squared error.
predictions = lrm.predict(X_test)
mse_score = mse(y_true=y_test, y_pred=predictions)

# Report the score with five decimal places.
print("MSE: {0:.5f}".format(mse_score))

In [None]:
# Actual vs. predicted targets on the test split, plotted against one feature.
_, ax = plt.subplots(figsize=(6,5))
var_idx = 0
# NOTE(review): assumes column `var_idx` of X sits at the same position in
# data_cl_tf.columns — confirm against how X is built.
name_var = data_cl_tf.columns[var_idx]

# Hollow markers: true values; filled blue markers: model predictions.
ax.scatter(X_test[:, var_idx], y_test, facecolors="none", edgecolors='k', label='actual')
ax.scatter(X_test[:, var_idx], predictions, c='b', alpha=0.5, edgecolors='k', label='predicted')

# Label the axes (name_var was previously computed but never shown).
ax.set_xlabel(name_var)
ax.set_ylabel('target')
ax.legend()