# **🗃️ Data Loading**

Useful datasets:
- [Toyota Cars](https://drive.google.com/file/d/1mPrg8J272y9EOOE0GdGjybDCOUmj9Wf6/view?usp=sharing)
- [Houses in Iowa](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data?select=train.csv)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from scipy.stats import probplot, norm

## Load data and take a quick look

In [None]:
# Read the dataset from 'train.csv' (expected in the working directory).
data = pd.read_csv('train.csv')

# List the column names, then preview the first rows.
print (data.columns)
data.head()

In [None]:
# Print information about data variables (dtype and non-null count per column)
data.info()

In [None]:
# Show descriptive stats about the dataset
data.describe()

In [None]:
# sns.scatterplot( _ )
# sns.boxplot( _ )

## Data Cleansing

In [None]:
# Number of missing values per column, most affected columns first
num_missing = data.isnull().sum().sort_values(ascending=False)
# Fraction of rows that are missing per column (isnull().count() is the row count)
missing_percentage = (data.isnull().sum() / data.isnull().count()).sort_values(ascending=False)

# Summary table: one row per column with absolute and relative missing counts
missing = pd.concat([num_missing, missing_percentage], axis = 1, keys=['Total', 'Percentage'])
# Displayed last so the notebook actually renders it
missing.head(20)

In [None]:
# Rows of the summary table for columns with more than 15% missing data
drop_columns = missing[missing['Percentage'] > 0.15]
print(drop_columns)

In [None]:
# Drop every column with more than 15% missing values.
# The labels are passed through `columns=` because DataFrame.drop no longer
# accepts `axis` as a positional argument in pandas 2.0+.
data_cl = data.drop(columns=missing[missing['Percentage'] > 0.15].index)

# Show remaining missing
data_cl.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# Report the columns in order of how many values they are missing
for col in data_cl.isnull().sum().sort_values(ascending=False).keys().tolist():
  print(col)

# Drop every row that has a missing value in any column. One dropna() call
# replaces the per-column loop, which rebuilt the frame once per column.
# (Equivalent as long as the row index is unique, as read_csv produces.)
data_cl = data_cl.dropna()


In [None]:
# Verify missing values: the largest per-column missing count must be 0.
# (max, not min: a min of 0 only shows that *some* column is complete.)
print (data_cl.isnull().sum().max())
print (len(data_cl))

## Outliers

In [None]:
# List the columns that remain after cleaning
data_cl.columns

In [None]:
# Standardize SalePrice; reshape(-1, 1) gives the single-column 2-D array the scaler expects
scaled_data_cl = StandardScaler().fit_transform(data_cl['SalePrice'].values.reshape(-1,1))

# Sort the standardized prices once, then inspect the 10 lowest and the
# 10 highest values as outlier candidates
sorted_scaled = scaled_data_cl[scaled_data_cl[:, 0].argsort()]
lower_bound = sorted_scaled[:10]
upper_bound = sorted_scaled[-10:]
print(lower_bound, upper_bound)

In [None]:
# Histogram of SalePrice with a KDE overlay, using one bin per 10 rows (rounded up)
n_bins = len(np.arange(0, len(data_cl), 10))
sns.histplot(data=data_cl, x='SalePrice', kde=True, bins=n_bins)

## Normality test

In [None]:
data_cl.info()
data_cl = data_cl.select_dtypes(include = ['float64', 'int64']) # Keep only the float64/int64 columns (nothing is converted; other dtypes are dropped)
data_cl.info()

In [None]:
# Histogram of SalePrice with a fitted normal curve, followed by a normal
# probability plot of the same column on a separate figure.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to histplot/displot.
sns.distplot(data_cl['SalePrice'], fit = norm);
fig = plt.figure()
res = probplot(data_cl['SalePrice'], plot = plt)

In [None]:
# Histogram with fitted normal curve, then a normal probability plot, for TotalBsmtSF
basement_area = data_cl['TotalBsmtSF']
sns.distplot(basement_area, fit=norm)
fig = plt.figure()
res = probplot(basement_area, plot=plt)

In [None]:
# Data transformation: natural log of every non-zero value
data_cl_tf = data_cl.copy()

for col in data_cl.columns.tolist():
  # Zeros are left untouched because log(0) is undefined
  nonzero = data_cl[col] != 0
  # Cast first so integer columns can hold the fractional log values
  data_cl_tf[col] = data_cl_tf[col].astype('float64')
  # Write through a single .loc[rows, col] call: the chained form
  # `df[col].loc[mask] = ...` assigns into a temporary and is not guaranteed
  # to reach the DataFrame (it is a no-op under pandas Copy-on-Write)
  data_cl_tf.loc[nonzero, col] = np.log(data_cl.loc[nonzero, col])

In [None]:
# Histogram and normal probability plot of the transformed data:

log_price = data_cl_tf['SalePrice']
sns.distplot(log_price, fit=norm)
fig = plt.figure()
res = probplot(log_price, plot=plt)

In [None]:
# Histogram with fitted normal curve, then a normal probability plot, for the transformed TotalBsmtSF
log_basement_area = data_cl_tf['TotalBsmtSF']
sns.distplot(log_basement_area, fit=norm)
fig = plt.figure()
res = probplot(log_basement_area, plot=plt)

In [None]:
# Preview the first rows of the transformed data
data_cl_tf.head()

In [None]:
# Pairwise plots of every column of the transformed data; corner=True draws
# only the lower triangle of the grid
sns.pairplot(data_cl_tf, corner=True)

In [None]:
# Remove the 'Id' column before modeling
data_cl_tf = data_cl_tf.drop(columns=['Id'])

## Data Modeling

### First product - Sale price prediction

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [None]:
# NOTE(review): `_` is an unfilled exercise placeholder. Supply the feature
# matrix for X and the target vector for y (presumably SalePrice from the
# transformed data is the target; confirm before running).
X = np.array( _ )
y = np.array( _ )

# Sanity-check that X and y have matching numbers of rows
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1)

# Shapes of the four resulting arrays
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Fit a gradient-boosting regressor with 100 boosting stages on the training split
gbr = GradientBoostingRegressor(n_estimators=100)
gbr.fit (X_train, y_train)

In [None]:
# Predict on the held-out split and score against y_test with mean squared error
predictions = gbr.predict(X_test)
mse_score = mse(y_test, predictions)

print ("MSE: {0:.5f}".format(mse_score))

In [None]:
# Compare true and predicted targets against the first feature column of X_test
_, ax = plt.subplots(figsize=(6,5))

# Hollow markers: true values
ax.scatter(X_test[:, 0], y_test, facecolors="none", edgecolors='k')
# Semi-transparent filled markers: model predictions
ax.scatter(X_test[:, 0], predictions, alpha=0.5, edgecolors='k')

In [None]:
# Fit a linear regression on the same training split
lrm = LinearRegression()
lrm.fit (X_train, y_train)

In [None]:
# Predict on the held-out split and score with mean squared error
# (this overwrites `predictions` and `mse_score` from the gradient-boosting cells)
predictions = lrm.predict(X_test)
mse_score = mse(y_test, predictions)

print ("MSE: {0:.5f}".format(mse_score))

In [None]:
# Compare true and predicted targets against the first feature column of X_test
_, ax = plt.subplots(figsize=(6,5))

# Hollow markers: true values
ax.scatter(X_test[:, 0], y_test, facecolors="none", edgecolors='k')
# Semi-transparent filled markers: model predictions
ax.scatter(X_test[:, 0], predictions, alpha=0.5, edgecolors='k')

### Second product - Quality estimation

In [None]:
data_cl_tf2 = data_cl_tf.copy()

# NOTE(review): `_` is an unfilled exercise placeholder. `vals` must be the
# ordered sequence of OverallQual values to encode (confirm the intended order).
vals = _
# Map each value to its position in `vals`; on duplicates the first position wins
codes = {}
for i, val in enumerate(vals): codes.setdefault(val, i)
# Apply the whole mapping in one call and assign the result back. The previous
# `df[col].replace(..., inplace=True)` acted on a column selection, which is not
# guaranteed to update the DataFrame (it does not under pandas Copy-on-Write),
# and its successive passes could re-replace an already encoded value.
data_cl_tf2['OverallQual'] = data_cl_tf2['OverallQual'].replace(codes)


In [None]:
print (data_cl_tf2.columns)
# NOTE(review): `_` is an unfilled exercise placeholder. X is assembled by
# joining two arrays column-wise (axis=1) and y is a single array; presumably
# the two pieces are the feature columns on either side of the target column.
# Confirm before running.
X = np.concatenate([ _ , _ ], axis=1)
y = np.array( _ )

# Sanity-check that X and y have matching numbers of rows
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
# This overwrites the train/test arrays created for the sale-price model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1)

# Shapes of the four resulting arrays
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1070, 33) (268, 33) (1070,) (268,)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix