# Sessió 1: Introducció a l'Aprenentatge Automàtic

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

### Programació tradicional vs. Aprenentatge automàtic

#### Tradicional

In [None]:
def te_febre(T):
    """Hand-written rule: return 1 if T is at least 37, otherwise 0."""
    if T >= 37:
        return 1
    return 0

In [None]:
# Evaluate the hand-written rule on two temperatures; the resulting tuple is
# the value of the cell. (Both calls must sit in one expression: a line that
# starts with a comma is a syntax error.)
te_febre(36.5), te_febre(35.9)

In [None]:
# Print the rule's answer for one temperature above and one below 37.
for temperatura in (37.5, 35.5):
    print(te_febre(temperatura))

#### Aprenentatge automàtic

In [None]:
# Small labelled dataset: three temperature readings and, for each one, the
# 0/1 label stored in the 'Té febre?' column.
lectures = [36.5, 37.5, 35.9]
etiquetes = [0, 1, 0]
df = pd.DataFrame({'Temperatura': lectures, 'Té febre?': etiquetes})
df

In [None]:
# Learn the fever rule from the labelled rows of `df` with a decision tree,
# then ask the fitted tree to label the same three temperatures.
febre_dtree = DecisionTreeClassifier()
febre_dtree.fit(X=df[["Temperatura"]], y=df["Té febre?"])

mostres = pd.DataFrame({"Temperatura": [36.5, 37.5, 35.9]})
febre_dtree.predict(mostres)

In [None]:
# Draw the fitted tree with readable feature/class names and without the
# impurity value in each node. The trailing semicolon is presumably there to
# suppress the notebook's textual echo of the returned artists.
plot_tree(
    febre_dtree,
    feature_names=['Temperatura'],
    class_names=['No', 'Sí'],
    impurity=False,
);

## Exemple sintètic amb una sinusoide

In [None]:
def true_fun(X):
    """Return cos(1.5 * pi * X), the noise-free curve the samples are drawn from."""
    angle = 1.5 * np.pi * X
    return np.cos(angle)

# Fixed seed so the random sample below is the same on every run.
np.random.seed(0)

n_samples = 30
degrees = [1, 4, 15]  # polynomial degrees compared in the plots below

# Draw the inputs first and the noise second: the order of the two random
# calls determines the generated values.
X = np.random.rand(n_samples)
X.sort()
noise = 0.1 * np.random.randn(n_samples)
y = true_fun(X) + noise

### Sobreajust i sotaajust

In [None]:
# Fit one polynomial regression per degree on all the samples and draw each
# fitted curve next to the true function, one panel per degree. The MSE in
# each title is measured on the same points the model was fitted on.
plt.figure(figsize=(14, 5))
n_panels = len(degrees)
X_ticks = np.linspace(0, 1, 100)  # dense grid used to draw the curves
X_column = X[:, np.newaxis]  # samples as a single-column 2-D array

for i, degree in enumerate(degrees):
    ax = plt.subplot(1, n_panels, i + 1)
    ax.set(xticks=(), yticks=())

    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline(
        steps=[
            ("polynomial_features", polynomial_features),
            ("linear_regression", linear_regression),
        ]
    )
    pipeline.fit(X_column, y)

    # Error on the fitting data itself.
    y_pred = pipeline.predict(X_column)
    mse = mean_squared_error(y, y_pred)

    plt.plot(X_ticks, pipeline.predict(X_ticks[:, np.newaxis]), label="Model")
    plt.plot(X_ticks, true_fun(X_ticks), label="True function")
    plt.scatter(X, y, edgecolor="b", s=20, label="Samples")
    ax.set(xlabel="x", ylabel="y", xlim=(0, 1), ylim=(-2, 2))
    plt.legend(loc="best")
    ax.set_title(f"Degree {degree}\nMSE = {mse:.2e}")
plt.show()

### Separació del conjunt de dades

In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state keeps the
# split the same on every run.
split = train_test_split(X, y, random_state=0, test_size=0.25)
X_train, X_test, y_train, y_test = split

In [None]:
# Same comparison as before, but each model is fitted on the training split
# only and scored with 10-fold cross-validation on that split. The held-out
# test samples are drawn in red for reference.
plt.figure(figsize=(14, 5))
n_panels = len(degrees)
X_ticks = np.linspace(0, 1, 100)  # dense grid used to draw the curves
X_train_column = X_train[:, np.newaxis]  # training inputs as a 2-D column

for i, degree in enumerate(degrees):
    ax = plt.subplot(1, n_panels, i + 1)
    ax.set(xticks=(), yticks=())

    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline(
        steps=[
            ("polynomial_features", polynomial_features),
            ("linear_regression", linear_regression),
        ]
    )
    pipeline.fit(X_train_column, y_train)

    # Evaluate the model using cross-validation. The scorer returns the
    # negated MSE, which is why the mean is sign-flipped in the title.
    scores = cross_val_score(
        pipeline,
        X_train_column,
        y_train,
        scoring="neg_mean_squared_error",
        cv=10,
    )

    plt.scatter(X_train, y_train, edgecolor='b', s=20, label="Train samples")
    plt.scatter(X_test, y_test, color='r', s=20, label="Test samples")
    plt.plot(X_ticks, pipeline.predict(X_ticks[:, np.newaxis]), label="Model")
    plt.plot(X_ticks, true_fun(X_ticks), label="True function")
    ax.set(xlabel="x", ylabel="y", xlim=(0, 1), ylim=(-2, 2))
    plt.legend(loc="best")
    ax.set_title(
        f"Degree {degree}\nMSE = {-scores.mean():.5} (+/- {scores.std():.2})"
    )
plt.show()

#### Extra: coeficients del model

In [None]:
# Fit a degree-4 polynomial regression step by step (without a Pipeline) so
# the learned intercept and coefficients can be inspected directly.
polynomial_features = PolynomialFeatures(degree=4, include_bias=False)
poly = polynomial_features.fit_transform(X_train[:, np.newaxis])

# Use a fresh estimator instead of the `linear_regression` left over from the
# plotting loop: that object is a step of the last `pipeline`, so refitting it
# on these 4 features would leave that pipeline inconsistent, and this cell
# would only work after the loop had been run.
linear_regression = LinearRegression()
linear_regression.fit(poly, y_train)

# Displayed as the cell's value: (intercept, coefficients of x, x^2, x^3, x^4).
linear_regression.intercept_, linear_regression.coef_