In [1]:
### import
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
### load the data
# NOTE(review): the path is relative to the kernel's working directory, not to the
# Drive mounted above; the recorded run of this cell ended in FileNotFoundError.
# Confirm where the CSV actually lives and point the path there.
df = pd.read_csv('life_SENZA_NaN.csv')

FileNotFoundError: ignored

In [None]:
# Print column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV — TODO confirm).
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
### train/test split
# 75% / 25% shuffled split with a fixed seed, so the partition is the same on every run.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.25,
    shuffle=True,
    random_state=22,
)

# EDA

In [None]:
# Column dtypes and non-null counts of the training partition.
train_df.info()

In [None]:
# Summary statistics of the training partition.
train_df.describe()

In [None]:
### target
# Column label of the target. The literal ends with a space and is used verbatim
# as the column key everywhere below.
target_name = 'Life expectancy '

# Distribution of the target on the training partition.
target_values = train_df[target_name]
sns.displot(target_values)

In [None]:
# Correlation heatmap over the numeric columns only. The frame also holds object
# columns (e.g. Country), and DataFrame.corr() raises on those in pandas >= 2.0,
# where numeric_only defaults to False.
numeric_train_df = train_df.select_dtypes(include='number')

plt.figure(figsize=(12,12))
sns.heatmap(numeric_train_df.corr(), annot=True, fmt='.2f')
plt.show()

## Features importances con Pearson corr

In [None]:
# Absolute Pearson correlation of every numeric column with the target, largest first.
# Restricted to numeric columns: DataFrame.corr() raises on object columns in
# pandas >= 2.0. The target itself is part of the ranking (correlation 1 with itself).
corr_matrix = train_df.select_dtypes(include='number').corr()
sale_corr = corr_matrix[target_name].abs().sort_values(ascending=False)

plt.figure(figsize=(10,7))
sns.barplot(x=sale_corr.index, y=sale_corr)
plt.xticks(rotation=90, fontsize=12)
plt.show()

In [None]:
# Labels of the object-dtype (categorical) columns, kept as a pandas Index.
object_cols = [name for name, dtype in train_df.dtypes.items() if dtype == object]
categorical_features = train_df[object_cols].columns

In [None]:
### Target distribution for each categorical feature
# One figure per categorical column: box plot of the target grouped by that column's values.

for feature in categorical_features:
  plt.figure(figsize=(16,9))
  sns.boxplot(x=feature, y=target_name, data=train_df)
  plt.title(feature)
  plt.xticks(rotation=90, fontsize=13);

In [None]:
# Inspect the training rows of a single country.
israel_mask = train_df['Country'] == "Israel"
train_df[israel_mask]

In [None]:
### mean target per country, just out of curiosity
# A single groupby pass replaces the loop that filtered the whole frame once per
# country; the result is still a {country: mean target} dict.
country_dict = train_df.groupby('Country')[target_name].mean().to_dict()

In [None]:
# Turn the per-country means into a Series and display it from highest to lowest
# (the sorted copy is only displayed; country_series itself stays unsorted).
country_series = pd.Series(country_dict)
country_series.sort_values(ascending=False)

## Features importances con Random Forest

In [None]:
### fitting
from sklearn.ensemble import RandomForestRegressor

# Predictors for the importance model: everything except the target and the
# categorical (object-dtype) columns.
rf_inputs = train_df.drop(columns=[target_name, *categorical_features])

# Fixed seed (same value as the train/test split) so the importance ranking, which
# drives the feature order used by the cells below, is the same on every run.
model = RandomForestRegressor(random_state=22)
model = model.fit(rf_inputs, train_df[target_name])

In [None]:
# Feature importances of the fitted forest, labelled by feature name, largest first.
raw_importances = pd.Series(data=model.feature_importances_, index=model.feature_names_in_)
importances = raw_importances.sort_values(ascending=False)

In [None]:
# Bar chart of the ranked feature importances (feature names rotated for readability).
plt.figure(figsize=(10,7))
sns.barplot(x=importances.index, y=importances)
plt.xticks(rotation=90, fontsize=12)
plt.show()

# MODELING

## Features Selection

In [None]:
# Show the ranked importances before selecting features.
importances

In [None]:
# Keep every ranked feature, in importance order, and display the list.
features = importances.index.tolist()
features

In [None]:
# Independent copies of the selected predictors and of the target for both partitions.
X_train, X_test = (frame[features].copy() for frame in (train_df, test_df))
y_train, y_test = (frame[target_name].copy() for frame in (train_df, test_df))

## Models

### Linear Regression

In [None]:
### LINEAR REGRESSION
# Refit a linear model on the top-n features (ranked by `importances`) for every n and
# print n, the train R2 and the test R2, to see how the fit changes as features are added.
# The last iteration leaves `features`, `X_train` and `X_test` holding the full ranked
# set, which the cells below reuse.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# The targets do not depend on the feature subset, so build them once.
y_train = train_df[target_name].copy()
y_test = test_df[target_name].copy()

# Bound the loop by the number of ranked features. The previous bound,
# len(train_df.columns), also counted the target and the categorical columns, so the
# slice ran past the end of `importances` and the last iterations silently repeated
# the full feature set.
for n_features in range(1, len(importances) + 1):
  features = list(importances.index[:n_features])

  X_train = train_df[features].copy()
  X_test = test_df[features].copy()

  model = LinearRegression()
  model = model.fit(X_train, y_train)

  train_preds = model.predict(X_train)
  test_preds = model.predict(X_test)

  ### evaluation
  r2_train = r2_score(y_train, train_preds)
  r2_test = r2_score(y_test, test_preds)

  print(n_features)
  print(r2_train)
  print(r2_test)
  print()

### k-NN

In [None]:
### k-NN: scan the number of neighbours
# Fit a k-nearest-neighbours regressor for each k and plot train vs test R2.

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# Single definition of the scanned values, shared by the loop and by the plot,
# so the x axis cannot drift out of sync with the scores.
k_range = range(1,20)

r2_train_list = list()
r2_test_list = list()
for k in k_range:
    model = KNeighborsRegressor(n_neighbors=k)
    model = model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    ### evaluation
    # Score each partition once and reuse the value (it used to be computed twice).
    r2_train = r2_score(y_train, train_preds)
    r2_test = r2_score(y_test, test_preds)

    r2_train_list.append(r2_train)
    r2_test_list.append(r2_test)

plt.plot(k_range, r2_train_list, label='TRAIN')
plt.plot(k_range, r2_test_list, label='TEST')
plt.xlabel('k-neighbors')
plt.ylabel('r2')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import r2_score

# Refit k-NN with the chosen number of neighbours.
k = 4
model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)

train_preds, test_preds = model.predict(X_train), model.predict(X_test)

### evaluation
# Train R2 first, then test R2.
for y_true, y_pred in ((y_train, train_preds), (y_test, test_preds)):
    print(r2_score(y_true, y_pred))

### Random Forest

In [None]:
### TUNING max_depth
# Fit one forest per max_depth value and plot train vs test R2.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from tqdm import tqdm

r2_train_list = list()
r2_test_list = list()
max_depth_range = range(1,20)

for d in tqdm(max_depth_range):
    # Fixed seed: without it every forest is drawn differently, so part of the
    # variation along the curve would be seed noise rather than the effect of max_depth.
    model = RandomForestRegressor(max_depth=d, random_state=22)
    model = model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    ### evaluation
    # Score each partition once and reuse the value (it used to be computed twice).
    r2_train = r2_score(y_train, train_preds)
    r2_test = r2_score(y_test, test_preds)

    r2_train_list.append(r2_train)
    r2_test_list.append(r2_test)

plt.plot(max_depth_range, r2_train_list, label='TRAIN')
plt.plot(max_depth_range, r2_test_list, label='TEST')
plt.xlabel('max_depth')
plt.ylabel('r2')
plt.legend()
plt.show()

In [None]:
### TUNING n_estimators
# Fit one forest per n_estimators value and plot train vs test R2.

from sklearn.metrics import r2_score

r2_train_list = list()
r2_test_list = list()
n_estimators_range = range(1,20)

for n in n_estimators_range:
    # Fixed seed so the differences along the curve come from n_estimators,
    # not from a different random draw at every point.
    model = RandomForestRegressor(n_estimators=n, random_state=22)
    model = model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    ### evaluation
    # Score each partition once and reuse the value (it used to be computed twice).
    r2_train = r2_score(y_train, train_preds)
    r2_test = r2_score(y_test, test_preds)

    r2_train_list.append(r2_train)
    r2_test_list.append(r2_test)


plt.plot(n_estimators_range, r2_train_list, label='TRAIN')
plt.plot(n_estimators_range, r2_test_list, label='TEST')
plt.xlabel('n_estimators')
plt.ylabel('r2')
plt.legend()
plt.show()

In [None]:
### final model
# Forest with the hyper-parameters picked from the two tuning plots. The seed is fixed
# so the printed scores, and the predictions made later with this model, are reproducible.

model = RandomForestRegressor(n_estimators=10, max_depth=7, random_state=22)
model = model.fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

### evaluation
from sklearn.metrics import r2_score

# Train R2 first, then test R2.
print(r2_score(y_train, train_preds))
print(r2_score(y_test, test_preds))

## Variabili Categoriche

In [None]:
# Names of the object-dtype columns, this time as a plain list.
categorical_features = [name for name, dtype in train_df.dtypes.items() if dtype == object]
print(categorical_features)

In [None]:
# Names of every column whose dtype is not object.
numerical_features = [name for name, dtype in train_df.dtypes.items() if dtype != object]
print(numerical_features)

In [None]:
# Label encoding for Country, used when a column has many distinct values
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit a label encoder on the Country column and display the resulting integer codes
# (the result is only shown here, not stored).
enc =LabelEncoder()
enc.fit_transform(train_df["Country"])

In [None]:
# Replace the country names with the integer codes produced by the label encoder.
# .assign() builds a new frame instead of writing a column into `train_df` in place:
# `train_df` comes out of train_test_split, and an in-place column assignment on it
# can trigger pandas' SettingWithCopyWarning.
# NOTE(review): only the training frame is encoded here; test_df is left untouched.
train_df = train_df.assign(Country=enc.fit_transform(train_df["Country"]))

In [None]:
# One-hot encoding, used when a column has few distinct values, e.g. Status
from sklearn.preprocessing import OneHotEncoder

# The encoder expects a 2-D input, hence the single-column reshape.
status_column = train_df["Status"].to_numpy().reshape(-1,1)
enc = OneHotEncoder()
enc.fit_transform(status_column).toarray()


In [None]:
#

## Previsioni Finali

In [None]:
# Load the hold-out set used for the final predictions.
hold_out_df = pd.read_csv('life_hold_out.csv')

In [None]:
### predictions
# Predict on the hold-out rows with the current `model`, using the same feature columns,
# and keep the hold-out index on the result.
X_hold_out = hold_out_df[features].copy()
new_predictions = pd.Series(model.predict(X_hold_out), index=hold_out_df.index)

In [None]:
# True target values for the hold-out set, read from their own CSV.
hold_out_targets_df = pd.read_csv('life_hold_out_targets.csv')
targets = hold_out_targets_df['Life expectancy ']

In [None]:
# Compare the hold-out predictions with the true targets.
# NOTE(review): `evaluate` is not defined or imported anywhere in the visible notebook;
# unless it comes from a cell or module not shown here, this call raises NameError — confirm.
metrics = evaluate(targets, new_predictions, title='Predictions', plots=True, verbose=True)

# CROSS-VALIDATION

Per evitare problemi di overfitting è pratica comune usare un metodo chiamato 'cross-validation'. In pratica, in fase di fitting del modello, invece di fare un solo splitting tra dati di training e dati di test, si divide il dataset in più split e si valutano le performance del modello per ogni split. Scikit-learn fornisce già delle funzioni di cross-validation (nel modulo `sklearn.model_selection`) che si possono applicare a qualsiasi modello.

## K-Fold

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

# 20-fold cross-validation of the current model on the training data,
# collecting R2 on both the fitting folds and the held-out fold.
kfold = KFold(n_splits=20)
results = cross_validate(
    model,
    X_train,
    y_train,
    cv=kfold,
    scoring=['r2'],
    return_train_score=True,
)

In [None]:
# Show the raw cross_validate output (per-fold timings and scores).
display(results)

In [None]:
### The final figure is simply the mean over the folds:

train_r2, test_r2 = (np.mean(results[key]) for key in ('train_r2', 'test_r2'))
print('Train r2: ', round(train_r2, 3))
print('Test r2: ', round(test_r2, 3))

In [None]:
### Cross-validation can also report several scoring metrics. This lists the names of all the predefined ones.
# `sklearn.metrics.SCORERS` was removed in scikit-learn 1.3, and the bare `sklearn`
# name is never imported in the visible notebook (only `from sklearn... import ...`),
# so the old expression raised. get_scorer_names() is the supported replacement.
from sklearn.metrics import get_scorer_names

get_scorer_names()

In [None]:
### with two scoring metrics
scoring_metrics = ['r2','neg_mean_absolute_error']
results = cross_validate(
    model,
    X_train,
    y_train,
    cv=KFold(n_splits=20),
    scoring=scoring_metrics,
    return_train_score=True,
)
display(results)

## ShuffleSplit

In [None]:
### To change the splitting strategy, just change the cv parameter

from sklearn.model_selection import ShuffleSplit

# ShuffleSplit draws its splits at random; fixing the seed (same value as the
# train/test split) makes the reported means reproducible.
shuffle_cv = ShuffleSplit(n_splits=10, random_state=22)

results = cross_validate(model, X_train, y_train, cv=shuffle_cv, scoring=['r2'], return_train_score=True)
train_r2 = np.mean(results['train_r2'])
test_r2 = np.mean(results['test_r2'])
print('Train r2: ', round(train_r2, 3))
print('Test r2: ', round(test_r2, 3))

## Model-selection con CV

Riprendendo il tuning di n_estimators...

In [None]:
### TUNING n_estimators
# For each forest size, score the model with 10-fold cross-validation on the training
# data and plot the mean R2 on the fitting folds (TRAIN) and on the held-out folds (TEST).

r2_train_list = list()
r2_test_list = list()
n_estimators_range = range(1,20)

for n in tqdm(n_estimators_range):
    # cross_validate clones the estimator and fits it on every fold by itself, so the
    # extra fit/predict on the full train/test partition that used to sit here was
    # unused work and has been removed.
    # Fixed seed so the differences along the curve come from n_estimators, not the draw.
    model = RandomForestRegressor(n_estimators=n, random_state=22)

    results = cross_validate(model, X_train, y_train, cv=KFold(n_splits=10), scoring=['r2'], return_train_score=True)
    train_r2 = np.mean(results['train_r2'])
    test_r2 = np.mean(results['test_r2'])

    r2_train_list.append(train_r2)
    r2_test_list.append(test_r2)


plt.plot(n_estimators_range, r2_train_list, label='TRAIN')
plt.plot(n_estimators_range, r2_test_list, label='TEST')
plt.xlabel('n_estimators')
plt.ylabel('r2')
plt.legend()
plt.show()

# HYPER-PARAMETERS TUNING

Come abbiamo visto, il processo di tuning può essere fatto anche 'a spanne' valutando le performance di un modello al variare di ogni singolo parametro del modello (vedi max_depth e n_estimators per il RandomForest). Quando però i parametri cominciano a essere molti, diventa difficile fare una valutazione oggettiva. Bisognerebbe poter valutare le performance del modello per ogni singola combinazione dei suoi iper-parametri. In effetti questo è proprio quello che fa GridSearchCV(): esegue la valutazione delle performance (in cross-validation, per questo c'è il CV) per ogni combinazione degli iper-parametri del modello.

Questo metodo è ovviamente molto dispendioso dal punto di vista computazionale, e diventa tanto più dispendioso quanto più aumentano i valori associati a ogni singolo parametro (il cosiddetto spazio dei parametri). Per questo esistono anche altri metodi per indirizzare la ricerca del miglior modello nello spazio dei parametri. Noi qui vedremo solo RandomizedSearchCV(), che di fatto non valuta ogni singola combinazione ma ne campiona a caso un numero prefissato (parametro n_iter), facendo quindi una ricerca 'randomizzata'.

## GridSearchCV()

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: every combination of these values is evaluated with 10-fold CV.
parameters = {
    'n_estimators':np.arange(1,10), 
    'max_depth':np.arange(1,10),
    # 1.0 means "use all features". It replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # is rejected as an invalid parameter value.
    'max_features':[1.0, 'sqrt', 'log2'],
    }

# Fixed seed on the forest so that candidates are compared on equal terms and the
# selected configuration is reproducible.
model = GridSearchCV(RandomForestRegressor(random_state=22), param_grid=parameters, cv=KFold(n_splits=10), scoring='r2')

In [None]:
# Run the grid search on the training data.
model = model.fit(X_train, y_train)

In [None]:
# Estimator with the best mean cross-validated score found by the search.
model.best_estimator_

In [None]:
# Mean cross-validated score of that best candidate.
model.best_score_

## RandomizedSearchCV()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Same search space as the grid search; here only a random sample of the
# combinations is evaluated.
parameters = {
    'n_estimators':np.arange(1,10), 
    'max_depth':np.arange(1,10),
    # 1.0 means "use all features". It replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # is rejected as an invalid parameter value.
    'max_features':[1.0, 'sqrt', 'log2']
    }


# Seeds fixed on both the forest and the search, so the sampled candidates and
# their scores are the same on every run.
model = RandomizedSearchCV(RandomForestRegressor(random_state=22), param_distributions=parameters, cv=KFold(n_splits=10), scoring='r2', random_state=22)

In [None]:
# Run the randomized search on the training data.
model = model.fit(X_train, y_train)

In [None]:
# Estimator with the best mean cross-validated score among the sampled candidates.
model.best_estimator_

In [None]:
# Mean cross-validated score of that best candidate.
model.best_score_