## Importations des librairies courantes

In [None]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score
import os
# List what is available in the input directory before loading anything.
print(os.listdir("../input"))
from pandas import read_csv

# The file is read with header=None and a whitespace-run delimiter, so the
# column names are supplied explicitly here.
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df = read_csv('../input/boston-house-prices/housing.csv', header=None, delimiter=r"\s+", names=column_names)
# Preview the first rows of the frame that was just loaded (`df`).
print(df.head(5))

## Lecture du fichier

In [None]:
# Display the first 10 rows of the loaded dataset.
df.head(10)

* CRIM per capita crime rate by town
* ZN proportion of residential land zoned for lots over 25,000 square feet
* INDUS proportion of non-retail business acres per town
* CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
* NOX nitric oxides concentration (parts per 10 million)
* RM average number of rooms per dwelling
* AGE proportion of owner-occupied units built prior to 1940
* DIS weighted distances to five Boston employment centres
* RAD index of accessibility to radial highways
* TAX full-value property-tax rate per 10,000 USD
* PTRATIO pupil-teacher ratio by town
* B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* LSTAT % lower status of the population
* MEDV - Median value of owner-occupied homes in 1000's USD

In [None]:
# Print the summary of the frame produced by DataFrame.info().
df.info()

In [None]:
# Number of non-missing entries per column.
df.count()

## Recherche de corrélations

In [None]:
# Pairwise correlation matrix of all columns (pandas default settings);
# reused by the heatmap, the clustermap and the MEDV ranking below.
tabcorr = df.corr()

In [None]:
# Heatmap of the absolute correlations: the sign is dropped so that strong
# negative and strong positive relationships are coloured alike.
plt.figure(figsize=(12, 12))
sns.heatmap(tabcorr.abs(), cmap="coolwarm")

### Regroupement des paramètres par clusters classés par proximité

In [None]:
# Same absolute-correlation matrix, with rows/columns reordered by
# seaborn's hierarchical clustering.
sns.clustermap(tabcorr.abs(), cmap="coolwarm")

### Dendrogramme des corrélations entre les caractéristiques

In [None]:
from scipy.cluster import hierarchy as hc

# Turn correlations into dissimilarities (1 - r): perfectly correlated
# features are at distance 0.
corr = 1 - df.corr()
# `linkage` expects the condensed (flattened upper-triangle) form of the
# square distance matrix.
corr_condensed = hc.distance.squareform(corr)
link = hc.linkage(corr_condensed, method='ward')

# Draw the tree horizontally so the feature names are readable on the axis.
plt.figure(figsize=(12, 12))
den = hc.dendrogram(
    link,
    labels=df.columns,
    orientation='left',
    leaf_font_size=10,
)

On s'intéresse plus précisément à la corrélation de chaque caractéristique avec la valeur des maisons (MEDV)

In [None]:
# Keep only the MEDV column of the correlation matrix: the correlation of
# every variable with the house value.
correlations = tabcorr["MEDV"]
print(correlations)

On élimine la ligne MEDV elle-même (qui est forcément à 1)

In [None]:
# Remove MEDV's correlation with itself. The series is rebuilt from `tabcorr`
# rather than from `correlations`, so the cell can be re-run safely: dropping
# 'MEDV' from an already-trimmed series would raise KeyError.
correlations = tabcorr["MEDV"].drop(['MEDV'], axis=0)

On considère les valeurs absolues, et on trie par ordre décroissant afin de montrer la corrélation la plus forte

In [None]:
# Rank features by strength of correlation with MEDV, ignoring the sign.
print(correlations.abs().sort_values(ascending=False))

On voit que la plus forte corrélation (en valeur absolue) est obtenue avec LSTAT, soit le pourcentage de la population de statut inférieur

## Régression linéaire multiple

In [None]:
# Target is MEDV; every other column is used as a feature.
y = df["MEDV"]
X = df.drop(columns=["MEDV"])

# Hold out 10% of the rows for evaluation, with a fixed seed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)

On utilise la fonction de régression linéaire multiple de sklearn 

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit ordinary linear regression on the training split (`fit` returns the
# estimator itself), then predict the held-out rows.
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)

On trace le nuage de points pour comparer la prédiction et les résultats attendus

In [None]:
plt.figure(figsize=(12, 12))
plt.scatter(y_test, y_pred)

# Red diagonal = perfect prediction (predicted price equals actual price).
price_bounds = [y_test.min(), y_test.max()]
plt.plot(price_bounds, price_bounds, color='red', linewidth=3)

plt.xlabel("Prix")
plt.ylabel("Prediction de prix")
plt.title("Prix reels vs predictions")

Visualisation de la distribution de l'erreur avec seaborn

In [None]:
# Distribution of the residuals (actual - predicted) for the linear model.
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is its documented replacement.
sns.histplot(y_test - y_pred, kde=True, stat="density", kde_kws=dict(cut=3))

Calcul de la racine de l'erreur quadratique moyenne (RMSE)

In [None]:
# Root mean squared error of the linear model on the held-out rows.
mse_lm = mean_squared_error(y_test, y_pred)
print(np.sqrt(mse_lm))

Score R² (coefficient de détermination : part de la variance de la cible expliquée par le modèle)

In [None]:
# Coefficient of determination of the linear model on the held-out rows.
scoreR2 = r2_score(y_true=y_test, y_pred=y_pred)
print(scoreR2)

## Régression par forêts aléatoires

In [None]:
# Rebuild the same features/target and the same seeded 90/10 split as in the
# linear-regression section, so both models are scored on identical rows.
y = df["MEDV"]
X = df.drop(columns=["MEDV"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)

In [None]:
from sklearn import ensemble

# Random forests are stochastic (bootstrap samples, feature sub-sampling).
# Seeding the estimator with the same seed as the split makes the score and
# the plots below reproducible from one run to the next.
rf = ensemble.RandomForestRegressor(random_state=1)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
# For regressors, `score` is the R^2 on the data passed in.
print(rf.score(X_test, y_test))

In [None]:
plt.figure(figsize=(12, 12))
plt.scatter(y_test, y_rf)

# Red diagonal = perfect prediction (predicted price equals actual price).
price_bounds = [y_test.min(), y_test.max()]
plt.plot(price_bounds, price_bounds, color='red', linewidth=3)

plt.xlabel("Prix")
plt.ylabel("Prediction de prix")
plt.title("Prix reels vs predictions")

In [None]:
# Distribution of the residuals (actual - predicted) for the random forest.
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is its documented replacement.
sns.histplot(y_test - y_rf, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Root mean squared error of the random forest on the held-out rows.
mse_rf = mean_squared_error(y_test, y_rf)
print(np.sqrt(mse_rf))

## XGBRegressor

In [None]:
import xgboost as XGB

# Gradient-boosted trees with the library's default hyper-parameters, trained
# and scored on the same split as the random forest.
xgb = XGB.XGBRegressor()
xgb.fit(X_train, y_train)
y_xgb = xgb.predict(X_test)
print(xgb.score(X_test, y_test))

plt.figure(figsize=(12, 12))
plt.scatter(y_test, y_xgb)

# Red diagonal = perfect prediction (predicted price equals actual price).
price_bounds = [y_test.min(), y_test.max()]
plt.plot(price_bounds, price_bounds, color='red', linewidth=3)

plt.xlabel("Prix")
plt.ylabel("Prediction de prix")
plt.title("Prix reels vs predictions")