In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# pandas: data manipulation library

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error,r2_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Lecture du fichier**

In [None]:
# Load the housing CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/bostonhoustingmlnd/housing.csv",
)

In [None]:
# Preview the first 10 rows, transposed so each original column becomes a row.
df.iloc[:10].transpose()

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of non-missing values in each column.
df.notna().sum()

In [None]:
# Interpret MEDV as datetimes and store the year and month parts as new columns.
# NOTE(review): MEDV is used elsewhere in this notebook as a numeric target
# (it is compared against 1000000 and regressed on), not as a date. Parsing
# plain numbers as datetimes most likely yields timestamps right at the Unix
# epoch, i.e. a single constant year/month for every row — confirm whether a
# real date column was intended here.
medv_as_dates = pd.DatetimeIndex(df['MEDV'])
df['year'] = medv_as_dates.year
df['month'] = medv_as_dates.month

In [None]:
# Bar chart of the mean MEDV for each (year, month) group.
(
    df.groupby(['year', 'month'])['MEDV']
    .mean()
    .plot.bar(figsize=(12, 8))
)

In [None]:
# Bar chart of the mean RM for each (year, month) group.
(
    df.groupby(['year', 'month'])['RM']
    .mean()
    .plot.bar(figsize=(12, 8))
)

In [None]:
# Bar chart of the mean PTRATIO for each (year, month) group.
(
    df.groupby(['year', 'month'])['PTRATIO']
    .mean()
    .plot.bar(figsize=(12, 8))
)

In [None]:
# Bar chart of the number of non-null LSTAT values in each (year, month) group.
(
    df.groupby(['year', 'month'])['LSTAT']
    .count()
    .plot.bar(figsize=(12, 8))
)

# **Recherche des corrélations**

In [None]:
# Pairwise correlation matrix of the columns of df.
tabcorr = df.corr()

# A column with zero variance (a constant) has an undefined correlation with
# everything, so its whole row and column come out as NaN. Those all-NaN
# rows/columns are dropped here: they carry no information, they show up as
# blank bands in a heatmap, and hierarchical clustering cannot run on
# non-finite values. When no column is constant this is a no-op.
tabcorr = tabcorr.dropna(axis=0, how='all').dropna(axis=1, how='all')

In [None]:
# Display the correlation matrix as the cell output.
tabcorr

In [None]:
# Heatmap of the absolute correlations (the sign is discarded so that strong
# negative and strong positive correlations get the same colour intensity).
plt.figure(figsize=(12, 12))
sns.heatmap(data=tabcorr.abs(), cmap="coolwarm")

***Regroupons les paramètres par clusters***

In [None]:
# Clustered heatmap of the absolute correlations: rows and columns are
# reordered so that similar variables end up next to each other.
sns.clustermap(data=tabcorr.abs(), cmap="coolwarm")

**Le dendrogramme va nous permettre de voir les paramètres les plus corrélés entre eux**

In [None]:
from scipy.cluster import hierarchy as hc

# Turn correlations into distances: perfectly correlated -> 0, uncorrelated -> 1,
# perfectly anti-correlated -> 2.
corr = 1 - df.corr()

# Constant columns have undefined (NaN) correlations; drop their all-NaN
# rows/columns because linkage requires finite distances.
corr = corr.dropna(axis=0, how='all').dropna(axis=1, how='all')

# squareform() validates that its input is exactly symmetric with an exactly
# zero diagonal. Floating-point round-off in the correlation computation can
# violate both, so enforce them explicitly on a private copy of the values.
dist_matrix = corr.to_numpy(copy=True)
dist_matrix = (dist_matrix + dist_matrix.T) / 2
np.fill_diagonal(dist_matrix, 0.0)

corr_condensed = hc.distance.squareform(dist_matrix)
link = hc.linkage(corr_condensed, method='ward')
plt.figure(figsize=(12,12))
# Labels are taken from the distance matrix itself so that their number always
# matches the number of leaves, even after columns were dropped above.
den = hc.dendrogram(link, labels=corr.columns.tolist(), orientation='left', leaf_font_size=10)

In [None]:
# Correlation of every column with MEDV (one column of the correlation matrix).
correlations = tabcorr['MEDV']
print(correlations)

*On observe des corrélations négatives entre MEDV et LSTAT et entre MEDV et PTRATIO*

In [None]:
# Same correlations ranked by strength: absolute value, strongest first.
print(correlations.abs().sort_values(ascending=False))

*La fonction ci-dessus nous a donné les corrélations en valeur absolue ; on constate une forte corrélation entre LSTAT et MEDV*

# **Par la méthode de Régression linéaire multiple**

*On vérifie s'il n'y a pas de valeurs nulles*

In [None]:
# Total number of missing values across the whole DataFrame.
df.isna().to_numpy().sum()

In [None]:
# Display the column labels of df.
df.columns

In [None]:
# Column names grouped by how the analysis treats them.
# NOTE(review): continuous_features is not referenced by any other visible cell.
continuous_features = ['LSTAT','PTRATIO','MEDV']
# Columns in this list are removed from the data used for the linear regression.
discrete_features = ['RM']

*RM est un caractère discret car il s'agit du nombre moyen de pièces par logement*

In [None]:
# Keep only the rows whose MEDV is below 1,000,000, then remove the columns
# listed in discrete_features.
df1 = (
    df.loc[df['MEDV'] < 1_000_000]
    .drop(columns=discrete_features)
)

In [None]:
# Features are every column of df1 except the target; the target is MEDV.
y = df1['MEDV']
X = df1.drop(columns=['MEDV'])

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)



In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Train an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so it can be chained).
lm = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test split.
y_pred = lm.predict(X_test)

In [None]:
# Predicted vs. actual MEDV on the test split. Points on the red diagonal are
# perfect predictions.
medv_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(12, 12))
plt.scatter(y_test, y_pred)
plt.plot(medv_bounds, medv_bounds, color='red', linewidth=3)
plt.xlabel("MEDV")
plt.ylabel("Prediction de MEDV")
plt.title("MEDV reels vs predictions")

*On va utiliser seaborn pour visualiser la distribution de l'erreur*

In [None]:
# Distribution of the residuals (actual minus predicted) of the linear model.
# sns.distplot is deprecated in seaborn; histplot with a density-normalised
# histogram and a KDE overlay is the documented equivalent.
sns.histplot(y_test - y_pred, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Root mean squared error of the predictions y_pred against the test targets
# (square root of the MSE, so it is in the same units as MEDV).
print(np.sqrt(mean_squared_error(y_test, y_pred)))



In [None]:
# Coefficient of determination (R²) of the predictions on the test split.
scoreR2 = r2_score(y_true=y_test, y_pred=y_pred)
print(scoreR2)

*Ou simplement:*

In [None]:
# Score of the fitted linear model on the test split, computed by the
# estimator itself (it predicts on X_test internally).
lm.score(X_test,y_test)

# **La méthode de Régression par forêts aléatoires**

In [None]:
# Rebuild the train/test split for the tree-based models.
# NOTE(review): this uses df, not df1 — so the MEDV < 1000000 filter is not
# applied and every column of df other than MEDV (including RM and any column
# added to df earlier) is used as a feature. Scores are therefore not directly
# comparable with the linear model's; confirm this is intended.
y = df['MEDV']
X = df.drop(columns=['MEDV'])

# Same 10% hold-out and seed as for the linear model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

In [None]:
from sklearn import ensemble
# Random forest regressor with default hyper-parameters.
# The forest is built from random bootstrap samples, so without a seed the
# score changes on every run; random_state=1 (the same seed used for the
# train/test split) makes the results reproducible.
rf = ensemble.RandomForestRegressor(random_state=1)
rf.fit(X_train, y_train)

# Predictions on the held-out test split, then the model's score on that split.
y_rf = rf.predict(X_test)
print(rf.score(X_test,y_test))



In [None]:
# Random-forest predictions vs. actual MEDV on the test split. Points on the
# red diagonal are perfect predictions.
medv_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(12, 12))
plt.scatter(y_test, y_rf)
plt.plot(medv_bounds, medv_bounds, color='red', linewidth=3)
plt.xlabel("MEDV")
plt.ylabel("Prediction de MEDV")
plt.title("MEDV reels vs predictions")

In [None]:
# Distribution of the residuals (actual minus predicted) of the random forest.
# sns.distplot is deprecated in seaborn; histplot with a density-normalised
# histogram and a KDE overlay is the documented equivalent.
sns.histplot(y_test - y_rf, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Root mean squared error of the random-forest predictions y_rf against the
# test targets (square root of the MSE, so it is in the same units as MEDV).
print(np.sqrt(mean_squared_error(y_test, y_rf)))

In [None]:
# Score of the fitted random forest on the test split, shown as the cell
# output (the same call whose result is printed right after fitting).
rf.score(X_test,y_test)

# **La méthode d'Extreme Gradient Boosting**

*On va utiliser l'outil XGBRegressor*

In [None]:
import xgboost as XGB
# Gradient-boosted trees with default hyper-parameters, trained on the same
# split as the random forest.
xgb = XGB.XGBRegressor()
xgb.fit(X_train, y_train)

# Predictions on the held-out test split, then the model's score on that split.
y_xgb = xgb.predict(X_test)
print(xgb.score(X_test, y_test))

# Predicted vs. actual MEDV. Points on the red diagonal are perfect predictions.
medv_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(12, 12))
plt.scatter(y_test, y_xgb)
plt.plot(medv_bounds, medv_bounds, color='red', linewidth=3)
plt.xlabel("MEDV")
plt.ylabel("Prediction de MEDV")
plt.title("MEDV reels vs predictions")