In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the Data

In [None]:
# Read the California housing CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')

# Data Exploration

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Descriptive statistics for the numeric columns.
data.describe()

In [None]:
# Column labels of the raw frame.
data.columns

In [None]:
# Number of rows in each ocean_proximity category.
data['ocean_proximity'].value_counts()

In [None]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# Read the California map image into a NumPy array (PNG input preferred).
california_img = mpimg.imread('/kaggle/input/california-housing-feature-engineering/california.png')

# Scatter plot of the districts: marker size follows population,
# marker colour follows median_house_value.
scatter_options = dict(
    x='longitude',
    y='latitude',
    alpha=0.4,                    # marker transparency
    s=data['population'] / 100,   # marker size
    label='population',
    figsize=(10, 7),              # width, height in inches
    c='median_house_value',       # column mapped through the colormap
    cmap=plt.get_cmap('jet'),     # colormap, only used because c holds numeric values
    colorbar=True,
)
data.plot.scatter(**scatter_options)

# Draw the map image on the current axes, stretched to these longitude/latitude bounds.
map_extent = [-124.55, -113.80, 32.45, 42.05]
plt.imshow(california_img, extent=map_extent, alpha=0.5)
plt.show()

## Histograms 

In [None]:
# Draw one histogram per numeric column; binding the result to `_` keeps the
# returned axes array out of the cell output.
_ = data.hist(bins=50, figsize=(20,15))
# bins: number of intervals (class intervals) the value range of each column is divided into, 50 here.
# figsize: figure size (width, height).

In [None]:
# Per-column non-null counts for the rows whose median_house_value is 500001 or more.
data[data['median_house_value'] >= 500001].count()

# Data Preprocessing

## Capped variables 

Delete the rows whose median_house_value >= 500001

Reason: these values are not meaningful because they are capped: all the remaining houses sit at the same point, 500001.

In [None]:
# Keep only the rows whose median_house_value is below 500001, as a new frame.
below_cap = data['median_house_value'] < 500001
data_capped = data.loc[below_cap]
data_capped.shape


Vérification qu'on a bien supprimé les capped values

In [None]:
# Sanity check: every count is 0 when no row at or above 500001 remains.
data_capped[data_capped['median_house_value'] >= 500001].count()

## Missing values

### Summarizing missing values
1. Quelles catégories ont le plus de missing values


In [None]:
# Alternatives kept for reference (not executed):
# clean_data = data.dropna()
# Re-read the CSV treating extra tokens as missing values:
# missing_values = ["n/a", "na", " ", ""]
# df = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv', na_values=missing_values)
# df.isnull().sum()

# Number of missing values in each column of the capped data.
data_capped.isnull().sum()

- on a juste une catégorie 'total_bedrooms' qui possède des missing values: 
    - peut être à cause des missing values qui ne sont pas reconnues dans les autres catégories
- quelle proportion représentent les missing values sur le total de rows ? réponse 1% -> pas très significatif surtout que ce sont 1% répartis sur plusieurs catégories

In [None]:
# .sum() gives the count per column; .sum().sum() adds the columns up into one overall total.
data_capped.isnull().sum().sum()

In [None]:
# Share of missing values relative to the number of rows, rounded to 2 decimals.
# The denominator is the row count of data_capped itself (the frame the missing
# values are counted in), not a hard-coded size of the raw file.
round(data_capped.isnull().sum().sum() / len(data_capped), 2)

2. Mapper les missing values
On fait un dataframe qu'avec les 200 missing values et on les map en fonction de plusieurs index pour voir s'ils sont well-spread sur la totalité des valeurs possibles et pas seulement à un endroit précis
problème: si les missing values sont toutes au meme endroit géographiquement, alors on perd en fiabilité pour la zone considérée.

In [None]:
# Boolean frame of the same shape, True where a value is missing.
data_capped.isna()

In [None]:
import seaborn as sns
sns.set()

# Rows of the capped data whose total_bedrooms value is missing.
bedrooms_missing = data_capped['total_bedrooms'].isnull()
data_missing = data_capped[bedrooms_missing]
data_missing.head()


In [None]:

# Pivot the missing-bedroom rows onto a longitude x latitude grid of
# median_house_value (pivot_table's default aggregation) and draw it as a heatmap.
heatmap_missing_data1 = data_missing.pivot_table(
    index=['longitude'],
    columns='latitude',
    values='median_house_value',
)
sns.heatmap(heatmap_missing_data1, cmap="YlGnBu")


In [None]:
# Same longitude x latitude grid, this time filled with housing_median_age.
heatmap_missing_data2 = data_missing.pivot_table(
    index=['longitude'],
    columns='latitude',
    values='housing_median_age',
)
sns.heatmap(heatmap_missing_data2, cmap="YlGnBu")

Pas de cluster donc ça devrait être bon.

Après le total_bedroom, c'est pas un facteur si décisif a priori.

On peut drop les na

In [None]:
# (rows, columns) before the rows with missing values are dropped.
data_capped.shape

In [None]:

# Drop every row that contains a missing value; the result is a new frame.
clean_data = data_capped.dropna()
# subset=[...] can be used to consider only certain columns.
# how='any' or 'all' chooses when a row is deleted (axis=0, i.e. rows, by default).
# inplace=True changes the frame itself instead of creating a copy.
# Only total_bedrooms has missing values here, so subset and how do not need to be given.

In [None]:
# (rows, columns) after dropping the rows with missing values.
clean_data.shape

## Categorical variable: ocean proximity (encode it)


In [None]:
# Column labels of the cleaned frame.
clean_data.columns

In [None]:
# Number of rows in each ocean_proximity category after cleaning.
clean_data['ocean_proximity'].value_counts()

<1H OCEAN
INLAND       
NEAR OCEAN   
NEAR BAY     
ISLAND

- those are nominal categorical variables: not ordinal, because there is no order or logical relationship between them.
ex: if INLAND = 1, NEAR OCEAN = 2 and NEAR BAY = 3
the model will assume that INLAND < NEAR OCEAN < NEAR BAY, or that INLAND + NEAR OCEAN = NEAR BAY, which does not apply.
- One-hot encoding: create a new column for each category and put 1 or 0 in it, depending on whether the row belongs to that category or not.



In [None]:
# Display the ocean_proximity column that is about to be encoded.
clean_data.ocean_proximity

In [None]:
# One indicator column per ocean_proximity category.
proximity = clean_data['ocean_proximity']
dummies = pd.get_dummies(proximity)
dummies.head()

concatenate the two dataframes : clean_data with dummies

In [None]:
# Put the indicator columns next to the cleaned data, column-wise.
merged = pd.concat((clean_data, dummies), axis=1)
merged

+ Drop the original column to avoid the dummy variable trap, i.e. multicollinearity between ocean_proximity and the dummy columns.
It could mess up the regression, because regression requires independent variables only.

+ We also need to drop one of the dummy columns; here we choose ISLAND.


In [None]:
# Remove the original text column and one indicator column (ISLAND).
redundant_columns = ['ocean_proximity', 'ISLAND']
final = merged.drop(columns=redundant_columns)
final

# Feature Engineering

- Polynomial features
- Divide 'total_rooms', 'total_bedrooms' by 'households'
- Find more!! (look at the kernels..)

In [None]:
# Replace total_rooms with the number of rooms per household.
final = final.assign(rooms_p_household=final['total_rooms'] / final['households'])
final = final.drop(columns=['total_rooms'])
final

In [None]:
# Replace total_bedrooms with the number of bedrooms per household.
final = final.assign(bedrooms_p_household=final['total_bedrooms'] / final['households'])
final = final.drop(columns=['total_bedrooms'])
final

# Data Splitting 

On répartit le dataframe en training et validation data

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target (stays a DataFrame).
housing_X = final.drop(columns="median_house_value")
# Target: the median_house_value column (a Series).
housing_y = final.loc[:, 'median_house_value']

# Hold out 25% of the rows for testing; random_state is fixed at 42.
split_parts = train_test_split(
    housing_X,
    housing_y,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Show the feature container type and its columns, a blank line, then the target container type.
print(type(housing_X), housing_X.columns, "", type(housing_y), sep="\n")

## Data Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Alternative scaler, not used:
# scaling = MinMaxScaler()
# Scaler instance applied to the feature matrices.
scaling = StandardScaler()
# Earlier experiment, not used:
# scaled_data = scaling.fit_transform(train)

In [None]:
# Fit the scaler on the training features only, then reuse the fitted scaler on
# the test features. Calling fit_transform on X_test would re-fit the scaler on
# the test data, so train and test would be scaled with different parameters
# and test-set information would leak into the preprocessing.
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

In [None]:
# Show the scaled training and test feature arrays.
print(X_train)
print(X_test)

In [None]:
# Report shape, element count and container type of each split.
split_reports = (
    ("X_train shape {} and size {} and type {}", X_train),
    ("X_test shape {} and size {} and type {}", X_test),
    ("y_train shape {} and size {} and type {}", y_train),
    ("y_test shape {} and size {} and type {}", y_test),
)
for template, values in split_reports:
    print(template.format(values.shape, values.size, type(values)))

# Training a Model: Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator with its default settings.
model = LinearRegression()

In [None]:
# Fit on the scaled training data, then report the learned parameters.
model.fit(X_train, y_train)
print("Intercept is ", model.intercept_, sep="")
print("coefficients  is ", model.coef_, sep="")

# Predict

In [None]:
# Predict the target for the scaled test features.
y_predictions = model.predict(X_test)

In [None]:
# For the actual values and then the predictions: show the first five entries
# and a one-line summary, with a blank line between the two groups.
previews = (
    ("y_test shape {} and size {} and type {}", y_test),
    ("y_predictions shape {} and size {} and type {}", y_predictions),
)
for position, (template, values) in enumerate(previews):
    if position:
        print()
    print(values[:5])
    print(template.format(values.shape, values.size, type(values)))

### Comparaison entre actual et predicted 

- création d'un dataframe qui combine une Series pandas et un numpy.ndarray de dimension 1

In [None]:
# Put predictions and actual values side by side.
# y_test still carries the row labels inherited from the original frame, so the
# index is replaced by a fresh 0..n-1 range; drop=True discards the old labels
# instead of keeping them as an extra 'index' column.
comparaison = pd.DataFrame({'Predicted': y_predictions, 'Actual': y_test}).reset_index(drop=True)

comparaison.head()

In [None]:
# Line plot of the first 30 rows: one line for predictions, one for actual values.
plt.figure(figsize=(15, 20))
first_rows = comparaison[:30]
plt.plot(first_rows)
plt.legend(['Predicted', 'Actual'])

In [None]:
# Joint plot of predicted vs. actual values for the first 500 rows, with a regression fit (kind='reg').
sns.jointplot(x='Predicted',y='Actual',data=comparaison[:500], kind='reg')

In [None]:
# Overlay the distribution of the predicted values (red) and of the actual values (blue).
# NOTE(review): distplot is reported as deprecated in recent seaborn releases —
# confirm against the installed version and consider histplot/kdeplot.
sns.distplot(comparaison['Predicted'], color="r")
sns.distplot(comparaison['Actual'], color="b")

# Model Scoring 

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test set, and its square root.
mse = mean_squared_error(y_test, y_predictions)
print("MSE= ", mse)
print("RMSE= ", np.sqrt(mse))

In [None]:
from sklearn.metrics import r2_score

# R² score of the predictions against the actual test values.
r2_score(y_test, y_predictions)

In [None]:
# Score of the fitted model on the test set (for scikit-learn regressors this is R²).
model.score(X_test, y_test)