In [None]:
### Les Modules de Travail
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#from pandas.plotting import scatter_matrix
import seaborn as sns

Importation du dataset

In [None]:
# Read the housing CSV into a DataFrame; the path is relative to the notebook's working directory.
Dataset=pd.read_csv('../input/california-housing-prices/housing.csv')

In [None]:
# Preview the first rows of the raw data.
Dataset.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
Dataset.info()

In [None]:
# Work on a copy so the raw `Dataset` frame is left untouched by later transformations.
df=Dataset.copy()

In [None]:
# List the column labels of the working frame.
print(df.columns)

In [None]:
# (rows, columns) of the working frame.
print(df.shape)

In [None]:
# Count how many columns share each dtype, then show the same counts as a pie chart.
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)
dtype_counts.plot.pie()

In [None]:
# Visualise missing values: one heatmap row per record, one column per feature.
plt.figure(figsize=(20, 10))
missing_mask = df.isna()
sns.heatmap(missing_mask, cbar=False)

### Analyse de la variable catégorielle

In [None]:
# Absolute frequency of each ocean_proximity value.
df['ocean_proximity'].value_counts()

In [None]:
# Relative frequency (proportions) of each ocean_proximity value.
df['ocean_proximity'].value_counts(normalize=True)

In [None]:
# Convert the qualitative column to the pandas 'category' dtype.
df=df.astype({'ocean_proximity':'category'})

In [None]:
# Re-check the dtype distribution after the category conversion.
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)
dtype_counts.plot.pie()

### Feature engineering

In [None]:
# Rooms per household: total_rooms divided element-wise by households.
df['rooms_per_household'] = df['total_rooms'].div(df['households'])

In [None]:
# Bedrooms per household: total_bedrooms divided element-wise by households.
df['bedrooms_per_household'] = df['total_bedrooms'].div(df['households'])

In [None]:
# People per household: population divided element-wise by households.
df['population_per_household'] = df['population'].div(df['households'])

In [None]:
# Preview the frame with the three derived per-household columns.
df.head()

In [None]:
# Remove the geographic coordinate columns from the working frame.
df = df.drop(columns=['longitude', 'latitude'])

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Plot the distribution of every float column, one figure per column.
# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay on a
# density scale is its documented replacement and draws the same kind of chart.
for col in df.select_dtypes('float'):
    plt.figure()
    sns.histplot(df[col], kde=True, stat='density')

In [None]:
# Share of each ocean_proximity value as a pie chart.
proximity_counts = df['ocean_proximity'].value_counts()
proximity_counts.plot.pie()

In [None]:
# Same counts as a bar chart.
proximity_counts = df['ocean_proximity'].value_counts()
proximity_counts.plot.bar()

#### Étude des variations et des corrélations

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) of the float columns.
sns.pairplot(df.select_dtypes('float'))

In [None]:
# Heatmap of the correlation matrix of the float columns.
sns.heatmap(df.select_dtypes('float').corr())

In [None]:
# Correlation matrix of the float columns; only coefficients >= 0.5 or <= -0.4
# are kept (everything else is masked to NaN and left blank in the heatmap).
corr = df.select_dtypes('float').corr()
strong_mask = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr[strong_mask],
    cmap='viridis',
    vmax=1.0,
    vmin=-1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

In [None]:
# Hierarchically clustered heatmap of the same correlation matrix.
sns.clustermap(df.select_dtypes('float').corr())

### TARGET

In [None]:
# Summary statistics of the target column only.
df['median_house_value'].describe()

In [None]:
# Histogram of the target with 50 bins.
df['median_house_value'].hist(bins= 50)

In [None]:
# Box plot of the target for each ocean_proximity category.
plt.figure(figsize=(10,6))
sns.boxplot(data=df,x='ocean_proximity',y='median_house_value',palette='viridis')
# Render the figure; plt.plot() with no data draws nothing and only echoes an empty list.
plt.show()

In [None]:
# Strip plot of the target per ocean_proximity category; jitter spreads overlapping points horizontally.
plt.figure(figsize=(10,6))

sns.stripplot(data=df,x='ocean_proximity',y='median_house_value',jitter=0.3)

## TrainTest - Nettoyage - Encodage

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows as the test set; the fixed random_state makes the split repeatable.
trainset, testset = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Training features: every column of the training split except the target.
X_train = trainset.drop(columns=['median_house_value'])


In [None]:
# Training target.
y_train=trainset['median_house_value']


In [None]:
# (rows, columns) of the training split, target column included.
print(trainset.shape)

In [None]:
# Test features: every column of the test split except the target.
X_test = testset.drop(columns=['median_house_value'])


In [None]:
# Test target.
y_test=testset['median_house_value']


In [None]:
# (rows, columns) of the test split, target column included.
print(testset.shape)

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Preview the test features.
X_test.head()

In [None]:
# Display the test target series.
y_test

In [None]:
# Display the training target series.
y_train

### Pipeline de transformation

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,PolynomialFeatures
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Feature columns available to the preprocessing pipeline.
X_train.columns

In [None]:
# Dtype distribution of the test features, printed and drawn as a pie chart.
test_dtype_counts = X_test.dtypes.value_counts()
print(test_dtype_counts)
test_dtype_counts.plot.pie()

In [None]:
# Column names routed to each preprocessing branch, selected by dtype.
numerical_features = X_train.select_dtypes(include=['float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['category']).columns.tolist()

In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Categorical branch: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' makes a category that was absent at fit time encode as an
# all-zero row at predict time instead of raising a ValueError.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [None]:
# Apply the numeric and categorical branches to their respective column lists.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

## Modélisation

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [None]:
# Baseline: linear regression on top of the shared preprocessing step.
model = Pipeline(steps=[('preprocessor', preprocessor),('LinearRegression', LinearRegression())])
model.fit(X_train, y_train)

# Score the test split once (the previous extra call discarded its result).
print(model.score(X_test, y_test))

# Predictions as a column vector.
y_pred = model.predict(X_test).reshape(-1, 1)

# Compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, y_pred)
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
def _with_preprocessing(step_name, estimator):
    """Chain the shared ``preprocessor`` with ``estimator`` registered under ``step_name``."""
    return Pipeline(steps=[('preprocessor', preprocessor), (step_name, estimator)])


# One candidate pipeline per regressor; all of them reuse the same preprocessing step.
TreeDecision = _with_preprocessing('Decision', DecisionTreeRegressor(random_state=0))
RandomForest = _with_preprocessing('RandomForest', RandomForestRegressor(n_estimators=10, random_state=0))
SVCmodel = _with_preprocessing('SVR', SVR(kernel='rbf'))
GradientRegressor = _with_preprocessing('GradientRegressor', GradientBoostingRegressor(random_state=0))
KNN = _with_preprocessing('KNN', KNeighborsRegressor(n_neighbors=5))
# (An SGDRegressor pipeline is currently disabled.)

In [None]:
# Display name -> candidate pipeline, in the order they will be evaluated.
# (The SGDRegressor entry is currently disabled.)
dict_of_models = dict(
    TreeDecision=TreeDecision,
    RandomForest=RandomForest,
    SVCmodel=SVCmodel,
    GradientRegressor=GradientRegressor,
    KNN=KNN,
)

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [None]:
def evaluation(model):
    """Fit ``model`` on the training split and report its performance on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the estimator's ``score`` together with MAE, MSE and RMSE on the test
    split, then draws a scatter plot of actual (default colour) and predicted
    (red) target values against the second column of ``X_test``.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit``, ``predict`` and ``score``.
        It is fitted in place.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)  # learn from the training data
    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")

    # Predict once, as a column vector, and reuse the result for every metric.
    y_pred = model.predict(X_test).reshape(-1, 1)
    mse = mean_squared_error(y_test, y_pred)

    print('SCORE:', model.score(X_test, y_test))
    print('MAE:', mean_absolute_error(y_test, y_pred))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    print("\n")

    # Actual vs. predicted target against the second feature column.
    x_values = X_test.iloc[:, 1]
    plt.figure(figsize=(15, 8))
    plt.scatter(x_values, y_test, label='actual')
    plt.scatter(x_values, y_pred, c='r', label='predicted')
    plt.xlabel(x_values.name)
    plt.ylabel(y_test.name)
    plt.legend()
    plt.show()

In [None]:
# Fit and evaluate every candidate pipeline, printing its name before its metrics.
# Note: `name` and `model` are notebook-level names and keep the last iterated values afterwards.
for name, model in dict_of_models.items():
    print(name)
    evaluation(model)

### Prédiction des prix

In [None]:
# Hyper-parameter grid for the random forest: 3 x 4 x 3 = 36 combinations.
param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
        'max_depth': [6, 8, 10],
    }
]

In [None]:
# Base estimator for the grid search: seeded random forest using all available cores (n_jobs=-1).
model_final=RandomForestRegressor(random_state=42, n_jobs=-1)

In [None]:
# Set up a 5-fold cross-validated grid search over param_grid, ranked by negative mean squared error.
grid_search = GridSearchCV(model_final, param_grid, cv=5,scoring='neg_mean_squared_error')
grid_search

In [None]:
# Fit the preprocessor on the training features and transform them in one step.
# NOTE(review): the imputer/scaler statistics are computed on the whole training set
# before cross-validation, so each validation fold has influenced its own preprocessing.
# Searching over a Pipeline(preprocessor + model) would avoid this.
X_train_prepared=preprocessor.fit_transform(X_train)

In [None]:
# Shape of the transformed training matrix, and its first row as a sanity check.
print(X_train_prepared.shape)
print(X_train_prepared[0])

In [None]:
# Run the grid search (fits one model per parameter combination and fold).
grid_search.fit(X_train_prepared, y_train)

#### Modèle final

In [None]:
# Final model: preprocessing + random forest configured with the hyper-parameters
# selected by the fitted grid search, rather than hand-copied values that can drift
# out of sync with the search result. best_params_ holds exactly the keys of
# param_grid (n_estimators, max_features, max_depth).
RandomForest = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('RandomForest', RandomForestRegressor(random_state=42, **grid_search.best_params_)),
    ]
)

In [None]:
# Display the final pipeline's structure.
RandomForest

In [None]:
# Fit the final pipeline on the training split.
RandomForest.fit(X_train, y_train)

# Score the test split once (the previous extra call discarded its result).
print(RandomForest.score(X_test, y_test))

# Predictions as a column vector.
y_pred = RandomForest.predict(X_test).reshape(-1, 1)

# Compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, y_pred)
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Refit the final pipeline through the shared helper to get the same metrics plus the scatter plot.
evaluation(RandomForest)

In [None]:
# Stand-alone encoder, used below only to recover the one-hot column names.
encoder = OneHotEncoder()

In [None]:
# Fit the encoder on the categorical training columns (the transformed matrix is only displayed, not stored).
encoder.fit_transform(X_train[categorical_features])

In [None]:
# Names of the one-hot columns produced by the encoder.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back only on older releases.
# Passing the input column names yields readable labels ("ocean_proximity_<value>")
# with either API.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_name = encoder.get_feature_names_out(categorical_features)
else:
    encoded_name = encoder.get_feature_names(categorical_features)
encoded_name

In [None]:
# Feature names in the order the preprocessor emits them: numeric columns first,
# then the one-hot encoded columns.
list_features = list(np.concatenate((numerical_features, encoded_name)))
list_features

In [None]:
# One row per model input feature.
df_features = pd.DataFrame({'feature': list_features})
df_features

In [None]:
# Attach the fitted forest's importances; the forest is the pipeline step named 'RandomForest'.
fitted_forest = RandomForest.named_steps['RandomForest']
df_features['importance'] = fitted_forest.feature_importances_

In [None]:
# Display each feature alongside its importance.
df_features