### Data Description

#### Context
This is the dataset used in the second chapter of Aurélien Géron's recent book 'Hands-On Machine Learning with Scikit-Learn and TensorFlow'. It serves as an excellent introduction to implementing machine learning algorithms because it requires rudimentary data cleaning, has an easily understandable list of variables and sits at an optimal size between being too toyish and too cumbersome.

The data contains information from the 1990 California census. So although it may not help you with predicting current housing prices like the Zillow Zestimate dataset, it does provide an accessible introductory dataset for teaching people about the basics of machine learning.


#### Detail
1. longitude: A measure of how far west a house is; California longitudes are negative, so a lower (more negative) value is farther west

2. latitude: A measure of how far north a house is; a higher value is farther north

3. housingMedianAge: Median age of a house within a block; a lower number is a newer building

4. totalRooms: Total number of rooms within a block

5. totalBedrooms: Total number of bedrooms within a block

6. population: Total number of people residing within a block

7. households: Total number of households, a group of people residing within a home unit, for a block

8. medianIncome: Median income for households within a block of houses (measured in tens of thousands of US Dollars)

9. medianHouseValue: Median house value for households within a block (measured in US Dollars)

10. oceanProximity: Location of the house w.r.t ocean/sea

In [None]:
# from google.colab import files
# upload = files.upload()

### Import libraries

In [None]:
# !pip install category_encoders==2.*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from category_encoders import OneHotEncoder


### Feature Engineering

In [None]:
def wrangle(filepath):
  """Read the housing CSV and return a cleaned frame with per-household ratios.

  Rows containing any missing value are dropped, then three derived columns
  are appended, each a block total divided by ``households``:
  ``ave_rooms``, ``ave_bedrooms`` and ``pop_per_house``.
  """
  return (
    pd.read_csv(filepath)
    .dropna()
    .assign(
      # average number of rooms per household
      ave_rooms=lambda block: block['total_rooms'] / block['households'],
      # average number of bedrooms per household
      ave_bedrooms=lambda block: block['total_bedrooms'] / block['households'],
      # population per household
      pop_per_house=lambda block: block['population'] / block['households'],
    )
  )


filename = '../input/california-housing-prices/housing.csv'

In [None]:
# Build the cleaned, feature-engineered frame that every later cell reads from.
df=wrangle(filename)

# Preview the first rows.
df.head()

In [None]:

# Column dtypes and non-null counts of the wrangled frame.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

### Data Visualization

In [None]:
# Scatter of every block by its coordinates, coloured by block population.
fig, ax = plt.subplots(figsize=(10, 10))
points = ax.scatter(
    x='latitude',
    y='longitude',
    c='population',
    cmap='Spectral',
    data=df,
)
ax.set_xlabel('Latitude')
ax.set_ylabel('Longitude')
ax.set_title('Population Magnitude')
fig.colorbar(points, ax=ax).set_label('Population')
plt.show()

In [None]:
# Distribution of the target (median house value per block) with its mean and median marked.
median_hous_mean = df['median_house_value'].mean()
median_hous_median = df['median_house_value'].median()

plt.figure(figsize=(8,8))
# sns.distplot is deprecated; histplot(kde=True) draws the same histogram + KDE overlay,
# and its y axis is a count of rows, so the y label below is accurate.
sns.histplot(df['median_house_value'], bins=60, kde=True)
plt.axvline(x=median_hous_mean, color='green', label=f'mean: {round(median_hous_mean,2)}')
plt.axvline(x=median_hous_median, color='red', label=f'median: {round(median_hous_median,2)}')
plt.xlabel('Median House Values')
# Each row of df is one block, so the bars count blocks.
plt.ylabel('Number of blocks')
plt.title('Median House Values for a block')
plt.legend()
plt.show()

In [None]:
# Distribution of the number of households per block with its mean and median marked.
houshold_mean = df['households'].mean()
houshold_median = df['households'].median()

plt.figure(figsize=(10,10))
# sns.distplot is deprecated; histplot(kde=True) draws the same histogram + KDE overlay,
# and its y axis is a count of rows, so the y label below is accurate.
sns.histplot(df['households'], bins=100, kde=True)
plt.axvline(x=houshold_mean, color='green', label=f'mean: {round(houshold_mean,2)}')
plt.axvline(x=houshold_median, color='red', label=f'median: {round(houshold_median,2)}')
plt.xlabel('Total number of households in a block')
# Each row of df is one block, so the bars count blocks.
plt.ylabel('Number of blocks')
plt.title('Households per block')
plt.legend()
# Zoom in on the bulk of the distribution; the long right tail is cut off.
plt.xlim(-100, 2500)
plt.show()

In [None]:
# Distribution of the engineered ave_rooms feature (total_rooms / households per block).
ave_rooms_mean = df['ave_rooms'].mean()
ave_rooms_median = df['ave_rooms'].median()

plt.figure(figsize=(10,10))
# sns.distplot is deprecated; histplot(kde=True) draws the same histogram + KDE overlay,
# and its y axis is a count of rows, so the y label below is accurate.
sns.histplot(df['ave_rooms'], bins=100, kde=True)
plt.axvline(x=ave_rooms_mean, color='green', label=f'mean: {round(ave_rooms_mean,2)}')
plt.axvline(x=ave_rooms_median, color='red', label=f'median: {round(ave_rooms_median,2)}')
# ave_rooms is an average per household, not a total per house.
plt.xlabel('Average number of rooms per household')
# Each row of df is one block, so the bars count blocks.
plt.ylabel('Number of blocks')
plt.title('Average rooms per household in a block')
plt.legend()
# Zoom in on the bulk of the distribution; larger values are cut off.
plt.xlim(0, 13)
plt.show()

The average total number of bedrooms in a block in California is 537.87.

In [None]:
# Median income of every block on a single scatter.
plt.figure(figsize=(10,10))
plt.scatter(x='latitude', y='longitude', c='median_income', cmap='Spectral', data=df)
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('median income per block')
plt.colorbar().set_label('Median income')
plt.show()

# The same scatter split into three non-overlapping income bands.
# The first band is strictly below 4 so that blocks at exactly 4 are drawn once
# (in the middle band) rather than in two panels.
income = df['median_income']
income_bands = [
    (income < 4, 'median income per block less than 4'),
    ((income >= 4) & (income < 8), 'median income per block between 4 and 8'),
    (income >= 8, 'median income per block more than 8'),
]

fig, axes = plt.subplots(1, 3, figsize=(15,8))
for ax, (band_mask, band_title) in zip(axes, income_bands):
    ax.scatter(x='latitude', y='longitude', c='median_income', cmap='Spectral', data=df.loc[band_mask])
    ax.set_xlabel('Latitude')
    ax.set_ylabel('Longitude')
    ax.set_title(band_title)
plt.tight_layout()
plt.show()


This shows us the blocks where the median income (in tens of thousands of US dollars) is:
1. less than 4
2. between 4 and 8
3. more than 8

In [None]:
# Number of blocks (rows of df) in each ocean-proximity category.
proximity_counts = df['ocean_proximity'].value_counts()

plt.figure(figsize=(8,8))
plt.bar(proximity_counts.index, proximity_counts, width=0.6)
plt.xlabel('ocean_proximity')
# value_counts counts rows, and each row is a block, not a household.
plt.ylabel('Number of blocks')
plt.title('Blocks per ocean proximity category')

# Write each count just above its bar, centred horizontally on the bar.
for position, count in enumerate(proximity_counts):
    plt.text(x=position, y=count+1, s=f"{count}", ha='center', va='bottom', fontdict=dict(fontsize=15))
plt.tight_layout()

plt.show()

In [None]:
# Distribution of housing_median_age with its mean and median marked.
age_mean = df['housing_median_age'].mean()
age_median = df['housing_median_age'].median()

plt.figure(figsize=(10,10))
# sns.distplot is deprecated; histplot(kde=True) draws the same histogram + KDE overlay,
# and its y axis is a count of rows, so the y label below is accurate.
sns.histplot(df['housing_median_age'], bins=40, kde=True)
plt.axvline(x=age_mean, color='green', label=f'mean: {round(age_mean,2)}')
plt.axvline(x=age_median, color='red', label=f'median: {round(age_median,2)}')
plt.xlabel('Age')
# Each row of df is one block, so the bars count blocks.
plt.ylabel('Number of blocks')
plt.title('the houses ages')
plt.legend()
plt.show()

The average age of the houses is 28.63 years.

Using a heatmap to find the correlation between the features.

In [None]:
# Pairwise correlations between the numeric columns.
# ocean_proximity is a string column; recent pandas versions raise if it is passed to
# corr(), so restrict to numeric columns explicitly (older versions dropped it silently).
df_corr = df.select_dtypes(include='number').corr()

# Hide the upper triangle (including the diagonal) because the matrix is symmetric.
# The builtin bool replaces the np.bool alias, which was removed from NumPy.
mask = np.triu(np.ones_like(df_corr, dtype=bool))

plt.figure(figsize=(15,10))
sns.heatmap(df_corr, mask=mask, linewidths=0.5, vmin=-1, vmax=1, annot=True, cmap='coolwarm')
plt.title('Correlation between numeric features')
plt.show()

### Split Data

In [None]:
# Feature matrix and target vector.
# A later cell adds 'log_median_house_value' (a transform of the target) to df.
# Dropping it here as well, when present, keeps the target from leaking into X
# if this cell is re-run after that column exists; errors='ignore' makes the
# drop a no-op for the column on a fresh top-to-bottom run.
X = df.drop(columns=['median_house_value', 'log_median_house_value'], errors='ignore')
y = df['median_house_value']

print(f'X shape {X.shape}\ny shape  {y.shape}')

split data to train and test set

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f'X_train shape {X_train.shape} ----> y_train shape  {y_train.shape} \n'
    f'X_test shape {X_test.shape} ----> y_test shape  {y_test.shape}'
)

### BaseLine

In [None]:
# Naive baseline: predict the training-set mean for every training row.
train_mean = y_train.mean()
baseLine = [train_mean] * len(y_train)

# MAE of that constant prediction, measured on the training set.
baseline_mae = mean_absolute_error(y_train, baseLine)

In [None]:
# Report the baseline MAE rounded to two decimals.
print("the baseline MAE is ", round(baseline_mae,2))

### Model

In [None]:
# Preprocessing steps followed by the estimator, in the order the data flows through them.
pipeline_steps = [
    OneHotEncoder(use_cat_names=True),  # encode categorical column(s) as indicator columns
    SimpleImputer(strategy='mean'),     # fill any remaining missing values with the column mean
    StandardScaler(),                   # standardise every feature
    LinearRegression(),
]
model = make_pipeline(*pipeline_steps)

model.fit(X_train, y_train)

## checking the model

we use the mean absolute error and the $R^2$ to check the model for train set and test set

In [None]:
# Mean absolute error of the fitted pipeline on each split.

## train MAE
train_predictions = model.predict(X_train)
train_set_MAE = mean_absolute_error(y_train, train_predictions)
print(' the MAE for the train set is ', train_set_MAE)

## test MAE
test_predictions = model.predict(X_test)
test_set_MAE = mean_absolute_error(y_test, test_predictions)
print(' the MAE for the test set is ', test_set_MAE)

In [None]:
# Coefficient of determination (R^2) of the fitted pipeline on each split, shown as a percentage.
# R^2 is a regression metric, not a classification accuracy, so the messages name it
# as R^2 only (matching the messages used for the log-target model later on).

## train R^2
train_set_r2 = model.score(X_train, y_train)
print(' the R^2 for the train set is ', round(train_set_r2*100,2),'%')

## test R^2
test_set_r2 = model.score(X_test, y_test)
print(' the R^2 for the test set is ', round(test_set_r2*100,2),'%')

### Communicate Results

Creating a horizontal barchart that plots the 4 most important coefficients for model, sorted by absolute value.

In [None]:
# Pair every fitted coefficient with the name of the encoded feature it belongs to.
encoder = model.named_steps['onehotencoder']
# Newer category_encoders releases expose the scikit-learn style get_feature_names_out
# and deprecate get_feature_names; use whichever the installed version provides.
if hasattr(encoder, 'get_feature_names_out'):
    feature_names = encoder.get_feature_names_out()
else:
    feature_names = encoder.get_feature_names()

model_coeff = pd.DataFrame(
    model.named_steps['linearregression'].coef_,
    index=feature_names,
    columns=['coefficients_val'],
)

# Rank by magnitude so that strongly negative coefficients are considered too;
# sorting by the signed value would only ever surface the largest positive ones.
top_features = model_coeff['coefficients_val'].abs().nlargest(4).index

# Reverse so the largest-magnitude coefficient is drawn at the top of the chart.
ax = model_coeff.loc[top_features].iloc[::-1].plot(kind='barh')
ax.set_xlabel('Coefficient value (standardised features)')
ax.set_title('4 largest coefficients by absolute value')
plt.show()

As we see from the chart, median_income is the most important coefficient.

In [None]:
# Scatter of house value against median income with a fitted regression line drawn in red;
# ci=None turns off the confidence band.
sns.lmplot(x='median_income' , y= 'median_house_value', data=df, line_kws={'color':'red'}, ci=None)

In [None]:
## Correlation between median income and the target (Series.corr uses Pearson by default).
df['median_income'].corr(df['median_house_value'])

### Improving the model

In [None]:
# Skewness of the target distribution.
df['median_house_value'].skew()

In [None]:
# Add the natural log of the target to df as a new column (this mutates df in place).
df['log_median_house_value']= np.log(df['median_house_value'])

We plot the log values to see if there is an improvement.

In [None]:
# Same regression plot as before, but against the log-transformed target.
sns.lmplot(x='median_income' , y= 'log_median_house_value', data=df, line_kws={'color':'red'}, ci=None)

In [None]:
# Correlation between the log-transformed target and median income.
df['log_median_house_value'].corr(df['median_income'])
# General form: column_1.corr(column_2)

We can see that the correlation decreased after applying the log function, but we can also see that the points in the plot are now much closer to each other. Next we need to fit the model with the new (log) values of y and see whether the accuracy increases.

In [None]:
# Split again with the same features, test size and seed, but with the log-transformed target.
y_log = df['log_median_house_value']
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

print(
    f'X_train shape {X_train_l.shape} ----> y_train shape  {y_train_l.shape} \n'
    f'X_test shape {X_test_l.shape} ----> y_test shape  {y_test_l.shape}'
)

In [None]:
# Refit the same pipeline object on the log-transformed target. This replaces the earlier
# fit, so from here on `model` predicts log(median_house_value) rather than dollars.
model.fit(X_train_l,y_train_l)

checking the new model

In [None]:
# Mean absolute error of the refitted pipeline on each split.
# The model now predicts log(median_house_value), so the first two numbers are in log
# units and cannot be compared with the dollar MAE of the first model. The predictions
# are therefore also mapped back with np.exp to report an MAE in US dollars.
train_pred_log = model.predict(X_train_l)
test_pred_log = model.predict(X_test_l)

## train MAE
train_set_MAE= mean_absolute_error(y_train_l, train_pred_log)
print(' the MAE for the train set is ', train_set_MAE)

## test MAE
test_set_MAE = mean_absolute_error(y_test_l, test_pred_log)
print(' the MAE for the test set is ', test_set_MAE)

## the same errors on the original dollar scale
train_set_MAE_usd = mean_absolute_error(np.exp(y_train_l), np.exp(train_pred_log))
print(' the MAE for the train set in US dollars is ', train_set_MAE_usd)

test_set_MAE_usd = mean_absolute_error(np.exp(y_test_l), np.exp(test_pred_log))
print(' the MAE for the test set in US dollars is ', test_set_MAE_usd)

In [None]:
# R^2 of the refitted (log-target) pipeline on each split, shown as a percentage.
train_set_r2 = model.score(X_train_l, y_train_l)
test_set_r2 = model.score(X_test_l, y_test_l)

## train R^2
print(' the R^2 for the train set is ', round(train_set_r2*100,2))

## test R^2
print(' the R^2 for the test set is ', round(test_set_r2*100,2))

## Author's note: the new model's R^2 improved by 1.72% over the first model.
## (That figure comes from the original run and is not recomputed here; the two R^2
## values are measured on different target scales: log values vs. dollars.)

