In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('../input/california-housing-prices/housing.csv')

In [None]:
df

In [None]:
df['ocean_proximity'].value_counts()

In [None]:
df.info()

In [None]:
df.isnull().sum()

### The total_bedrooms column has null values; we will fill them with the column mean

### We won't be doing spatial analysis, so we will remove the latitude and longitude columns

In [None]:
# Rebind df instead of dropping in place so this cell can be re-run safely:
# errors='ignore' skips the columns when they are already gone rather than
# raising KeyError on a second execution.
df = df.drop(columns=['longitude','latitude'], errors='ignore')

In [None]:
df.info()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['total_bedrooms'] selection: the in-place call on a selected column is a
# chained assignment that warns on pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].mean())

In [None]:
df.info()

### We have filled in the null values

In [None]:
df.head()

# EDA

In [None]:
# Correlate numeric columns only: 'ocean_proximity' holds strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
# Restrict to numeric columns before correlating: 'ocean_proximity' holds
# strings, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
df.hist(figsize=(20,15));

### We can see that only the median house age is roughly normally distributed; all the other features are right-skewed

In [None]:
sns.scatterplot(data=df,x='median_income',y='median_house_value')
plt.title('Median Income vs Median House value');

#### The plot above shows a positive relationship between median income and median house value: the higher a household's income, the more expensive the property it tends to buy

In [None]:
sns.boxplot(data=df,y='median_house_value'); # box plot of house prices: shows the median, quartiles and outliers (not the mean)

In [None]:
df['median_house_value'].mean()

In [None]:
sns.scatterplot(data=df,x='total_rooms',y='median_house_value')
plt.title('Total no of rooms vs Median House value');

In [None]:
# Scatter of total bedrooms per block against the median house value.
# The title names the bedrooms column actually plotted on the x-axis.
sns.scatterplot(data=df,x='total_bedrooms',y='median_house_value')
plt.title('Total no of bedrooms vs Median House value');

In [None]:
# Let's see the distribution of the median house age (histogram with a KDE overlay)
sns.displot(data=df,x='housing_median_age',kde=True);

In [None]:
sns.countplot(data=df,x='ocean_proximity');
df['ocean_proximity'].value_counts()

In [None]:
# Rows whose ocean_proximity is '<1H OCEAN', followed by their summary statistics.
house_near_onehocean = df.loc[df['ocean_proximity'].eq('<1H OCEAN')]
house_near_onehocean.describe()

In [None]:
df.groupby('ocean_proximity').mean()

### Let's separate the data by ocean proximity and look at detailed statistics for each group to gain insight

In [None]:
# Rows whose ocean_proximity is 'INLAND', followed by their summary statistics.
house_near_inland = df.loc[df['ocean_proximity'].eq('INLAND')]
house_near_inland.describe()

In [None]:
# Rows whose ocean_proximity is 'NEAR BAY', followed by their summary statistics.
house_near_bay = df.loc[df['ocean_proximity'].eq('NEAR BAY')]
house_near_bay.describe()

In [None]:
# Rows whose ocean_proximity is 'NEAR OCEAN', followed by their summary statistics.
house_near_ocean = df.loc[df['ocean_proximity'].eq('NEAR OCEAN')]
house_near_ocean.describe()

In [None]:
# Rows whose ocean_proximity is 'ISLAND', followed by their summary statistics.
house_near_island = df.loc[df['ocean_proximity'].eq('ISLAND')]
house_near_island.describe()

#### 1. After separating the data by ocean proximity we can compare the average house value of each category. Houses located on islands are the most expensive, which is plausible because islands are mostly vacation destinations with higher property values. However, we can't rely on this result because we have information on just 5 records in this category; to get a clear idea of this we would need more data.

#### 2. Other than this, we can see that houses located near the bay are more expensive than houses in the other areas.

#### 3. So we can say that a house's proximity to the ocean plays a vital role in its price.

#### 4. As seen from the plots, there is only a very weak positive relationship between the total number of rooms and bedrooms within a block and the price of the houses.

## Feature Selection 

In [None]:
# Feature matrix: every column of df except the regression target.
X = df.drop(columns='median_house_value')

In [None]:
# One-hot encode the non-numeric columns of X, then preview the result.
X = pd.get_dummies(X)
X.head()

In [None]:
# Add a copy of the '<1H OCEAN' dummy column under a name without a space;
# the new column is appended as the last column of X.
X = X.assign(**{'ocean_proximity_<1H_OCEAN': X['ocean_proximity_<1H OCEAN']})

In [None]:
# Remove the dummy column whose name contains a space.
X = X.drop(columns='ocean_proximity_<1H OCEAN')

In [None]:
X.head()

In [None]:
print('Shape of input features',X.shape) # 2D array

In [None]:
y = df['median_house_value']
y.head()

In [None]:
# y is the target vector, so label its shape as the target rather than as input features.
print('Shape of target',y.shape)

In [None]:
y.values # 1D array

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Model Creation

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
house_price_model = LinearRegression()

#### Training model on training data

In [None]:
house_price_model.fit(X_train,y_train)

### Predicting the house values on test data

In [None]:
y_predictions = house_price_model.predict(X_test)
y_predictions

### Now that we have predicted the test data using the model, we need to measure how well the model performs and find out whether Linear Regression was a good algorithm for this data

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
y_test.mean()

In [None]:
#df['median_house_value'].mean()

In [None]:
y_predictions.mean()

In [None]:
mean_absolute_error(y_test,y_predictions)

In [None]:
def mape(actual, pred):
    """Return the mean absolute percentage error (MAPE) as a percentage.

    Computes ``mean(|actual - pred| / |actual|) * 100``.

    Parameters
    ----------
    actual : array-like
        Observed values. Must be non-empty and contain no zeros, because
        each error is divided by the corresponding actual value.
    pred : array-like
        Predicted values, with the same shape as ``actual``.

    Returns
    -------
    float
        The MAPE in percent (e.g. ``10.0`` means predictions are off by
        10% on average).

    Raises
    ------
    ValueError
        If the shapes differ, the input is empty, or ``actual`` contains
        a zero.
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)

    # Require identical shapes: otherwise NumPy would silently broadcast
    # (e.g. (n, 1) against (n,) yields an (n, n) matrix) and the mean would
    # be taken over meaningless pairs.
    if actual.shape != pred.shape:
        raise ValueError(
            f"actual and pred must have the same shape, got {actual.shape} and {pred.shape}"
        )
    if actual.size == 0:
        raise ValueError("MAPE is undefined for empty input")
    # Division by a zero actual value would produce inf/nan and poison the mean.
    if np.any(actual == 0):
        raise ValueError("MAPE is undefined when actual contains zeros")

    return np.mean(np.abs((actual - pred) / actual)) * 100

mape(y_test,y_predictions)

#### After calculating MAPE we found that our predictions are off by 28.9% on average, i.e. an error rate of about 29%

#### Is that error acceptable? It depends on the context. In our scenario a 29% error is certainly not fully acceptable, but it can still give us a fair idea of the house prices

In [None]:
np.sqrt(mean_squared_error(y_test,y_predictions))

##### We can see that our RMSE is higher than our MAE, which suggests that for most of the data the predicted value is fairly accurate, but for a few data points the predicted house value has very large errors

In [None]:
# R² (coefficient of determination) of the predictions on the test set.
# This is a goodness-of-fit score, not a classification-style accuracy.
r2_score(y_test,y_predictions)

In [None]:
X.columns

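So our model's R² score is about 0.63, i.e. the model explains roughly 63% of the variance in house values (R² is a goodness-of-fit measure rather than an accuracy percentage)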

In [None]:
house_price_model.coef_

In [None]:
house_price_model.intercept_

#### Now we want to know whether the underlying dataset was a valid dataset for Linear Regression, by checking the residual plot for any strange pattern that we could not see at some multidimensional level

#### Let's visualize the residual plots to see that 

In [None]:
test_residuals = y_test - y_predictions

In [None]:
test_residuals

In [None]:
# Plot the residuals against the *predicted* values. Residuals are correlated
# with the actual target by construction, so plotting them against y_test
# shows an upward trend even for a well-specified model; against the
# predictions, any remaining pattern points to a real model problem.
# np.asarray drops the pandas index so both vectors are paired by position.
sns.scatterplot(x=y_predictions, y=np.asarray(test_residuals))
plt.axhline(y=0, color='red')  # reference line: zero error
plt.xlabel('Predicted median house value')
plt.ylabel('Residual (actual - predicted)')
plt.title('Residual Plot');

### The residual plot shows that the points are roughly normally distributed around the zero line, so we can conclude that the underlying dataset is a valid choice for Linear Regression.

### As for the low model accuracy, it might be because of the limited amount of data; also, some of the model coefficients are negative, and removing the corresponding features might give better accuracy.

In [None]:
sns.displot(test_residuals,bins=30,kde=True);

### The distribution also tells us that the errors/residuals are somewhat normally distributed, and the KDE shows that the mean is pretty close to zero

### We can also see a slight undershoot, but being a little bit skewed one way or the other is not too bad.

### Saving the model

In [None]:
from joblib import dump

In [None]:
dump(house_price_model,'House_prediction_model.joblib')