# 1.Importing required modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# 2.Loading the data

In [None]:
# Read the California housing CSV into a DataFrame and preview its first rows.
data = pd.read_csv('../input/california-housing-prices/housing.csv')
data.head()

# Describing the loaded data

In [None]:
# Print a concise summary of the frame: index range, column names,
# non-null counts per column, dtypes and memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

# We will find out whether there are any null values present in the dataset using isnull().sum()


In [None]:
# Count the missing values of every column, then draw those counts as a rug
# plot (one tick per column, placed at that column's null count).
sns.set_style('whitegrid')
null_counts = data.isnull().sum()
sns.rugplot(null_counts)

## From the above rugplot we can observe that almost all columns have no null values; the exact number of null values per column is checked below

In [None]:
# Number of missing (null/NaN) values in each column.
data.isnull().sum()

In [None]:
# Frequency of each distinct total_bedrooms value. value_counts() leaves NaN
# out by default, so the missing entries are not listed in this output.
data['total_bedrooms'].value_counts()

# Only 207 data values are missing out of 20640, so we can fill those missing entries with other appropriate values
- By looking at the above dataset we can conclude that there is likely some relationship between total_rooms and total_bedrooms

In [None]:
# Preview the first five rows of the frame.
data.head()

## Using a barplot we will represent the relation between total_rooms and total_bedrooms, but it is still very hard to read because there are many distinct values, so the graphical representation won't be very clear

In [None]:
# Bar plot of total_rooms for every total_bedrooms value, with one bar colour
# per ocean_proximity category.
sns.barplot(
    data=data,
    x='total_bedrooms',
    y='total_rooms',
    hue='ocean_proximity',
)

## By observing the dataset, the ratio of total_bedrooms to total_rooms appears to follow some relation, so we will compute that ratio (total_bedrooms / total_rooms) for each row and store the result in a variable

In [None]:
# Row-wise share of bedrooms among all rooms; rows whose total_bedrooms is
# missing yield NaN.
divide = data['total_bedrooms'].div(data['total_rooms'])
divide

## We take the mean of all the ratios obtained above; the null/NaN values will then be filled, using fillna(), with this mean ratio multiplied by total_rooms

In [None]:
# Mean bedrooms-to-rooms ratio over all rows; Series.mean() skips the NaN
# entries by default, so rows with missing bedrooms do not affect it.
mean_of_bedroom_tototal_room = divide.mean()
mean_of_bedroom_tototal_room

## Observing the given data, the total number of bedrooms is roughly 20% to 30% of total_rooms, so we compute that ratio for each row, take its mean, and fill each null value with the mean ratio multiplied by the corresponding total_rooms

In [None]:
# Fill each missing total_bedrooms value with the mean bedrooms-to-rooms ratio
# multiplied by that row's total_rooms (fillna aligns the replacement Series
# on the index, so every row gets its own estimate).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained
# assignment, which warns on recent pandas and, with Copy-on-Write enabled,
# modifies a temporary and leaves `data` unchanged.
estimated_bedrooms = mean_of_bedroom_tototal_room * data['total_rooms']
data['total_bedrooms'] = data['total_bedrooms'].fillna(estimated_bedrooms)
data.head()

# Using a heatmap we will represent the null values graphically; by looking at the heatmap we can see that there are no null values left in the dataset

In [None]:
# Heatmap of the boolean missing-value mask: one cell per (row, column),
# so any entry that is still null stands out in a different colour.
sns.heatmap(data.isnull())

### After all the above operations, the dataset finally has no null values

In [None]:
# Re-count the missing values per column to confirm the result of the fill.
data.isnull().sum()

# Using hist() we will plot a histogram for each numeric column

In [None]:
# One histogram per numeric column, 75 bins each, on a 16x14 inch figure.
data.hist(bins=75, figsize=(16, 14))

## finding correlation of the data

In [None]:
# Pairwise correlation matrix of the numeric columns.
# Non-numeric columns are dropped explicitly first (presumably only
# ocean_proximity, which holds category labels): DataFrame.corr() raises on
# them in pandas >= 2.0, whereas older versions dropped them silently, so this
# form gives the same result on every pandas version.
data.select_dtypes(include='number').corr()

# Using a heatmap we will represent the correlation matrix of the provided dataset

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
# Non-numeric columns are dropped explicitly before calling corr()
# (presumably only ocean_proximity, which holds category labels), because
# DataFrame.corr() raises on them in pandas >= 2.0; older versions dropped
# them silently, so the plotted matrix is the same on every pandas version.
sns.heatmap(data.select_dtypes(include='number').corr())

# Using jointplot we will draw a regression plot of total_rooms against total_bedrooms

In [None]:
# Joint plot of total_rooms against total_bedrooms with a fitted regression
# line in the centre and each variable's distribution on the margins.
sns.jointplot(
    data=data,
    x='total_bedrooms',
    y='total_rooms',
    kind='reg',
)

In [None]:
# Same pair of variables as a kernel-density joint plot (density contours in
# the centre, marginal density curves on the sides).
sns.jointplot(
    data=data,
    x='total_bedrooms',
    y='total_rooms',
    kind='kde',
)

In [None]:
# Grid of pairwise scatter plots for the numeric columns, with each column's
# own distribution on the diagonal.
sns.pairplot(data)

In [None]:
# Preview the first five rows again before building the model.
data.head()

## we could predict the values of median_house_value by median_income using LinearRegression

In [None]:
# Ordinary least-squares linear regressor with scikit-learn's default settings.
linreg = LinearRegression()
linreg

In [None]:
# Show the first rows together with the (rows, columns) shape of the frame.
data.head(), data.shape

### taking the value of median_income,median_house_value

In [None]:
# Feature matrix and target vector for the regression.
# The columns are selected by name rather than by position (7 and 8), so the
# selection keeps working if the column order of the CSV ever changes.
# The double brackets keep x two-dimensional, shape (n_samples, 1), which is
# the layout scikit-learn estimators expect; y is a 1-D array.
x = data[['median_income']].values
y = data['median_house_value'].values

In [None]:
# Sanity check: x should be 2-D (n_samples, 1) and y 1-D (n_samples,).
x.shape, y.shape

## using train_test_split we will give 80% for training data and 20% for testing data

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)


#### Using the fit function we will train the model on the training data

In [None]:
# Fit the linear model on the training split.
linreg.fit(X_train, y_train)

In [None]:
# Predicted target values for the held-out test features.
y_pred = linreg.predict(X_test)
y_pred

In [None]:
# Predict the target for a single sample whose only feature equals 9.
# The nested list gives the (n_samples, n_features) = (1, 1) shape that
# predict() expects.
linreg.predict([[9]])

In [None]:
# For a regressor, score() returns the coefficient of determination (R^2) on
# the given data; it is not a classification accuracy.
linreg.score(X_test,y_test)

### From the above we can observe that the R² score (coefficient of determination, not a classification accuracy) of the model is around 0.476, which is not so great, so LinearRegression might not be that suitable for this problem

In [None]:
# Training data against the fitted regression line.
# The observations are drawn as a scatter and the model's predictions as a
# line: joining the unsorted raw samples with plt.plot would only produce a
# zig-zag scribble, whereas the predictions all lie on one straight line.
plt.scatter(X_train, y_train, color='r')
plt.plot(X_train, linreg.predict(X_train), color='g')
plt.xlabel('X_train')
plt.ylabel('y_train')
plt.title('X_train Vs y_train')
plt.show()

## The above matplotlib plot of the full training set is not very clear, so we will first plot only the first 10 points and later use plotly

In [None]:
# Same picture restricted to the first 10 training samples so that the
# individual points are readable.
# The observations are drawn as a scatter and the model's predictions as a
# line: joining the unsorted raw samples with plt.plot would only produce a
# zig-zag, whereas the predictions all lie on one straight line.
plt.scatter(X_train[:10], y_train[:10], color='r')
plt.plot(X_train[:10], linreg.predict(X_train)[:10], color='g')
plt.xlabel('X_train')
plt.ylabel('y_train')
plt.title('X_train Vs y_train')
plt.show()

In [None]:
# Scatter of the first 10 training samples only (feature against target).
plt.scatter(X_train[:10], y_train[:10], color='r')

## From the following representation we can easily see the best-fit line and how LinearRegression models the data

In [None]:
# Test samples (orange dots) with the model's predictions for the same
# features drawn on top as a black line.
plt.figure(figsize=(10, 8))
plt.scatter(X_test, y_test, color='#f57e42')
plt.plot(X_test, y_pred, color='black', linewidth=2)
plt.show()

In [None]:
# Interactive plotly scatter of the test set with an OLS trend line.
# px.scatter's first positional parameter is `data_frame`, so passing the two
# arrays positionally would bind X_test to data_frame and y_test to x; they
# are therefore given explicitly as x= and y=. X_test is a single-column 2-D
# array, hence ravel() to obtain the 1-D sequence plotly expects.
# No matplotlib figure is opened here: plotly renders its own figure, so a
# plt.figure() call would only emit an empty canvas.
# NOTE: trendline='ols' relies on the statsmodels package being installed.
fig = px.scatter(x=X_test.ravel(), y=y_test, trendline='ols')
fig.show()

In [None]:
# One histogram per numeric column with the default bin count, on a
# 20x15 inch figure.
data.hist(figsize=(20,15))