In [None]:
import os

# Enumerate every file under the Kaggle input mount so the dataset paths are visible.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the California housing CSV from the Kaggle input directory.
housing = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')

In [None]:
# Preview the first rows of the raw data.
housing.head()

In [None]:
# Column dtypes and non-null counts (useful for spotting missing values).
housing.info()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

In [None]:
# Frequency of each ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
%matplotlib inline
# One 50-bin histogram per numeric column, drawn on a 20x15 figure.
housing.hist(bins = 50 , figsize = (20,15))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
train ,test = train_test_split(housing,test_size = 0.2,random_state = 42) 

In [None]:
# Preview the training partition.
train.head()

In [None]:
# Dtypes and non-null counts of the training partition.
train.info()

In [None]:
# Distribution of median_income in the training partition.
train['median_income'].hist()
plt.show()

In [None]:
# Plot every row by its longitude/latitude.
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
# Same scatter in black with low alpha so overlapping points show up as darker areas.
housing.plot(kind = 'scatter',x = 'longitude',y = 'latitude',color = 'k',alpha = 0.1)
plt.show()

In [None]:
# Geographic scatter: marker size is population / 100 and marker colour is
# median_house_value, mapped through the 'jet' colormap with a colorbar.
housing.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.4,
    figsize=(10, 10),
    s=housing['population'] / 100,
    label='population',
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
)
plt.legend()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only. `housing` also contains the
# string column 'ocean_proximity'; calling corr() on the full frame raises in pandas >= 2.0
# because non-numeric columns are no longer dropped silently. Selecting numeric dtypes
# first works on every pandas version and matches the old default result.
cor = housing.select_dtypes(include='number').corr()

In [None]:
# Correlation of every column in `cor` with the target, median_house_value.
cor['median_house_value']

In [None]:
# Pull out median_income and the target for a direct comparison.
x = housing['median_income']
y = housing['median_house_value']

In [None]:
# Sanity check: both series should have the same number of elements.
x.size,y.size

In [None]:
# Scatter of median_income (x axis) against median_house_value (y axis).
plt.scatter(x,y)
plt.show()

In [None]:
# Derive three ratio features from the raw count columns.
# NOTE(review): `housing` is rebound to `train.drop(...)` a few cells later, so these
# columns are only used for the correlation check and never reach the models.
households = housing['households']
rooms = housing['total_rooms']

housing['rooms_per_households'] = rooms / households
housing['population_per_households'] = housing['population'] / households
housing['bedrooms_per_room'] = housing['total_bedrooms'] / rooms

In [None]:
# Recompute the correlation matrix for the frame as it stands in this cell.
# Restrict to numeric columns: the string column 'ocean_proximity' makes a bare
# housing.corr() raise in pandas >= 2.0, where non-numeric columns are no longer dropped.
cor = housing.select_dtypes(include='number').corr()

In [None]:
# Correlation of each column in `cor` with median_house_value.
cor['median_house_value']

In [None]:
# Preview the current state of the `housing` frame.
housing.head()

In [None]:
# Separate the training partition into predictors and target.
# From here on `housing` refers to the training predictors, not the full dataset.
housing = train.drop(columns='median_house_value')
# Copy the target so later changes to `train` cannot affect the labels.
housing_labels = train['median_house_value'].copy()

In [None]:
# Dtypes and non-null counts of the training predictors.
housing.info()

In [None]:
# Preview the held-out test partition.
test.head()

In [None]:
# (rows, columns) of the test partition.
test.shape

In [None]:
# Separate the test partition into predictors and target, mirroring the training split.
test_housing = test.drop(columns='median_house_value')
test_labels = test['median_house_value'].copy()

In [None]:
# Confirm the container types of the predictor frames and the test labels.
type(test_housing),type(test_labels),type(housing)

In [None]:
# Number of labels in the training and test partitions.
housing_labels.shape,test_labels.shape

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Imputer that replaces missing values with the per-column median.
imputer = SimpleImputer(strategy = 'median')

In [None]:
# Median imputation only applies to numeric data, so drop the categorical column.
housing_num = housing.drop('ocean_proximity',axis = 1)
# NOTE(review): test_housing is overwritten in place, so re-running this cell fails
# because 'ocean_proximity' has already been removed.
test_housing = test_housing.drop('ocean_proximity',axis = 1)

In [None]:
# Learn the per-column medians from the TRAINING predictors only.
# A second fit on the test predictors would discard these medians and replace them with
# test-set statistics, which leaks test information into preprocessing and makes the
# training data get imputed with values the model should never have seen.
imputer.fit(housing_num)

In [None]:
# Per-column medians learned by the most recent imputer.fit call.
imputer.statistics_

In [None]:
# Imputed test predictors as a NumPy array (this rebinds the earlier `x` series).
x = imputer.transform(test_housing)

In [None]:
# Imputed training predictors as a NumPy array.
X = imputer.transform(housing_num)
# NOTE(review): this result is overwritten in the next cell, which rebuilds
# test_housing from `x` instead.
test_housing = imputer.transform(test_housing)

In [None]:
# Wrap the arrays back into DataFrames to restore the column names.
# The original row index is not passed along, so both frames get a fresh 0..n-1 index.
housing_transformed = pd.DataFrame(X,columns = housing_num.columns)
test_housing = pd.DataFrame(x,columns = housing_num.columns)

In [None]:
# Check dtypes and non-null counts after imputation (training).
housing_transformed.info()

In [None]:
# Check dtypes and non-null counts after imputation (test).
test_housing.info()

In [None]:
# Categorical column for the training predictors and for the test partition.
housing_cat = housing["ocean_proximity"]
test_cat = test['ocean_proximity']

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Fit the encoder on the training categories and convert them to integer codes.
# NOTE(review): integer codes impose an arbitrary ordering on the categories;
# a one-hot encoding may suit the linear model better - confirm intent.
housing_catencoded = encoder.fit_transform(housing_cat)
housing_catencoded

In [None]:
# Encode the test categories with the mapping already learned from the training data.
# Re-fitting here (fit_transform) would rebuild the mapping from the test set alone, so
# the same category could get a different integer code if the test set lacks one of the
# training categories. transform() raises ValueError on a category unseen during fit.
test_catencoded = encoder.transform(test_cat)

In [None]:
# Lengths of the encoded training and test category arrays.
housing_catencoded.shape,test_catencoded.shape

In [None]:
# Show the integer codes assigned to the test rows.
test_catencoded

In [None]:
from sklearn.preprocessing import StandardScaler
stdscl = StandardScaler()
# Learn the per-column mean and standard deviation from the training predictors and scale them.
housing_prepared = stdscl.fit_transform(housing_transformed)
# Scale the test predictors with the TRAINING statistics. Calling fit_transform here would
# re-fit the scaler on the test set, putting train and test on different scales and
# leaking test information into preprocessing.
test_prepared = stdscl.transform(test_housing)

In [None]:
# The scaler returns NumPy arrays; wrap them in DataFrames to restore the column names.
housing_prepared = pd.DataFrame(housing_prepared,columns = housing_transformed.columns)
test_prepared = pd.DataFrame(test_prepared,columns = housing_transformed.columns)

In [None]:
# Append the integer-encoded category to the scaled training predictors (the codes are not scaled).
housing_prepared['ocean_proximity'] = housing_catencoded

In [None]:
# Preview the final training design matrix.
housing_prepared.head()

In [None]:
# Append the integer-encoded category to the scaled test predictors.
test_prepared['ocean_proximity'] = test_catencoded

In [None]:
# Dtypes and non-null counts of the final test design matrix.
test_prepared.info()

In [None]:
from sklearn.linear_model import LinearRegression
linr = LinearRegression()

In [None]:
 linr.fit(housing_prepared,housing_labels)

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the same data the model was fit on, so this is a training error,
# not an estimate of generalisation error.
housing_predictions = linr.predict(housing_prepared)
err = mean_squared_error(housing_labels,housing_predictions)

In [None]:
# Root of the MSE puts the error back in the units of median_house_value.
lin_rmse = np.sqrt(err)
lin_rmse

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error on the training data.
abserr = mean_absolute_error(housing_labels,housing_predictions)
abserr

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so repeated runs give the same model and scores, consistent with the
# seeded train/test split (random_state = 42).
dcs = DecisionTreeRegressor(random_state = 42)

In [None]:
# Fit the decision tree on the prepared training data.
dcs.fit(housing_prepared,housing_labels)

In [None]:
# Predict on the training data itself (rebinds housing_predictions from the linear model).
housing_predictions = dcs.predict(housing_prepared)

In [None]:
from sklearn.metrics import mean_squared_error
# Training MSE of the tree.
dcs_reg = mean_squared_error(housing_labels,housing_predictions)

In [None]:
# Convert MSE to RMSE; the name is reused for the result.
dcs_reg = np.sqrt(dcs_reg)

In [None]:
# Training RMSE of the tree - measured on the data it was fit on, so it says
# nothing about how well the tree generalises.
dcs_reg

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the tree; the scorer returns NEGATIVE mean squared error.
dcscore = cross_val_score(dcs,housing_prepared,housing_labels,cv =10,scoring = 'neg_mean_squared_error')

In [None]:
# Flip the sign back to a positive MSE, then take the root to get per-fold RMSE.
rmse_Scores = np.sqrt(-dcscore)

In [None]:
# Per-fold RMSE values.
rmse_Scores

In [None]:
# Mean and spread of the RMSE across the folds.
print(rmse_Scores.mean())
print(rmse_Scores.std())

In [None]:
# 10-fold cross-validation of the linear model, scored as negative mean squared error.
lin_scores = cross_val_score(linr,housing_prepared,housing_labels,cv = 10, scoring = 'neg_mean_squared_error')

In [None]:
# Convert the negative MSE values to per-fold RMSE; the name is reused for the result.
lin_scores = np.sqrt(-lin_scores)

In [None]:
# Per-fold RMSE, then its mean and standard deviation.
print(lin_scores)
print(lin_scores.mean())
print(lin_scores.std())

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest: bootstrap sampling and feature sub-sampling are random, so without a
# random_state every run produces different models, scores and grid-search results.
rndmfrst = RandomForestRegressor(random_state = 42)

In [None]:
# Fit the random forest on the prepared training data.
rndmfrst.fit(housing_prepared,housing_labels)

In [None]:
# Predict on the training data itself (rebinds housing_predictions again).
housing_predictions = rndmfrst.predict(housing_prepared)

In [None]:
# Show the predicted values.
housing_predictions

In [None]:
# Training MSE of the forest.
rndmerr = mean_squared_error(housing_labels,housing_predictions)

In [None]:
# Convert MSE to RMSE; the name is reused for the result.
rndmerr = np.sqrt(rndmerr)

In [None]:
# rndmerr is a single scalar (the training RMSE of the forest), so mean() would just
# repeat the value and std() would always be 0; report the value once.
print(rndmerr)

In [None]:
# 10-fold cross-validation of the random forest, scored as negative mean squared error.
rndmscores = cross_val_score(rndmfrst,housing_prepared,housing_labels,cv = 10,scoring = 'neg_mean_squared_error')

In [None]:
# Convert the negative MSE values to per-fold RMSE; the name is reused for the result.
rndmscores = np.sqrt(-rndmscores)
# Mean RMSE, the per-fold values, then their standard deviation.
print(rndmscores.mean())
print(rndmscores)
print(rndmscores.std())

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched: the first varies tree count and max_features with the
# estimator's default bootstrap setting, the second repeats a smaller sweep with
# bootstrap disabled.
param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

# 5-fold cross-validated search over the forest, scored as negative mean squared error.
grid_search = GridSearchCV(
    rndmfrst,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [None]:
# Run the search: every parameter combination is cross-validated on the training data.
grid_search.fit(housing_prepared,housing_labels)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# best_score_ is a negative MSE (see the scoring argument), so negate it and take the root for RMSE.
print(np.sqrt(-grid_search.best_score_))

In [None]:
# The estimator configured with the best parameters.
grid_search.best_estimator_

In [None]:
# Full cross-validation results, keyed by result name.
cvres = grid_search.cv_results_

In [None]:
# RMSE for every parameter combination that was tried.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)


In [None]:
# Relative importance the best forest assigns to each input column.
feature_importances = grid_search.best_estimator_.feature_importances_

In [None]:
feature_importances

In [None]:
attribs = list(housing_prepared.columns)

In [None]:
 sorted(zip(feature_importances, attribs), reverse=True)

In [None]:
# Use the best estimator found by the grid search as the final model.
final_model = grid_search.best_estimator_

In [None]:
# Predict on the held-out test predictors.
final_predictions = final_model.predict(test_prepared) 

In [None]:
# Test-set mean squared error.
final_mse = mean_squared_error(test_labels,final_predictions)

In [None]:
# Test-set RMSE, in the units of median_house_value.
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse