In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the Data

In [None]:
# Read the California housing CSV from the Kaggle input directory into a DataFrame.
housing_csv_path = '/kaggle/input/california-housing-prices/housing.csv'
data = pd.read_csv(housing_csv_path)

# Data Exploration

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Summarize the columns: dtypes and non-null counts (useful for spotting missing values).
data.info()

In [None]:
# Descriptive statistics of the numeric columns.
data.describe()

In [None]:
# Number of rows per 'ocean_proximity' category.
data['ocean_proximity'].value_counts()

# Data Visualization

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Background map image that is drawn underneath the scatter plot.
california_img = mpimg.imread('../input/california-housing-feature-engineering/california.png')

# One point per row: marker size follows population, color follows median_house_value.
marker_sizes = data['population'] / 100
data.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=marker_sizes,
    label='population',
    figsize=(20, 14),
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
)

# Overlay the image on the current axes; extent is [x_min, x_max, y_min, y_max],
# i.e. longitude and latitude bounds given the axes chosen above.
map_extent = [-124.55, -113.80, 32.45, 42.05]
plt.imshow(california_img, extent=map_extent, alpha=0.5)
plt.show()

## Histograms 

In [None]:
# Histograms (50 bins) of the numeric columns; assigning to `_` keeps the returned axes out of the cell output.
_ = data.hist(bins=50, figsize=(20,15))

In [None]:
# Per-column non-null counts for the rows whose median_house_value is >= 500001.
data[data['median_house_value'] >= 500001].count()

# Data Preprocessing

## Capped variables 

Delete the rows with 'median_house_value' >= 500001 (the capped values).

In [None]:
# Keep only the rows whose median_house_value is below 500001.
below_cap = data['median_house_value'] < 500001
data = data[below_cap]

#### Delete 5 ISLAND data points

In [None]:
# Drop the ISLAND rows and renumber the index so it is contiguous again.
# reset_index returns a new frame, so its result has to be assigned (a bare
# call has no effect); drop=True avoids adding the old index as a column.
df = data[data.ocean_proximity != 'ISLAND'].reset_index(drop=True)
df.head()

In [None]:
# Histograms (50 bins) of the numeric columns of the filtered frame.
_ = df.hist(bins=50, figsize=(20,15))

## Categorical variable: ocean proximity (encode it)

Create 4 new columns: INLAND, <1H OCEAN, NEAR BAY and NEAR OCEAN.

In [None]:
# Names of the indicator columns, one per remaining ocean_proximity category.
c = ["INLAND", "<1H OCEAN", "NEAR BAY", "NEAR OCEAN"]

# Add every indicator column, initialised to 0 for all rows.
zero_columns = {name: 0 for name in c}
df = df.assign(**zero_columns)
df.head()

Fill the 4 new columns with 1 or 0 depending on the values of ocean_proximity

In [None]:
# Set each indicator column to 1 on the rows whose ocean_proximity equals the column name.
for key in ['INLAND', '<1H OCEAN', 'NEAR BAY', 'NEAR OCEAN']:
    df.loc[df["ocean_proximity"] == key, key] = 1
# Show a preview only; a bare `df` renders the whole frame as the cell output.
df.head()

Delete 'ocean_proximity' column 

In [None]:
# Remove the original text column.
df = df.drop('ocean_proximity', axis='columns')
df.head()

Reorder Dataframe columns

In [None]:
# Show the current column order.
col = df.columns.tolist()
print("Current columns names: \n", col)

# New order: indicator columns first, then the numeric features, then median_house_value last.
indicator_cols = ['INLAND', '<1H OCEAN', 'NEAR BAY', 'NEAR OCEAN']
numeric_cols = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
]
new_cols = indicator_cols + numeric_cols + ['median_house_value']
df = df.reindex(columns=new_cols)
print("Reordered columns names: \n", df.columns.tolist())

## Missing values

#### Get rows with NaN values only

In [None]:
# Select the rows that have a missing value in at least one column.
rows_with_nan = df.isnull().any(axis=1)
df_NaN = df[rows_with_nan]
print("Number of NaN values:\n", df.isnull().sum(axis = 0))
print(df_NaN.head())

# Save the incomplete rows to a CSV file.
df_NaN.to_csv('df_NaN.csv')
### Note: there are 207 missing values in the 'total_bedrooms' column

#### Get average and median values of columns:

In [None]:
# Columns whose mean and median are needed for imputing the missing bedroom counts.
rooms_col = df["total_rooms"]
bedrooms_col = df["total_bedrooms"]
income_col = df["median_income"]

# Means.
ave_total_rooms, ave_total_bedrooms, ave_median_income = (
    rooms_col.mean(), bedrooms_col.mean(), income_col.mean()
)

# Medians.
med_total_rooms, med_total_bedrooms, med_median_income = (
    rooms_col.median(), bedrooms_col.median(), income_col.median()
)

#### Find the locations of the NaN values in the 'total_bedrooms' column and replace them with:
    ave_total_bedrooms / ave_total_rooms * total_rooms
    or
    med_total_bedrooms / med_total_rooms * total_rooms

# Case 1: Replace with average values calculation
# (reads from `df`; the name `df_digit` used before is not defined anywhere in this notebook)
df_digit_filled = df["total_bedrooms"].fillna(ave_total_bedrooms / ave_total_rooms * df["total_rooms"])
print("Number of NaN values after filling:", df_digit_filled.isnull().sum(axis = 0))
df_digit_filled.to_csv('df_digit_filled.csv')

In [None]:
# Case 2: Replace with median values calculation
# Estimate each missing bedroom count as (median bedrooms / median rooms) * the row's total_rooms.
median_bedroom_ratio = med_total_bedrooms / med_total_rooms
estimated_bedrooms = median_bedroom_ratio * df["total_rooms"]
df["total_bedrooms"] = df["total_bedrooms"].fillna(estimated_bedrooms)
print("Number of NaN values after filling:\n", df.isnull().sum(axis = 0))
df.to_csv('df_filled.csv')

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# clean_data = data.dropna()

In [None]:
# clean_data.head()

In [None]:
# clean_data.drop('ocean_proximity', axis='columns', inplace=True)

In [None]:
# clean_data.head()

# Feature Engineering

- Polynomial features
- Divide 'total_rooms', 'total_bedrooms' by 'households'
- Find more!! (look at the kernels..)

### Divide 'total_rooms', 'total_bedrooms' by 'households'

In [None]:
# Turn the two totals into per-household values by dividing each by 'households'.
# The column names are kept, so from here on they hold per-household averages.
for totals_col in ("total_bedrooms", "total_rooms"):
    df[totals_col] = df[totals_col] / df['households']
df.head()

## Imbalanced data

In [None]:
from imblearn.over_sampling import SMOTE


## Data Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Keep an independent copy of the unscaled data. A plain `df_origin = df` would
# only alias the same object, so scaling `df` would overwrite the "original" too.
df_origin = df.copy()

# Rescale every column (features and target alike) to the [0, 1] range.
# NOTE(review): the scaler is fit on the full data set, before the train/test split.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_origin)

# Build a new frame from the scaled array instead of writing into `df` in place.
df = pd.DataFrame(scaled_values, columns=df_origin.columns, index=df_origin.index)
df.to_csv('df_scaled.csv')
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Start again from the data saved in df_origin, working on a copy so it is not modified.
df = df_origin.copy()

# Standardize every column (features and target alike) to zero mean and unit variance.
# NOTE(review): the scaler is fit on the full data set, before the train/test split.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

# Use the standardized array computed here; assigning `scaled_values` would
# silently reuse the MinMax result from the previous cell.
df = pd.DataFrame(scaled_df, columns=df_origin.columns, index=df_origin.index)
df.head()

## Data Splitting 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set; the fixed random_state makes the split reproducible.
train, test = train_test_split(df, test_size=0.25, random_state=42)

In [None]:
# info() prints its summary and returns None, so call it as two statements;
# a tuple expression would add a stray `(None, None)` to the cell output.
train.info()
test.info()

# Training a Model 

In [None]:
# Column names of the split data, as a plain list (displayed as the cell output).
col = test.columns.tolist()
col

In [None]:
# The last column is the prediction target; every column before it is a feature.
features, target = col[:-1], col[-1]
features

In [None]:
# Separate the feature columns from the target column for both partitions.
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings.
model = LinearRegression()

In [None]:
# Fit on the training partition; assigning to `_` keeps the estimator repr out of the cell output.
_ = model.fit(X_train, y_train)

# Predict

In [None]:
# Predict the target for the held-out test features.
predictions = model.predict(X_test)

# Model Scoring 

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test set. The target column was scaled together with
# the features, so this value is in scaled units, not in dollars.
mean_squared_error(y_test, predictions)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(y_test, predictions)

In [None]:
# For a regressor, score() returns R^2, i.e. the same quantity as r2_score(y_test, model.predict(X_test)).
model.score(X_test, y_test)