In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook, we will use scikit-learn pipelines to fit one model at a time and compare the performance of the different models.

In [None]:
# Load the California housing dataset from the Kaggle input directory and preview the first rows
housing_csv = '/kaggle/input/california-housing-prices/housing.csv'
df = pd.read_csv(housing_csv)
df.head()

In [None]:
# Column dtypes and non-null counts: a first look at the variable types and where values are missing
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numerical variables
df.describe()

In [None]:
# List the distinct values taken by the ocean_proximity column
df['ocean_proximity'].unique()

In [None]:
#Importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Scatter-plot matrix of the pairwise relationships in the data, with KDE curves on the diagonal
sns.pairplot(data=df, diag_kind='kde')

1. Longitude and Latitude are related to each other. 

2. As housing_median_age increases, the total number of rooms and the total number of bedrooms decrease.

3. The total number of bedrooms is positively correlated with the total number of rooms and the number of households.



In [None]:
# Count the missing (null) values in each column of the raw data
df.isnull().sum()

In [None]:
# We will use IterativeImputer for dealing with missing values here
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Initialise the IterativeImputer with LinearRegression as the per-feature estimator
it = IterativeImputer(estimator=LinearRegression())

# Drop ocean_proximity before imputing. The author's rationale is that it adds no
# significant value for predicting median_house_value.
# NOTE(review): it is presumably a string/categorical column, which the regression-based
# imputer could not consume without encoding anyway — confirm.
# `columns=` is used because passing the axis positionally (`drop(label, 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
demo = df.drop(columns='ocean_proximity')

# Fit the imputer and fill the null values. fit_transform returns a bare NumPy array,
# so the column labels and the row index are restored when rebuilding the DataFrame.
# NOTE(review): the imputer is fitted on every row and on the target column
# (median_house_value) before the train/test split, so information from the held-out
# rows and from the target leaks into the imputed features — consider moving the
# imputer inside the modelling pipeline.
data = pd.DataFrame(it.fit_transform(demo), columns=demo.columns, index=demo.index)

data.head()

In [None]:
# Descriptive statistics recomputed on the imputed data, for comparison with the raw-data summary
data.describe()

In [None]:
# Confirm the imputation: per-column null counts of the imputed data
data.isnull().sum()

In [None]:
# Visualise the strength of the pairwise correlations between the imputed columns
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
#Importing libraries for various models to use in Pipelines
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA, KernelPCA
from xgboost import XGBRegressor



In [None]:
# Create the train and test datasets: every column except the target is a feature.
# `columns=` is used because passing the axis positionally (`drop(label, 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X = data.drop(columns='median_house_value')
y = data['median_house_value']

# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=0)

In our first pipeline we will use a Power Transformer for outlier treatment and scaling followed by a Linear Regression Model

In [None]:

# Baseline pipeline: power-transform the features, then fit an ordinary linear regression
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('lr', LinearRegression()),
])
pipe.fit(X_train, y_train)

# Pipeline.score delegates to the final regressor's score, which is the coefficient of
# determination (R^2), not a classification accuracy, so the output is labelled as R^2.
print('Testing R^2', pipe.score(X_test, y_test))
print('Training R^2', pipe.score(X_train, y_train))

Now we will use a Power Transformer for outlier treatment and scaling, followed by an XGBoost regressor (XGBRegressor).

In [None]:

# Power-transform the features, then fit a gradient-boosted tree regressor with 200 trees.
# The final step is named 'xgb' (it was 'lr', a leftover from the linear-regression cell).
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('xgb', XGBRegressor(n_estimators=200)),
])
pipe.fit(X_train, y_train)

# Pipeline.score delegates to the final regressor's score, which is the coefficient of
# determination (R^2), not a classification accuracy, so the output is labelled as R^2.
print('Testing R^2', pipe.score(X_test, y_test))
print('Training R^2', pipe.score(X_train, y_train))

To keep the maximum information with the least noise, we will use a Power Transformer, apply PCA to the data, and then fit an XGBoost regressor (XGBRegressor).

In [None]:
# Power-transform the features, project them onto the first 4 principal components,
# then fit a gradient-boosted tree regressor.
# The final step is named 'xgb' (it was 'lr', a leftover from the linear-regression cell).
# NOTE(review): this cell uses n_estimators=100 while the other XGBoost pipelines in this
# notebook use 200 or 500, which confounds the PCA vs KernelPCA comparison — confirm
# whether this is intentional.
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('pca', PCA(n_components=4)),
    ('xgb', XGBRegressor(n_estimators=100)),
])
pipe.fit(X_train, y_train)

# Pipeline.score delegates to the final regressor's score, which is the coefficient of
# determination (R^2), not a classification accuracy, so the output is labelled as R^2.
print('Testing R^2', pipe.score(X_test, y_test))
print('Training R^2', pipe.score(X_train, y_train))

We will now compare the results of using KernelPCA instead of PCA (as used in the previous pipeline) on the data, followed by an XGBoost regressor (XGBRegressor).

In [None]:
# Power-transform the features, apply KernelPCA, then fit a gradient-boosted tree
# regressor with 200 trees.
# Steps are named after what they hold: 'kpca' (was 'pca') and 'xgb' (was 'lr').
# NOTE(review): KernelPCA() is used with its defaults (linear kernel, no n_components
# limit per the scikit-learn docs), so the gain over the PCA(n_components=4) cell may come
# mainly from keeping more components rather than from a non-linear kernel — confirm.
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('kpca', KernelPCA()),
    ('xgb', XGBRegressor(n_estimators=200)),
])
pipe.fit(X_train, y_train)

# Pipeline.score delegates to the final regressor's score, which is the coefficient of
# determination (R^2), not a classification accuracy, so the output is labelled as R^2.
print('Testing R^2', pipe.score(X_test, y_test))
print('Training R^2', pipe.score(X_train, y_train))

As we can see, using Kernel PCA on this dataset gives a better score (R²) than PCA, and it can be pushed higher still.

We will now compare the results with an increased number of estimators for the XGBoost regressor (XGBRegressor).

In [None]:
# Same KernelPCA pipeline as before, but with the number of boosted trees raised to 500.
# Steps are named after what they hold: 'kpca' (was the typo 'pa') and 'xgb' (was 'lr').
pipe = Pipeline([
    ('pt', PowerTransformer()),
    ('kpca', KernelPCA()),
    ('xgb', XGBRegressor(n_estimators=500)),
])
pipe.fit(X_train, y_train)

# Pipeline.score delegates to the final regressor's score, which is the coefficient of
# determination (R^2), not a classification accuracy, so the output is labelled as R^2.
print('Testing R^2', pipe.score(X_test, y_test))
print('Training R^2', pipe.score(X_train, y_train))

As we can see, using pipelines to build a model is much easier and cleaner than writing each preprocessing and modelling step by hand.