In [5]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from scipy.stats import randint
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
# reading
df = pd.read_csv("housing_prices.csv")

In [6]:
df = df.set_index('Id')
df.isna().sum()


MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64

In [7]:
y = df['SalePrice'].copy()
X = df.drop(columns=['SalePrice']).copy()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=1230000)


### preprocessing pipeline
Preprocessing pipelines: Those pipelines only transform the predictor features (the X) by filling NAs, encoding categorical features, scaling, etc. You always have to fit them with X_train. Then, you can call the .transform() method to transform both the X_train and the X_test. (Sometimes, you fit and transform X_train in a single step, by using the .fit_transform() method, but you're still performing these 2 separate steps). Any time that you call transform() you get as an output the transformed data, X_train or X_test.

In [9]:
X_num_col = X_train.select_dtypes(include="number").copy().columns
X_cat_col = X_train.select_dtypes(exclude="number").copy().columns

In [10]:
num_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    MinMaxScaler())
    #StandardScaler())

cat_pipe = make_pipeline(
    #SimpleImputer(strategy="constant", fill_value='NaN'),
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop="first", handle_unknown="ignore", sparse=False))
    #OneHotEncoder(drop="first"))

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", num_pipe, X_num_col),
        ("cat_pipe", cat_pipe, X_cat_col)])

Create your preprocessing pipeline, let's say it's full_pipeline.
Fit it to the train set: full_pipeline.fit(X_train)
Transform the train set: X_train_preprocessed = full_pipeline.transform(X_train)
Transform the test set: X_test_preprocessed = full_pipeline.transform(X_test) Note that so far we have not needed neither the y_train nor the y_test. This is because we are not modelling yet. We are just preprocessing data (cleaning, transforming, wrangling, preparing, imputing, encoding... only the predictors! not the target!)
Fit a model to the preprocessed train set:

In [12]:
preprocessor.fit_transform(X_train).shape

(1168, 241)

In [13]:
preprocessor.fit_transform(X_test).shape

(292, 202)

In [16]:
performances = {}

### Quick baseline models (baseline for performance)

In [17]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

#dt = Decision tree
full_pipe_dt = make_pipeline ( preprocessor, DecisionTreeClassifier())

full_pipe_dt.fit(X_train, y_train)

tree_pred = full_pipe_dt.predict(X_test)

performances["baseline_tree"]= r2_score(y_test, tree_pred)
performances




{'baseline_tree': 0.48161634210415794}

# K Nearest neighbors

from sklearn.neighbors import KNeighborsRegressor
full_pipe_kn = make_pipeline(
    preprocessor,
    KNeighborsRegressor(n_neighbors=3))

full_pipe_kn.fit(X_train, y_train)

kn_pred = full_pipe_kn.predict(X_test)

performances["baseline_kn"]= r2_score(y_test, kn_pred)
performances

In [18]:
# LR
from sklearn.linear_model import LinearRegression
full_pipe_LR = make_pipeline(
    preprocessor,
    LinearRegression())

full_pipe_LR.fit(X_train, y_train)

LR_pred = full_pipe_LR.predict(X_test)

performances["baseline_LR"]= r2_score(y_test, LR_pred)
performances




{'baseline_tree': 0.48161634210415794, 'baseline_LR': -4023534770102086.0}

### PCA

In [19]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

full_pipe_dt = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    DecisionTreeClassifier(max_depth=5))

full_pipe_dt.fit(X_train, y_train)

tree_pred = full_pipe_dt.predict(X_test)

performances["PCA95_tree"]= r2_score(y_test, tree_pred)
performances



{'baseline_tree': 0.48161634210415794,
 'baseline_LR': -4023534770102086.0,
 'PCA95_tree': 0.18281039163329094}

In [20]:
# K Nearest neighbors
from sklearn.neighbors import KNeighborsRegressor
full_pipe_kn = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    KNeighborsRegressor(n_neighbors=3))

full_pipe_kn.fit(X_train, y_train)

kn_pred = full_pipe_kn.predict(X_test)

performances["PCA95_kn"]= r2_score(y_test, kn_pred)
performances



{'baseline_tree': 0.48161634210415794,
 'baseline_LR': -4023534770102086.0,
 'PCA95_tree': 0.18281039163329094,
 'PCA95_kn': 0.6793383451055091}

In [None]:
# LR
from sklearn.linear_model import LinearRegression
full_pipe_LR = make_pipeline(
    preprocessor,
    PCA(n_components=0.95),
    LinearRegression())

full_pipe_LR.fit(X_train, y_train)

LR_pred = full_pipe_LR.predict(X_test)

performances["PCA95_LR"]= r2_score(y_test, LR_pred)
performances

In [None]:
preprocessor.fit(X_train)

X_scaled = preprocessor.transform(X_train)

pca = PCA(n_components=None) # pick number of components
X_reduced = pca.fit_transform(X_scaled)
np.round(pca.explained_variance_ratio_, 4)*100

In [None]:
cumsum = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(cumsum, label = "Explained variance");

In [None]:
cumsum = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(cumsum, label = "Explained variance");

d = np.argmax(cumsum >= 0.95) # with how many components do we get 0.95 of the variance

ax.plot([d, d], [0, 0.95], "k:")
ax.plot([0, d], [0.95, 0.95], "k:")
ax.plot(d, 0.95, "ko")

ax.set_ylabel("Explained Variance")
ax.set_xlabel("Number of Components");

In [None]:
np.argmax(cumsum >= 0.95) # with how many components do we get 0.95 of the variance