In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectFpr, f_regression
from sklearn.cluster import MeanShift

In [None]:
# Read the housing CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer = '../input/california-housing-prices/housing.csv')

In [None]:
# High-level view of the data

# info() prints its summary as a side effect, so it runs first; the transposed
# head is kept as the last expression so that the notebook renders it as the
# cell output instead of silently discarding it.
data.info()
data.head().T

In [None]:
# Formatting the columns: Not required

In [None]:
# Splitting the data
y_col = ['median_house_value']
# Features are every column except the target; the target stays a one-column frame
data_x = data.drop(columns = y_col)
data_y = data.loc[:, y_col]
train_x, test_x, train_y, test_y = train_test_split(
    data_x,
    data_y,
    test_size = 0.25,
    random_state = 42,
    shuffle = True)
all_cols = list(data_x.columns)
# Classify by "is numeric" rather than "is not object": columns stored with a
# dedicated string or categorical dtype are not `object`, and an `!= object`
# test would file them under the numeric features.
numeric_cols = [col for col in all_cols if pd.api.types.is_numeric_dtype(data_x[col])]
cat_cols = [col for col in all_cols if col not in numeric_cols]

In [None]:
# Derive additional features: Manual

datasets = [train_x, test_x]
colname = 'pop_per_household'
# The loop variable is deliberately not named `data`, so the raw DataFrame
# loaded at the top of the notebook is not rebound to the last split.
for dataset in datasets:
    # Population per household = population / households
    dataset[colname] = dataset['population'] / dataset['households']
# Guarded so that re-running this cell does not register the column twice
if colname not in numeric_cols:
    numeric_cols.append(colname)

In [None]:
# Derive additional features: Systematic: Kernel-based: None


In [None]:
# Derive additional features: Systematic: Non Kernel-based: None

In [None]:
# Exploratory Data Analysis: Univariate

def return_outlier_perc(ser, threshold = 3):
    """Return the fraction of non-missing values that are z-score outliers.

    A value is an outlier when the absolute value of its z-score (computed
    from the mean and sample standard deviation of the non-missing values)
    is greater than or equal to `threshold`.

    Parameters
    ----------
    ser : pandas.Series
        Numeric series; missing values are ignored.
    threshold : number, default 3
        Z-score magnitude at or beyond which a value counts as an outlier.

    Returns
    -------
    float
        Share of non-missing values flagged as outliers. NaN when the series
        has no non-missing values; 0.0 when the standard deviation is zero or
        undefined (constant or single-value series), since no z-score exists.
    """
    ser_nm = ser.dropna()
    if ser_nm.empty:
        # Nothing to measure: the share is undefined rather than an error
        return float('nan')
    spread = ser_nm.std()
    if pd.isna(spread) or spread == 0:
        # Dividing by a zero/undefined spread would yield NaN z-scores, which
        # must not be mistaken for outliers
        return 0.0
    z_ser_nm = (ser_nm - ser_nm.mean()) / spread
    # Mean of a boolean mask == count of True / number of non-missing values
    return (z_ser_nm.abs() >= threshold).mean()
def return_negatives_perc(ser):
    """Return the fraction of non-missing values that are strictly negative.

    Parameters
    ----------
    ser : pandas.Series
        Numeric series; missing values are ignored.

    Returns
    -------
    float
        Share of non-missing values below zero, or NaN when the series has no
        non-missing values.
    """
    ser_nm = ser.dropna()
    if ser_nm.empty:
        # No observations: the share is undefined rather than a division error
        return float('nan')
    # Mean of a boolean mask == count of True / number of non-missing values
    return (ser_nm < 0).mean()
def return_positives_perc(ser):
    """Return the fraction of non-missing values that are strictly positive.

    Parameters
    ----------
    ser : pandas.Series
        Numeric series; missing values are ignored.

    Returns
    -------
    float
        Share of non-missing values above zero, or NaN when the series has no
        non-missing values.
    """
    ser_nm = ser.dropna()
    if ser_nm.empty:
        # No observations: the share is undefined rather than a division error
        return float('nan')
    # Mean of a boolean mask == count of True / number of non-missing values
    return (ser_nm > 0).mean()
# Numeric features: describe() statistics (one row per column) extended with
# extra per-column diagnostics.
n_train = train_x.shape[0]
univariate_numeric = train_x[numeric_cols].describe().transpose()
# Columns that describe() reported on, in the same order as the summary rows
described = train_x[univariate_numeric.index]

univariate_numeric['perc_missing'] = 1 - univariate_numeric['count'] / n_train
univariate_numeric['skew'] = described.apply(lambda column: column.skew())
univariate_numeric['kurt'] = described.apply(lambda column: column.kurt())
# 'cov' is the ratio of standard deviation to mean
univariate_numeric['cov'] = univariate_numeric['std'] / univariate_numeric['mean']
for label, metric in (
        ('perc_outliers', return_outlier_perc),
        ('perc_negatives', return_negatives_perc),
        ('perc_positives', return_positives_perc)):
    univariate_numeric[label] = described.apply(metric)

# Categorical features: describe() statistics plus missing share and the
# relative frequency of the most common level.
univariate_categorical = train_x[cat_cols].describe().transpose()
univariate_categorical['perc_missing'] = 1 - univariate_categorical['count'] / n_train
univariate_categorical['top_freq'] = univariate_categorical['freq'] / univariate_categorical['count']

In [None]:
# Bare expression: shows the numeric summary table as this cell's output
univariate_numeric

In [None]:
# Prepare data for bivariate and row-wise analysis

train_x_temp = train_x.copy(deep = False)
# Placeholders: list column names here to exclude them from the analysis frame
missing_cols = []
constant_cols = []
irrelevant_cols = []
cols_to_remove = missing_cols + constant_cols + irrelevant_cols
train_x_temp = train_x_temp.loc[:, [col for col in train_x_temp.columns if col not in cols_to_remove]]
# Complete-case filter: keep only rows with no missing value in any kept column
nm_mask = ~(pd.isna(train_x_temp).any(axis = 1))
train_x_temp = train_x_temp.loc[nm_mask, :]
identifier = train_x_temp.index
all_cols_temp = list(train_x_temp.columns)
numeric_cols_temp = [col for col in numeric_cols if col not in cols_to_remove]
cat_cols_temp = [col for col in cat_cols if col not in cols_to_remove]
numeric_data = train_x_temp.loc[:, [col for col in train_x_temp.columns if col in numeric_cols_temp]]
cat_data = train_x_temp.loc[:, [col for col in train_x_temp.columns if col in cat_cols_temp]]
# The encoder is built with default arguments and its output densified by
# hand: the keyword that requests dense output was renamed between
# scikit-learn releases (`sparse` -> `sparse_output`), so neither spelling
# works on every version.
ohe = OneHotEncoder()
cat_ohe_values = ohe.fit_transform(X = cat_data.values)
if hasattr(cat_ohe_values, 'toarray'):
    cat_ohe_values = cat_ohe_values.toarray()
# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(ohe, 'get_feature_names_out'):
    cat_ohe_names = ohe.get_feature_names_out()
else:
    cat_ohe_names = ohe.get_feature_names()
cat_ohe_data = pd.DataFrame(cat_ohe_values, index = identifier, columns = cat_ohe_names)
# Both frames share `identifier` as their index, so the column-wise concat
# lines rows up one-to-one.
train_x_analysis = pd.concat(
    objs = (numeric_data, cat_ohe_data),
    axis = 1,
    join = 'outer',
    ignore_index = False)
train_x_analysis.index = identifier

In [None]:
# Exploratory Data Analysis: Bivariate
# Restrict the target to the rows kept by the complete-case mask
train_y_analysis = train_y[nm_mask]
def return_scores(y):
    """Return the `f_regression` score of every analysis feature against `y`.

    Fits a `SelectFpr` selector on the module-level `train_x_analysis` frame
    and returns its `scores_` attribute; the selector is used only to obtain
    the scores, no feature selection is applied.

    Parameters
    ----------
    y : array-like
        Target values, one per row of `train_x_analysis`. A Series or a
        single-column DataFrame is accepted.

    Returns
    -------
    numpy.ndarray
        One score per column of `train_x_analysis`, in column order.
    """
    fs_fpr = SelectFpr(
        score_func = f_regression,
        alpha = 0.05)
    # f_regression expects a 1-D target; flattening a single-column frame here
    # avoids scikit-learn's column-vector DataConversionWarning.
    fs_fpr.fit(X = train_x_analysis, y = np.ravel(y))
    return fs_fpr.scores_
# Feature-vs-target scores, strongest first
feature_names = train_x_analysis.columns
bivariate_x_y = pd.DataFrame(
    return_scores(y = train_y_analysis),
    index = feature_names,
    columns = ['Importances'])
bivariate_x_y = bivariate_x_y.sort_values(by = 'Importances', axis = 0, ascending = False)

# Feature-vs-feature scores: each row holds the scores of all features when
# the row's own column is used as the target.
pairwise_scores = []
for target_name in feature_names:
    pairwise_scores.append(return_scores(y = train_x_analysis[target_name]))
bivariate_x_x = pd.DataFrame(
    pairwise_scores,
    index = feature_names,
    columns = feature_names)

In [None]:
# Exploratory Data Analysis: Row-wise analysis

# Standardize every column so that no single feature dominates the distance
# computations of the clustering step because of its scale.
train_x_normalized = pd.DataFrame(
    StandardScaler().fit_transform(X = train_x_analysis.values),
    index = identifier,
    columns = train_x_analysis.columns)
# Cluster the standardized rows (not the raw frame): `groups` holds one
# cluster label per row of `train_x_normalized`.
groups = MeanShift().fit_predict(X = train_x_normalized)