In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Silence every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.simplefilter("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "darkgrid" style for all subsequent plots.
sns.set(style="darkgrid")

In [2]:
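# Download the training data (uncomment and run once if 'train_sem2.csv' is not already in the working directory):
# !wget  -O 'train_sem2.csv' -q 'https://www.dropbox.com/s/6dxq90t0prn2vaw/_train_sem2.csv?dl=0'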

In [3]:
# Load the training table and drop the row identifier, which carries no predictive signal.
DATA_PATH = "train_sem2.csv"

try:
    data = pd.read_csv(DATA_PATH).drop(columns=['Id'])
except FileNotFoundError as exc:
    # Same exception type as before, but with an actionable message instead of a bare errno.
    raise FileNotFoundError(
        f"{DATA_PATH!r} was not found in the working directory. Run the download cell above "
        "(uncomment the wget line) or place the file next to the notebook."
    ) from exc

FileNotFoundError: [Errno 2] No such file or directory: 'train_sem2.csv'

In [None]:
# Numeric-only baseline: hold-out split, mean imputation, scaling, Ridge regression,
# cross-validation, alpha search, and the same model expressed as a Pipeline.

# --- Hold-out split -----------------------------------------------------------------
x = data.drop(columns=['SalePrice'])
y = data['SalePrice']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# --- Target distribution ------------------------------------------------------------
# Every plot gets its own figure; otherwise all plots of this cell are drawn onto the
# same implicit axes and overlay each other.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y_train, kde=True, stat="density", ax=ax)
ax.set_title("SalePrice distribution (train)")

# --- Numeric features, mean-imputed -------------------------------------------------
# The test set is filled with the *training* means so that no test-set statistic
# leaks into preprocessing and both sets are transformed identically.
num_cols = x_train.select_dtypes(include=[np.number]).columns
train_means = x_train[num_cols].mean()
x_train_num = x_train[num_cols].fillna(train_means)
x_test_num = x_test[num_cols].fillna(train_means)

# --- Correlation of each numeric feature with the target ----------------------------
num_corr = x_train_num.corrwith(y_train).sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(y=num_corr.index, x=num_corr.values, ax=ax)
ax.set(title="Correlation with SalePrice (train)", xlabel="Pearson correlation", ylabel="feature")

# --- Scaling ------------------------------------------------------------------------
# Fit and transform on the same container type (DataFrame) so the scaler sees
# consistent feature names; the outputs are plain ndarrays.
scaler = StandardScaler()
obj_train = scaler.fit_transform(x_train_num)
obj_test = scaler.transform(x_test_num)

# --- Ridge on the hold-out split ----------------------------------------------------
ridge = Ridge(alpha=0.5)
ridge.fit(obj_train, y_train)

y_pred = ridge.predict(obj_test)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: ', rmse)

# --- Cross-validated RMSE -----------------------------------------------------------
# NOTE: x / y are rebound here to the imputed numeric table and its target (train rows
# followed by test rows), as in the original cell.
x = pd.concat([x_train_num, x_test_num])
y = pd.concat([y_train, y_test])
# Scaling lives inside the cross-validated estimator so that every fold evaluates the
# same model as the hold-out run above (scaled features + Ridge(alpha=0.5)).
cv_model = make_pipeline(StandardScaler(), Ridge(alpha=0.5))
cv_score = cross_val_score(cv_model, x, y, cv=7, scoring="neg_root_mean_squared_error")
print('RMSE after cross-validation: ', np.mean(-cv_score))

# --- Fitted coefficients ------------------------------------------------------------
# Coefficients are in the column order of x_train_num (the frame the scaler was fitted on).
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(y=x_train_num.columns, x=ridge.coef_, ax=ax)
ax.set(title="Ridge coefficients (scaled features)", xlabel="coefficient", ylabel="feature")

# --- Regularisation strength search -------------------------------------------------
alphas = np.logspace(-2, 3, 20)
searcher = GridSearchCV(Ridge(), [{"alpha": alphas}], scoring="neg_root_mean_squared_error", cv=10)
searcher.fit(obj_train, y_train)

best_alpha = searcher.best_params_['alpha']
print('best alpha: ', best_alpha)

# With a single-parameter grid, cv_results_ follows the order of `alphas`.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(alphas, -searcher.cv_results_['mean_test_score'])
ax.set_xscale('log')
ax.set(title="Cross-validated RMSE vs alpha", xlabel="alpha", ylabel="RMSE")

# --- Same model as a Pipeline -------------------------------------------------------
# The pipeline scales internally, so it is fed the imputed but *unscaled* features.
simple_pipeline = Pipeline([('standardscaler', StandardScaler()),
                            ('ridge', Ridge(alpha=best_alpha))])
model = simple_pipeline.fit(x_train_num, y_train)
y_pred_pipeline = simple_pipeline.predict(x_test_num)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_pipeline))
print('RMSE: ', rmse)

In [None]:
# Impute missing values in the full feature table (numeric and categorical columns).

# Column lists are built in DataFrame order; a set difference would give an order that
# changes between kernel runs.
numerical = list(x_train.select_dtypes(include=np.number))
numerical_lookup = set(numerical)
categorical = [col for col in x_train.columns if col not in numerical_lookup]

# Numeric gaps: fill with the *training* mean in both sets so no test statistic is used.
numerical_means = x_train[numerical].mean()
x_train[numerical] = x_train[numerical].fillna(numerical_means)
x_test[numerical] = x_test[numerical].fillna(numerical_means)

# Categorical gaps: treat "missing" as its own category.
x_train[categorical] = x_train[categorical].fillna('NotGiven')
x_test[categorical] = x_test[categorical].fillna('NotGiven')

In [None]:
# Ridge on all features: one-hot encode the categorical columns, standardise the numeric ones.
column_transformer = ColumnTransformer([
         # handle_unknown='ignore': categories seen only in the test set encode as all zeros.
         ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical),
         ('num', StandardScaler(), numerical)])
pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer),
    ('regression', Ridge())
])

model = pipeline.fit(x_train, y_train)
y_pred = model.predict(x_test)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE: ', rmse)

In [None]:
# Compare the feature-matrix shape before and after the fitted column transformer.
shape_before = x_train.shape
shape_after = column_transformer.transform(x_train).shape
print("Size before OneHot:", shape_before)
print("Size after OneHot:", shape_after)

In [None]:
# Train two separate Ridge pipelines, one per OverallQual segment, and score them jointly.


def _make_column_transformer(categorical_cols, numerical_cols):
    """Return an unfitted transformer: one-hot for categorical columns, scaling for numeric ones."""
    return ColumnTransformer([
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)])


def _make_ridge_pipeline(transformer):
    """Return an unfitted pipeline: the given preprocessing step followed by Ridge()."""
    return Pipeline(steps=[
        ('ohe_and_scaling', transformer),
        ('regression', Ridge())
    ])


# Squared training error of the current model; `mask` keeps rows below the 95th percentile.
# It is only used to report how many rows would remain after dropping the largest errors.
error = (y_train - model.predict(x_train)) ** 2
mask = (error < (np.quantile(error, 0.95)))
print(x_train.shape, x_train[mask].shape)

# plt.scatter(x_train['OverallQual'], y_train)
threshold = 6
thershold = threshold  # backward-compatible alias for the original (misspelled) name

# Segment 1: OverallQual below the threshold; segment 2: everything else.
mask_train = (x_train['OverallQual'] < threshold)
x_train_1 = x_train[mask_train]
x_train_2 = x_train[~mask_train]

y_train_1 = y_train[mask_train]
y_train_2 = y_train[~mask_train]

mask_test = (x_test['OverallQual'] < threshold)
x_test_1 = x_test[mask_test]
x_test_2 = x_test[~mask_test]

# Each segment gets its own independently fitted preprocessing and regressor.
column_transformer_1 = _make_column_transformer(categorical, numerical)
column_transformer_2 = _make_column_transformer(categorical, numerical)

pipeline_1 = _make_ridge_pipeline(column_transformer_1)
pipeline_2 = _make_ridge_pipeline(column_transformer_2)

model_1 = pipeline_1.fit(x_train_1, y_train_1)
model_2 = pipeline_2.fit(x_train_2, y_train_2)

y_pred_1 = model_1.predict(x_test_1)
y_pred_2 = model_2.predict(x_test_2)

# Targets and predictions are concatenated in the same segment order so rows stay paired.
y_test_by_segment = pd.concat([y_test[mask_test], y_test[~mask_test]])
y_pred_by_segment = np.concatenate([y_pred_1, y_pred_2])
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test_by_segment, y_pred_by_segment))
print('RMSE: ', rmse)

In [None]:
# Replace the continuous GrLivArea column with one-hot quantile-bin indicator columns
# named GrLivArea0, GrLivArea1, ...
#
# NOTE: this cell rebinds x_train / x_test without the GrLivArea column, so it cannot be
# re-run on its own; re-run the split and imputation cells first.

# encode='onehot-dense' returns a plain ndarray (no sparse matrix / np.matrix round trip).
discretizer = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='quantile')

# Bin edges are learned from the training rows only and then applied to the test rows.
# A 2-D column selection (double brackets) is used because Series no longer supports
# `series[:, None]` indexing.
train_bins = np.asarray(discretizer.fit_transform(x_train[['GrLivArea']]))
test_bins = np.asarray(discretizer.transform(x_test[['GrLivArea']]))

# Kept for compatibility with the original cell's names: all rows, train first, then test.
x_GrLivArea = pd.concat([x_train['GrLivArea'], x_test['GrLivArea']])
x_GrLivArea_discretized = np.vstack([train_bins, test_bins])

# The discretizer may merge near-empty bins, so take the bin count from the output
# rather than assuming 10.
bin_columns = ['GrLivArea' + str(i) for i in range(train_bins.shape[1])]

# Indexing the indicator frames by the original row labels keeps rows aligned without
# relying on a hard-coded training-set size.
x_train = pd.concat(
    [x_train.drop(columns=['GrLivArea']),
     pd.DataFrame(train_bins, index=x_train.index, columns=bin_columns)],
    axis=1,
)
x_test = pd.concat(
    [x_test.drop(columns=['GrLivArea']),
     pd.DataFrame(test_bins, index=x_test.index, columns=bin_columns)],
    axis=1,
)

In [None]:
# Rebuild the column lists and impute again after GrLivArea was replaced by bin indicators.

numerical_discr = list(x_train.select_dtypes(include=np.number))
# Derive the categorical list from the *current* numeric list. Subtracting the earlier
# `numerical` list would classify the new GrLivArea<i> indicator columns as categorical
# as well as numeric, so they would be both one-hot encoded and scaled.
numerical_discr_lookup = set(numerical_discr)
categorical_discr = [col for col in x_train.columns if col not in numerical_discr_lookup]

# Numeric gaps: fill with the *training* mean in both sets so no test statistic is used.
numerical_discr_means = x_train[numerical_discr].mean()
x_train[numerical_discr] = x_train[numerical_discr].fillna(numerical_discr_means)
x_test[numerical_discr] = x_test[numerical_discr].fillna(numerical_discr_means)

# Categorical gaps: treat "missing" as its own category.
x_train[categorical_discr] = x_train[categorical_discr].fillna('NotGiven')
x_test[categorical_discr] = x_test[categorical_discr].fillna('NotGiven')

In [None]:
# Ridge on the feature table in which GrLivArea is represented by quantile-bin indicators.
column_transformer_discr = ColumnTransformer([
         # handle_unknown='ignore': categories seen only in the test set encode as all zeros.
         ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_discr),
         ('num', StandardScaler(), numerical_discr)])
pipeline_discr = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer_discr),
    ('regression', Ridge())
])

model_discr = pipeline_discr.fit(x_train, y_train)
y_pred_discr = model_discr.predict(x_test)
# np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases no longer accept.
rmse_discr = np.sqrt(mean_squared_error(y_test, y_pred_discr))
print('RMSE: ', rmse_discr)

In [None]:
# Actual vs predicted SalePrice per GrLivArea quantile bin (test set).
#
# The continuous GrLivArea column no longer exists at this point, and a whole DataFrame
# cannot be used as the x coordinate of a scatter plot. The bin index is therefore
# recovered from the one-hot indicator columns GrLivArea0, GrLivArea1, ...
bin_prefix = 'GrLivArea'
bin_columns = sorted(
    (col for col in x_test.columns
     if col.startswith(bin_prefix) and col[len(bin_prefix):].isdigit()),
    key=lambda col: int(col[len(bin_prefix):]),
)
# Exactly one indicator is 1 per row, so argmax yields that row's bin number.
bin_index = x_test[bin_columns].to_numpy(dtype=float).argmax(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(bin_index, y_test, alpha=0.5, label='actual')
# NOTE(review): the predictions of the discretized model are plotted here, on the
# assumption that this cell is meant to illustrate that model - confirm.
ax.scatter(bin_index, y_pred_discr, alpha=0.5, label='predicted')
ax.set(title='SalePrice by GrLivArea quantile bin (test)',
       xlabel='GrLivArea bin index', ylabel='SalePrice')
ax.legend();