In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Call functions and load data
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the training and the test table of the competition in one statement.
train, test = (
    pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv'),
    pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv'),
)

In [None]:
# Integer-coded columns that are one-hot encoded instead of being used as numbers:
# they either take only a few distinct values or, like "MSSubClass", are codes.
numeric_but_string = [
    "MSSubClass", "YrSold", "BsmtFullBath", "BsmtHalfBath", "BedroomAbvGr",
    "FullBath", "HalfBath", "Fireplaces", "GarageCars", "KitchenAbvGr",
]

# int64/float64 columns that are kept numeric; the first column and the last two
# columns of `train` are left out of the scan.
numerical_cols = []
for col in train.columns[1:-2]:
    is_number = train.dtypes[col] == "int64" or train.dtypes[col] == "float64"
    if is_number and col not in numeric_but_string:
        numerical_cols.append(col)

# The regression target is the natural logarithm of the sale price.
target_col = "SalePrice_log"
train["SalePrice_log"] = np.log(train["SalePrice"])

# Working frames for the transformed features, seeded with the "Id" column only.
train_tf = train[["Id"]].copy()
test_tf = test[["Id"]].copy()
std_map = {}

In [None]:
# Build the model matrices `train_tf` / `test_tf`:
#   * int64/float64 columns that are not listed in `numeric_but_string` are copied;
#   * every other column is one-hot encoded;
#   * a binary "Pool_Exists" flag is derived from "PoolArea".
# The frames are assembled from scratch (pieces collected in lists and concatenated
# once), so the cell can be re-run without duplicating columns.
def _category_labels(series):
    """Return `series` prepared for one-hot encoding with stable label names.

    Numeric series are converted to pandas' nullable ``Int64`` dtype. If the same
    integer-coded column is stored as int64 in one frame and as float64 in the other
    (pandas uses float64 once a numeric column contains NaN), ``pd.get_dummies``
    would otherwise name the dummies ``X_2`` in one frame and ``X_2.0`` in the
    other, and the two would never line up. Missing values stay missing and
    therefore get no dummy column.

    Raises:
        TypeError: if a numeric series holds non-integer values (the conversion to
            ``Int64`` is only defined for integer-valued data).
    """
    if pd.api.types.is_numeric_dtype(series):
        return series.astype("Int64")
    return series


train_parts, test_parts = [], []
numerical_variables_revisited = []
for col in train.columns:
    if col != "Id":
        is_number = train.dtypes[col] == "int64" or train.dtypes[col] == "float64"
        if is_number and col not in numeric_but_string:
            # Numeric (or close to numeric) variable: copy it unchanged.
            train_parts.append(train[[col]])
            if train.dtypes[col] == "int64":
                print(col+" "+str(len(train[col].unique())))
            if col not in ["SalePrice_log", "SalePrice"]:
                # The target columns exist only in the training data.
                test_parts.append(test[[col]])
                numerical_variables_revisited.append(col)
        else:
            train_parts.append(
                pd.get_dummies(_category_labels(train[col]), prefix=col, prefix_sep='_'))
            test_parts.append(
                pd.get_dummies(_category_labels(test[col]), prefix=col, prefix_sep='_'))

    if col == "PoolArea":
        # Only record whether a pool exists at all.
        train_parts.append(pd.DataFrame(
            {"Pool_Exists": np.where(train["PoolArea"] > 0, 1, 0)}, index=train.index))
        test_parts.append(pd.DataFrame(
            {"Pool_Exists": np.where(test["PoolArea"] > 0, 1, 0)}, index=test.index))

# "Id" is never added, so the row order of the original frames is kept as is.
train_tf = pd.concat(train_parts, axis=1)
test_tf = pd.concat(test_parts, axis=1)

# Give both frames the same columns in the same order. Columns seen in only one
# frame are filled with 0 in the other. This includes the target columns, which
# are therefore present (as zeros) in `test_tf`; later cells read the target from
# the scaled training frame and exclude it from the regressors explicitly.
train_only_cols = [col for col in train_tf.columns if col not in test_tf.columns]
test_tf = test_tf.reindex(columns=list(test_tf.columns) + train_only_cols, fill_value=0)

train_target = train_tf[[target_col]].copy()
train_tf = train_tf.reindex(columns=test_tf.columns, fill_value=0)

In [None]:
# Standardise the features (StandardScaler: zero mean and unit variance per column).
# The scaler is fitted on the training frame only and then applied to both frames.
saved_cols = train_tf.columns
scaler = StandardScaler().fit(train_tf)
train_scaled = pd.DataFrame(scaler.transform(train_tf), columns=saved_cols)
test_scaled = pd.DataFrame(scaler.transform(test_tf), columns=saved_cols)

# The target gets its own scaler so that predictions can be mapped back later.
saved_cols2 = train_target.columns
target_scaler = StandardScaler()
target_scaled = pd.DataFrame(target_scaler.fit_transform(train_target), columns=saved_cols2)

In [None]:
# This cell used to repeat the scaling of the previous cell line for line, which
# only re-fitted identical scalers. The scaling is done once, in the previous cell;
# here we just verify the invariants that the following cells rely on.
assert list(train_scaled.columns) == list(test_scaled.columns), \
    "train and test feature columns differ"
assert len(train_scaled) == len(target_scaled), \
    "feature rows and target rows differ in number"
assert list(target_scaled.columns) == [target_col], \
    "scaled target has unexpected columns"

In [None]:
# Missing-value treatment and skew correction on train and test stacked together.
# `DataFrame.append` no longer exists in pandas 2.x; `pd.concat` stacks the rows in
# the same way (original row labels are kept, exactly as `append` did).
house_df_tf = pd.concat([train_scaled, test_scaled]).copy()

# Fill the gaps of the numeric variables from the 10 nearest rows, closer rows
# weighing more.
imputer = KNNImputer(n_neighbors=10, weights='distance')
house_df_tf.loc[:, numerical_variables_revisited] = imputer.fit_transform(
    house_df_tf.loc[:, numerical_variables_revisited])

# Log-transform strongly right-skewed variables (skewness above 3).
# NOTE(review): the inputs are standardised, so `value + 1` is not guaranteed to be
# positive; values <= -1 would turn into NaN/-inf here. Confirm this cannot happen
# for the affected columns.
for col in numerical_variables_revisited:
    initial_skew = house_df_tf[col].skew()
    if initial_skew > 3:
        print(col+" Init skew: "+str(round(initial_skew,2)))
        house_df_tf[col] = np.log(house_df_tf[col]+1)
        print("After skew: "+str(round(house_df_tf[col].skew(),2)))

# Split the stacked frame back: training rows come first, test rows last.
train_scaled = house_df_tf.head(len(train_scaled)).copy()
test_scaled = house_df_tf.tail(len(test_scaled)).copy()

In [None]:
# Correlation of every numeric variable with the target, before feature generation.
correlation_matrix = train_scaled[[target_col]+numerical_variables_revisited].corr()
# One row per variable ("index" column) with its correlation to the target,
# ordered from the highest to the lowest correlation.
with_targe_variable = (
    correlation_matrix[[target_col]]
    .reset_index()
    .sort_values(by=target_col, ascending=False)
)

sns.set(rc = {'figure.figsize':(18,8)})
sns.set_theme(style="whitegrid")
# (The seaborn "tips" example dataset used to be downloaded here; it was never
# used and required network access, so the call was removed.)
ax = sns.barplot(x="index", y=target_col,
                 data=with_targe_variable[with_targe_variable["index"]!=target_col])
ax.tick_params(axis="x", labelrotation=90)
plt.title("Correlation with the logarithm of the sales price", fontsize=20);

In [None]:
# Generate new numeric columns from pairwise products of the numeric variables.
def identity(x):
    """Return `x` unchanged (the "no transformation" entry of `function_forms`)."""
    return x


# Transformations applied to each product, keyed by the suffix used in the name
# of the generated column.
function_forms = {
    "sqrt": np.sqrt,
    "log": np.log1p,
    "id": identity,
}

numerical_variables_revisited_update = numerical_variables_revisited.copy()

# |correlation| of every base variable with the target. These values do not depend
# on the pair or on the transformation, so they are computed once up front.
target_values = train_scaled[target_col]
base_corr = {
    col: abs(np.corrcoef(target_values, train_scaled[col])[0, 1])
    for col in numerical_variables_revisited
}

# NOTE(review): both (v, w) and (w, v) are visited, so an accepted pair produces
# two columns with identical values under different names.
for v in numerical_variables_revisited:
    for w in numerical_variables_revisited:
        train_product = train_scaled[v] * train_scaled[w]
        test_product = test_scaled[v] * test_scaled[w]
        corr_1 = base_corr[w]
        corr_2 = base_corr[v]
        for key, transform in function_forms.items():
            train_feature = transform(train_product)
            # If the transformed product contains NaN (e.g. sqrt of a negative
            # product) the correlation is NaN, the comparison below is False and
            # the candidate is skipped.
            combined_v = abs(np.corrcoef(target_values, train_feature)[0, 1])
            if combined_v > max(corr_1, corr_2):
                print("New variable generate: ")
                print("Var1: "+w+ " Var2: "+v+" function: "+key)
                print("With correlation: "+str(round(combined_v,3)))
                print("Var 1 correlation: "+str(round(corr_1,3)))
                print("Var 2 correlation: "+str(round(corr_2,3)))
                new_col = w+"_"+v+"_"+key
                # Assign the filled series directly: `df[col].fillna(..., inplace=True)`
                # works on a temporary selection and is not guaranteed to write back.
                train_scaled[new_col] = train_feature.fillna(0)
                test_scaled[new_col] = transform(test_product).fillna(0)
                numerical_variables_revisited_update.append(new_col)

In [None]:
# Scatter plots of the target against the five variables at the top of the
# correlation ranking.
ranked_variables = with_targe_variable[with_targe_variable["index"] != target_col]
selected_vars = ranked_variables["index"].head(5).tolist()

for var in selected_vars:
    sns.scatterplot(data=train_scaled, x=target_col, y=var)
    plt.title(target_col+" and variable: "+var, fontsize=20)
    plt.show()

In [None]:
# Regressor matrix and target for training, regressor matrix for the test rows.
# The regressors are all columns except the two target columns. They are listed in
# frame order: taking them from a `set` gave a different column order on every
# kernel start, which made the fitted coefficients not exactly reproducible.
excluded_cols = {"SalePrice", "SalePrice_log"}
regressor_cols = [col for col in train_scaled.columns if col not in excluded_cols]

X_train, y_train = train_scaled[regressor_cols], target_scaled[[target_col]]
X_test = test_scaled[regressor_cols]

In [None]:
# Hyper-parameter search for the elastic net: exhaustive grid, scored by
# (negated) RMSE over 3 repetitions of 10-fold cross-validation on the training data.
model = ElasticNet()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search grid; the l1_ratio values stay close to 1, i.e. close to the lasso.
grid = {
    'alpha': [0.0001, 0.0002, 0.0004, 0.0006, 0.0008, 0.001, 0.003, 0.006, 0.009, 0.012],
    'l1_ratio': np.arange(0.79, 1.00, 0.01),
    'max_iter': [50000],
}

search = GridSearchCV(model, grid, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=10, verbose=1)

# Each observation is weighted by its unscaled target value. `inverse_transform`
# is documented for 2-D input, so the target is passed as a one-column frame and
# the result flattened to the 1-D vector that `sample_weight` requires.
fit_weights = target_scaler.inverse_transform(y_train[[target_col]]).ravel()
results = search.fit(X_train, y_train, sample_weight=fit_weights)

In [None]:
# Best cross-validated score of the search and the parameters that achieved it.
print(f"MRSE: {results.best_score_:.3f}")
print(f"Config: {results.best_params_}")

In [None]:
# Two estimators with the configuration selected by the grid search:
# `model` for the final fit, `model_rec` for the repeated validation runs.
# Both use the iteration limit the search was run with; `model_rec` previously
# fell back to the library default, so it was not validating the tuned setup.
best_params = results.best_params_
model = ElasticNet(alpha=best_params['alpha'], l1_ratio=best_params['l1_ratio'], max_iter=50000)
model_rec = ElasticNet(alpha=best_params['alpha'], l1_ratio=best_params['l1_ratio'], max_iter=50000)

In [None]:
# Check the selected settings on repeated random 67/33 splits of the training data:
# fit on one part, then score on that part (in sample) and on the rest (out of sample).
def _scaled_to_log_price(scaled_values):
    """Undo the target scaling and return a flat 1-D array.

    `scaled_values` may be a Series, a one-column frame or an array; it is reshaped
    to one column because `inverse_transform` is documented for 2-D input.
    """
    column = np.asarray(scaled_values, dtype=float).reshape(-1, 1)
    return target_scaler.inverse_transform(column).ravel()


def _r2(actual, predicted):
    """R2 of `predicted` against `actual` (1-D arrays), ignoring NaN entries."""
    residual_ss = np.nansum((actual - predicted) ** 2)
    total_ss = np.nansum((actual - np.nanmean(actual)) ** 2)
    return 1 - residual_ss / total_ss


n = 100
avg_insample_score = 0
avg_out_of_sample_score = 0
avg_insample_r2 = list()
avg_outofsample_r2 = list()
experiment_data = pd.concat([train_scaled[regressor_cols], target_scaled], axis=1).copy()
for i in range(0, n):
    # Seed each split with the iteration number: the n splits differ from one
    # another, but the whole experiment is reproducible.
    t1, t2 = train_test_split(experiment_data, test_size=0.33, random_state=i)
    X1, y1 = t1[regressor_cols], t1[target_col]
    X_t2, y_t2 = t2[regressor_cols], t2[target_col]
    model_rec.fit(X1, y1, sample_weight=_scaled_to_log_price(y1))

    # R2 on the scaled target, as reported by the estimator itself.
    avg_insample_score += (model_rec.score(X1, y1))/n
    avg_out_of_sample_score += (model_rec.score(X_t2, y_t2))/n

    # R2 after undoing the target scaling.
    y_in_pred = _scaled_to_log_price(model_rec.predict(X1))
    y_in_real = _scaled_to_log_price(y1)
    y_ou_pred = _scaled_to_log_price(model_rec.predict(X_t2))
    y_ou_real = _scaled_to_log_price(y_t2)

    avg_insample_r2.append(_r2(y_in_real, y_in_pred))
    avg_outofsample_r2.append(_r2(y_ou_real, y_ou_pred))

In [None]:
# Averages over all validation repetitions, one line per metric.
validation_summary = [
    ("Average in sample score: ", avg_insample_score),
    ("Average out of sample score: ", avg_out_of_sample_score),
    ("Average in sample r2 score: ", np.nanmean(avg_insample_r2)),
    ("Average out of sample r2 score: ", np.nanmean(avg_outofsample_r2)),
]
for label, value in validation_summary:
    print(label + str(value))

In [None]:
# Histogram of the out-of-sample R2 values of the validation repetitions.
r2_axes = plt.gca()
r2_axes.hist(avg_outofsample_r2, color="purple")
r2_axes.set_title("Out of sample R2 distribution", fontsize=20);

In [None]:
# Histogram of the in-sample R2 values of the validation repetitions.
r2_axes = plt.gca()
r2_axes.hist(avg_insample_r2, color="purple")
r2_axes.set_title("In sample R2 distribution", fontsize=20);

In [None]:
# Final fit on the complete training data, weighted by the unscaled target values.
# `inverse_transform` is documented for 2-D input, hence the one-column frame; the
# result is flattened because `sample_weight` must be 1-D.
final_weights = target_scaler.inverse_transform(y_train[[target_col]]).ravel()
model = model.fit(X_train, y_train, sample_weight=final_weights)

# Absolute coefficient per regressor, used as the importance of the feature.
feature_importance = pd.Series(index=X_train.columns,
                               data=np.abs(np.ravel(model.coef_)))

# Features with a non-zero coefficient are the ones the model kept.
n_selected_features = (feature_importance>0).sum()
print('{0:d} features, reduction of {1:2.2f}%'.format(
    n_selected_features,(1-n_selected_features/len(feature_importance))*100))

# The 30 largest absolute coefficients.
feature_importance.sort_values().tail(30).plot(kind = 'bar', figsize = (18,6), color="purple");
plt.title("Feature importance", fontsize=20);

In [None]:
# Generate the predictions and write two submission files.
def _to_sale_price(scaled_values):
    """Convert values on the scaled target scale back to sale prices.

    The target scaling is undone first (input reshaped to one column because
    `inverse_transform` is documented for 2-D input); unless the model was trained
    on "SalePrice" itself, the logarithm is then undone with `np.exp`.
    Returns a flat 1-D array.
    """
    column = np.asarray(scaled_values, dtype=float).reshape(-1, 1)
    restored = target_scaler.inverse_transform(column).ravel()
    if target_col == "SalePrice":
        return restored
    return np.exp(restored)


# Plain predictions for the test rows.
guess = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": _to_sale_price(model.predict(X_test))
})
guess.to_csv("submission_noscale.csv", index=False)

# Rescale the estimates: the predictions are multiplied by the average ratio of
# actual to predicted price on the training data (the original author motivates
# this with Jensen's inequality).
yhat_train = _to_sale_price(model.predict(X_train))
y_orig = _to_sale_price(target_scaled[target_col])

rescale = pd.DataFrame({"SalePrice": y_orig, "SalePrice_predict": yhat_train})
rescale["diff"] = rescale["SalePrice"]/rescale["SalePrice_predict"]
scaling_factor = np.mean(rescale["diff"])

guess2 = guess.copy()
guess2["SalePrice"] = guess2["SalePrice"]*scaling_factor
guess2.to_csv("submission.csv", index=False)

# `guess` holds the plain predictions, `guess2` the rescaled ones (the two
# previews used to be shown under each other's label).
print("No scaling: ")
display(guess.head())
print(" With Scaling: ")
display(guess2.head())
print("Scaling scalar")
print("Value: "+str(round(scaling_factor,3)))