In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split ,KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.feature_selection import mutual_info_regression

from collections import Counter

import math 
from scipy import stats as ss
#import scipy.stats as ss


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


def convert(data, to):
    """
    Convert ``data`` to the container type named by ``to``.

    Supported conversions:
     - 'array':     NumPy ndarray / Pandas Series / list / Pandas DataFrame
     - 'list':      list / Pandas Series / NumPy ndarray
     - 'dataframe': Pandas DataFrame / NumPy ndarray

    :param data: list / NumPy ndarray / Pandas Series / Pandas DataFrame
        The object to convert
    :param to: string
        Target type: 'array', 'list' or 'dataframe'
    :return: NumPy ndarray / list / Pandas DataFrame
        ``data`` itself when it already has the requested type, otherwise the converted object
    :raises ValueError: if ``to`` is not one of the supported targets
    :raises TypeError: if ``data`` cannot be converted to the requested target
    """
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, (pd.Series, pd.DataFrame)):
            # DataFrame.as_matrix() was removed in pandas 1.0; .values works
            # for both Series and DataFrame.
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data),to))
    return converted
    
def conditional_entropy(x, y):
    """
    Conditional entropy of x given y, S(x|y), in nats.
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    :param x: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of measurements, paired element-wise with x
    :return: float
        Sum over the observed (x, y) pairs of p(x,y) * log(p(y) / p(x,y))
    """
    marginal_counts = Counter(y)
    joint_counts = Counter(list(zip(x, y)))
    n_samples = sum(marginal_counts.values())
    result = 0.0
    for (_, y_value), joint_count in joint_counts.items():
        p_joint = joint_count / n_samples
        p_marginal = marginal_counts[y_value] / n_samples
        result += p_joint * math.log(p_marginal / p_joint)
    return result

def cramers_v(x, y):
    """
    Cramer's V association between two categorical sequences.

    Computed from the chi-squared statistic of the x/y contingency table, with
    correction terms subtracted from phi^2 and from the row/column counts.
    :param x: Pandas Series (or other input accepted by pd.crosstab)
    :param y: Pandas Series (or other input accepted by pd.crosstab)
    :return: float
    """
    contingency = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(contingency)[0]
    n_obs = contingency.sum().sum()
    n_rows, n_cols = contingency.shape
    phi2 = chi2_stat / n_obs
    # Clamp at zero: the correction can push phi^2 below zero.
    phi2_corrected = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)
    denominator = min((cols_corrected - 1), (rows_corrected - 1))
    return np.sqrt(phi2_corrected / denominator)

def theils_u(x, y):
    """
    Theil's U of x given y: (S(x) - S(x|y)) / S(x).

    :param x: list / NumPy ndarray / Pandas Series
    :param y: list / NumPy ndarray / Pandas Series
    :return: 1 when the entropy of x is zero, otherwise the ratio above as a float
    """
    cond_entropy = conditional_entropy(x, y)
    value_counts = Counter(x)
    n_samples = sum(value_counts.values())
    probabilities = [count / n_samples for count in value_counts.values()]
    entropy_x = ss.entropy(probabilities)
    if entropy_x == 0:
        # x takes a single value, so the ratio is undefined; 1 is returned by convention.
        return 1
    return (entropy_x - cond_entropy) / entropy_x

def correlation_ratio(categories, measurements):
    """
    Association between a categorical and a continuous variable.

    Returns the ratio of the between-category sum of squares to the total sum
    of squares of ``measurements`` (no square root is taken).
    :param categories: NumPy ndarray / Pandas Series
        Category label of every observation
    :param measurements: list / NumPy ndarray / Pandas Series
        Numeric value of every observation, paired element-wise with categories
    :return: float
        0.0 when all category means coincide, otherwise numerator / denominator
    """
    # Work on a plain ndarray so the selections below are strictly positional.
    # Indexing a Series with integer positions is label-based and fails (or
    # silently picks other rows) when its index is not 0..n-1.
    measurements = np.asarray(measurements)
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(cat_num):
        cat_measures = measurements[fcat == i]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    # Size-weighted mean of the category means.
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        # Also avoids 0/0 when the measurements are constant.
        eta = 0.0
    else:
        eta = numerator / denominator
    return eta

def categoric_correlation(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                          return_results = False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in a data-set with both categorical and
    continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases
    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
        columns are categorical, or None (default) to state none are categorical
    :param mark_columns: Boolean (default: False)
        if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or
        continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U instead of Cramer's V
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Optional plotting arguments: 'figsize' (default (15, 10)), 'annot' (default True) and
        'fmt' (default '.2f')
    :return: Pandas DataFrame, or None when return_results is False
        A DataFrame of the correlation/strength-of-association between all features
    """

    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif isinstance(nominal_columns, str) and nominal_columns == 'all':
        # The isinstance guard keeps ndarray input from being compared element-wise with 'all'.
        nominal_columns = columns
    # A float frame (NaN-initialised) that is filled with .loc below: chained
    # assignment such as corr[a][b] = v writes to a temporary under pandas
    # Copy-on-Write, and an object-dtype result is not reliably plottable.
    corr = pd.DataFrame(index=columns, columns=columns, dtype=float)
    for i in range(len(columns)):
        col_i = columns[i]
        for j in range(i, len(columns)):
            col_j = columns[j]
            if i == j:
                corr.loc[col_i, col_j] = 1.0
                continue
            i_nominal = col_i in nominal_columns
            j_nominal = col_j in nominal_columns
            if i_nominal and j_nominal:
                if theil_u:
                    # Theil's U is asymmetric: cell (row r, column c) holds U(r | c).
                    corr.loc[col_i, col_j] = theils_u(dataset[col_i], dataset[col_j])
                    corr.loc[col_j, col_i] = theils_u(dataset[col_j], dataset[col_i])
                    continue
                cell = cramers_v(dataset[col_i], dataset[col_j])
            elif i_nominal:
                cell = correlation_ratio(dataset[col_i], dataset[col_j])
            elif j_nominal:
                cell = correlation_ratio(dataset[col_j], dataset[col_i])
            else:
                cell, _ = ss.pearsonr(dataset[col_i], dataset[col_j])
            # All remaining measures are symmetric.
            corr.loc[col_j, col_i] = cell
            corr.loc[col_i, col_j] = cell
    if mark_columns:
        marked_columns = ['{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col) for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=kwargs.get('figsize', (15, 10)))
        sns.heatmap(corr, annot=kwargs.get('annot',True), fmt=kwargs.get('fmt','.2f'), cmap='coolwarm')
        plt.show()
    if return_results:
        return corr
    
def bar_plot(variable):
    """
    Plot and print the value counts of one column of the global ``train`` frame.

    :param variable: string
        Name of the column in ``train`` to summarise
    """
    counts = train[variable].value_counts()

    # One bar per distinct value, labelled with the value itself.
    plt.figure(figsize=(9, 3))
    plt.bar(counts.index, counts)
    plt.xticks(counts.index, counts.index.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()
    print("{}: \n {}".format(variable, counts))
    


### READ DATASET

In [None]:
# Load the competition's train and test splits (paths are relative to the working directory).
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics of the training frame, one column per feature.
train.describe()

In [None]:
# Same summary transposed (one row per feature), which is easier to scan for a wide frame.
train.describe().T

# EDA & PREPROCESSING

### Check duplicates and missing values.

In [None]:
# Number of fully duplicated rows in the training frame.
train.duplicated().sum()

In [None]:
# Missing-value count per column, sorted ascending.
train.isnull().sum().sort_values()

In [None]:
# Same counts expressed as a percentage of the number of training rows.
(train.isnull().sum().sort_values()/train.shape[0])*100

In [None]:
# Box plot of the target (SalePrice).
plt.figure(figsize=(7,7))
sns.boxplot(data=train["SalePrice"])

### Drop columns with too many missing values

In [None]:
# Keep the identifiers aside before the Id column is removed from the features.
train_ID = train['Id']
test_ID = test['Id']

# The same columns are removed from both splits so their feature sets stay aligned.
dropped_columns = ['FireplaceQu','LotFrontage', 'PoolQC','MiscFeature','Alley','Fence', 'Id']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

# EDA

### Numeric Features

In [None]:
# Split the columns by dtype; the target (SalePrice) is kept out of the numeric feature list.
num_cols = [col for col in train.columns if train[col].dtypes != "object" and col != "SalePrice"]
cat_cols = [col for col in train.columns if train[col].dtypes == "object"]

# Numeric columns treated as discrete (hand-picked list).
discrete_numeric_columns = ['OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath',
                'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'MoSold', 'YrSold']

# Every other numeric column is treated as continuous.
continuous_numeric_columns = [col for col in num_cols if col not in discrete_numeric_columns]

train[continuous_numeric_columns]

### Determine best correlated features

In [None]:
# Absolute correlation of every (dummy-encoded) column with SalePrice, strongest first.
saleprice_corr = pd.get_dummies(train).corr()['SalePrice']
c = saleprice_corr.abs().sort_values(ascending=False)
c.head(10)

In [None]:
# One distribution panel per continuous feature. The grid height is derived
# from the number of features instead of being hard-coded to 6 rows.
n_plot_cols = 4
n_plot_rows = max(1, math.ceil(len(continuous_numeric_columns) / n_plot_cols))
fig = plt.figure(figsize=(20, 30))

for index, col in enumerate(continuous_numeric_columns):
    ax = plt.subplot(n_plot_rows, n_plot_cols, index + 1)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is its documented replacement.
    sns.histplot(train[col].dropna(), kde=True, stat="density", ax=ax)
fig.tight_layout(pad=2.0)

In [None]:
# Violin plots of the target and three numeric features, laid out on a 3x2 grid.
sns.set_style()
figure2 = plt.figure(2, figsize=(20, 20))
for position, column in enumerate(["SalePrice", "1stFlrSF", "TotalBsmtSF", "YearBuilt"], start=1):
    plt.subplot(3, 2, position)
    sns.violinplot(data=train, x=column, color="skyblue")
figure2.tight_layout(pad=3.0)
plt.show()

### Categoric Features

In [None]:
# Heat-map of the associations between the target and the garage features.
garage_cols = train[['SalePrice', 'GarageYrBlt', 'GarageCars', 'GarageArea']]
# Object-typed columns (if any) are passed as the nominal ones.
catcols = garage_cols.select_dtypes(include=['object']).columns.tolist()
# categoric_correlation does not work with null values, so they are replaced by 0 first.
garage_cols = garage_cols.fillna(0)
results = categoric_correlation(garage_cols, nominal_columns=catcols, return_results=True)

In [None]:
# bar_plot() opens and shows its own figure for every feature, so no enclosing
# figure is created here (it would stay empty and render as a blank output).
for cols in cat_cols:
    bar_plot(cols)

### Mean price deviations by category

In [None]:
# For each feature, plot how far the mean SalePrice of every category lies from the overall mean.
deviations = plt.figure(2, figsize=(20, 20))
moy = train['SalePrice'].mean()

for position, group_col in enumerate(['OverallQual', 'GarageCars', 'TotRmsAbvGrd', 'FullBath'], start=1):
    plt.subplot(3, 3, position)
    plt.title(group_col)
    category_deviation = train.groupby(group_col)["SalePrice"].mean() - moy
    category_deviation.plot(kind='bar', color='Purple', ls='dashed', edgecolor='Black')
    # Zero line = overall mean price.
    plt.axhline(y=0)

deviations.tight_layout(pad=2.0)
plt.show()

### Fill null values

In [None]:
# Percentage of missing values per column, largest first, shown as an interactive bar chart.
nan_count=100*train.isna().sum().sort_values(ascending=False)/train.shape[0]
fig=px.bar(x=nan_count.index,y=nan_count.values, labels={"y": "Nan ammount (%)","x": "Feature"})
fig.show()

In [None]:
#replacing train NaNs with the most frequent value of each column
nans = train.isna().sum()
nans = nans[nans > 0]
for c in nans.index:
    train[c] = train[c].fillna(train[c].mode()[0])

#replacing test NaNs with the modes learned on train, so that the imputation
#(like the scaling further down) is fitted on the training data only
nans = test.isna().sum()
nans = nans[nans > 0]
for c in nans.index:
    test[c] = test[c].fillna(train[c].mode()[0])

In [None]:
for feature in cat_cols:
    # Some string values are present in only one of the two datasets, so the
    # categories are taken from the union of both to keep the columns aligned.
    categories = np.unique(list(train[feature].unique()) + list(test[feature].unique()))
    train_dummies = {}
    test_dummies = {}
    for num, value in enumerate(categories):
        column_name = feature + "_" + str(num)
        train_dummies[column_name] = (train[feature] == value).astype("int")
        test_dummies[column_name] = (test[feature] == value).astype("int")
    # Attach all indicator columns of a feature with a single concat: inserting
    # them one at a time fragments the frame (pandas PerformanceWarning).
    # Column order is the same as with one-by-one insertion.
    train = pd.concat([train.drop(columns=feature), pd.DataFrame(train_dummies, index=train.index)], axis=1)
    test = pd.concat([test.drop(columns=feature), pd.DataFrame(test_dummies, index=test.index)], axis=1)

In [None]:
# Standardise the numeric feature columns: the scaler is fitted on train only
# and the same transformation is then applied to test.
scaler=StandardScaler()
train[num_cols]=scaler.fit_transform(train[num_cols])
test[num_cols]=scaler.transform(test[num_cols])

### Detect and Drop Outliers

In [None]:
# fit_predict labels every row: rows marked -1 are removed.
outliers = IsolationForest(n_estimators=30, random_state=1234).fit_predict(train)
# Filter with a positional boolean mask. Passing np.where() positions to
# DataFrame.drop treats them as index labels, which is only correct while the
# index is exactly 0..n-1.
train = train[outliers != -1]

In [None]:
x_train = train.drop(columns="SalePrice")
y_train = train['SalePrice']
n_features = x_train.shape[1]
# Full PCA first, only to inspect how the explained variance accumulates.
pca = PCA(n_components=n_features)
x_train = pca.fit_transform(x_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# n_comp will be the number of components that explains the 99% of the data variance.
# np.where yields the 0-based position of the first component whose cumulative
# variance exceeds 0.99; the number of components is that position + 1.
n_comp = int(np.where(cumulative_variance > 0.99)[0][0]) + 1
# x is the number of components kept (1-based), so the bar at x=n_comp is the first one above the 99% line.
component_counts = np.arange(1, n_features + 1)
fig = go.Figure()
fig.add_traces(go.Bar(x=component_counts, y=cumulative_variance, name="Cumulative Variance"))
fig.add_traces(go.Scatter(x=component_counts, y=[0.99] * n_features, name="Variance at 99%"))
fig.update_layout(title="How many components we need?", xaxis_title="Components", yaxis_title="Cumulative Variance", font=dict(
        family="Arial",
        size=18,
    ))
fig.show()
print("With n_components="+str(n_comp)+" we have the 99% of the data variance, so we will choose this value.")

In [None]:
# Refit PCA keeping only n_comp components and project the training features onto them.
pca = PCA(n_components=n_comp)
x_train=pca.fit_transform(train.drop(columns=["SalePrice"]))

# MODELING

In [None]:
# Project the test features with the PCA fitted on train.
# NOTE(review): this overwrites `test` with the projected data, so the cell is
# not meant to be re-run without rebuilding `test` first.
test=pca.transform(test)

In [None]:
predictions = []   # ensembled test predictions, one array per fold
train_p = []       # ensembled predictions on the full training set, one array per fold
scores = []        # (R2 of linear regression, R2 of gradient boosting) per fold
# SalePrice is a continuous regression target: stratifying on it would treat
# every distinct price as its own class, so plain shuffled KFold is used.
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
for fold, (idx_train, idx_valid) in enumerate(kf.split(x_train)):
    X_tr, y_tr = x_train[idx_train], y_train.iloc[idx_train]
    X_val, y_val = x_train[idx_valid], y_train.iloc[idx_valid]
    gbr = GradientBoostingRegressor(random_state=1234, subsample=0.6)
    lr = LinearRegression()
    lr.fit(X_tr, y_tr)
    gbr.fit(X_tr, y_tr)
    val_pred_lr = lr.predict(X_val)
    val_pred_gbr = gbr.predict(X_val)
    score_lr = r2_score(y_val, val_pred_lr)
    score_gbr = r2_score(y_val, val_pred_gbr)
    scores.append((score_lr, score_gbr))
    # The metric is R2 (r2_score), not ROC AUC.
    print(f"Fold: {fold + 1} R2 Score (lr, gbr) is : {(score_lr,score_gbr)}")
    #ensembling the methods: 40% linear regression, 60% gradient boosting
    test_preds = lr.predict(test)*0.4 + gbr.predict(test)*0.6
    predictions.append(test_preds)
    train_p.append(lr.predict(x_train)*0.4 + gbr.predict(x_train)*0.6)
mean_lr, mean_gbr = np.mean(scores, axis=0)
print(f" mean Validation R2 (lr, gbr) : {(mean_lr, mean_gbr)}")

In [None]:
# Mutual information between every input feature and the fold-averaged ensemble prediction.
features = train.drop(columns="SalePrice")
ensemble_train_pred = np.mean(train_p, axis=0)
# Label the scores with the columns of the matrix they were computed on:
# SalePrice is not the last column of train (the one-hot columns are appended
# after it), so train.columns[:-1] would attach the wrong names.
# random_state makes the estimate reproducible between runs.
mi = pd.Series(
    mutual_info_regression(features, ensemble_train_pred, random_state=1234),
    index=features.columns,
)
#ordering values to be plotted in descending order
mi = mi.sort_values(ascending=False)
fig = go.Figure()
fig.add_trace(go.Bar(name="Mutual Information (MI)", x=mi.index[:20], y=mi.values[:20]))
fig.update_layout(title="Mutual Information-How relevant are the features  for the ensembled method?")
fig.show()

### **This is my first notebook. If you find it useful, please give it an upvote.**