In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
# Show every column when a wide frame is displayed (None removes the column cap).
# Uses the public `pd.set_option` entry point rather than the `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [None]:
## Loading the dataset
# Single source of truth for the input directory; both CSV files are read from it.
DATA_PATH = '../input/house-prices-advanced-regression-techniques/'
df_train = pd.read_csv(DATA_PATH + 'train.csv')
df_test = pd.read_csv(DATA_PATH + 'test.csv')

# Exploratory Data Analysis

In [None]:
# Stack test and train into one frame so preprocessing is applied to both alike.
# The `ind` column records each row's origin so the two parts can be separated again.
# ignore_index=True gives the combined frame unique row labels instead of repeating
# the labels of both inputs.
df = pd.concat(
    [df_test.assign(ind="test"), df_train.assign(ind="train")],
    ignore_index=True,
)

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Column labels of the combined frame.
df.columns

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Columns that contain at least one missing value, in frame order.
null_counts = df.isnull().sum()
nan_cols = null_counts[null_counts >= 1].index.tolist()
print(nan_cols)

In [None]:
# Lift the row cap so the whole per-column table is displayed.
pd.set_option('display.max_rows', None)

# Percentage of missing entries per column, relative to the number of rows.
n_rows = len(df)
percent_missing = df.isnull().sum() * 100 / n_rows
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
# Most incomplete columns first.
missing_value_df.sort_values(by=['percent_missing'], ascending=False)

In [None]:
# Keep only the columns whose fraction of missing values is below 90%.
missing_fraction = df.isnull().mean()
df = df.loc[:, missing_fraction < .9]
df.shape

In [None]:
# Names of the object-dtype columns of the combined frame.
object_columns = df.select_dtypes(include=['object']).columns
df_train_categorical = list(object_columns)
print(len(df_train_categorical))

In [None]:
# Names of every column that is not object-dtype.
df_train_numeric = list(df.select_dtypes(exclude=['object']).columns)

In [None]:
# Year-type columns are recognised by 'Yr' or 'Year' appearing in the column name.
df_year_feature = [
    column
    for column in df_train_numeric
    if any(token in column for token in ('Yr', 'Year'))
]
print(df_year_feature)

In [None]:
# Scatter SalePrice against the number of years between each year-type feature
# and the year of sale.
for feature in df_year_feature:
    if feature == 'YrSold':
        continue  # YrSold is the reference year itself
    # Computed on the fly, so the working frame is neither deep-copied on every
    # iteration nor modified.
    years_before_sale = df['YrSold'] - df[feature]
    plt.scatter(years_before_sale, df['SalePrice'])
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()

In [None]:
# Number of fully duplicated rows in the training data.
df_train.duplicated().sum()

In [None]:
# Numeric columns with fewer than 25 distinct values are treated as discrete.
# Year features and the row identifier are excluded; the identifier column is
# spelled 'Id' (as used elsewhere in this notebook), not 'ID'.
discrete_df = [
    feature
    for feature in df_train_numeric
    if len(df[feature].unique()) < 25 and feature not in df_year_feature + ['Id']
]

In [None]:
# Median SalePrice for every level of each discrete feature.
N = 25
bar_colors = plt.cm.Paired(np.arange(N))  # loop-invariant palette, built once
for feature in discrete_df:
    # Group the working frame directly; a deep copy per feature is unnecessary
    # because nothing is modified here.
    df.groupby(feature)['SalePrice'].median().plot.bar(color=bar_colors)
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

In [None]:
# Remaining numeric columns: not discrete, not a year feature, not the identifier
# and not the target.
non_continuous = set(discrete_df) | set(df_year_feature) | {'Id', 'SalePrice'}
continous_df = [column for column in df_train_numeric if column not in non_continuous]

In [None]:
# Distribution of each continuous feature in the training data.
for feature in continous_df:
    # Histogram the feature's own values so that the x-axis really is the feature.
    # The earlier version histogrammed per-value median SalePrice while labelling
    # the x-axis with the feature name.
    df_train[feature].plot.hist()
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.title(feature)
    plt.show()

In [None]:
# Median SalePrice for every level of each categorical feature.
N = 25
bar_colors = plt.cm.Paired(np.arange(N))  # loop-invariant palette, built once
for feature in df_train_categorical:
    # Group the working frame directly; a deep copy per feature is unnecessary
    # because nothing is modified here.
    df.groupby(feature)['SalePrice'].median().plot.bar(color=bar_colors)
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

# Feature Engineering
#### In this section we will perform the following tasks:
#### 1. Handle missing values
#### 2. Handle categorical variables
#### 3. Handle imbalanced or skewed data
#### 4. Handle outliers


In [None]:
# Non-object columns with more than one missing value.
# NOTE(review): the threshold is `> 1`, so columns with exactly one missing value are
# not listed here; a later cell fills the remaining gaps on the test split.
numerical_with_nan = [
    column
    for column in df.columns
    if df[column].dtypes != 'O' and df[column].isnull().sum() > 1
]
# The target must not be imputed.
numerical_with_nan.remove('SalePrice')
numerical_with_nan

In [None]:
for feature in numerical_with_nan:
    ## We will replace by using median since there are outliers
    median_value = df[feature].median()
    # Assign the filled column back. `df[feature].fillna(..., inplace=True)` acts on
    # an intermediate Series and does not update `df` under pandas Copy-on-Write.
    df[feature] = df[feature].fillna(median_value)

# Every listed column should now report zero missing values.
df[numerical_with_nan].isnull().sum()

# 2. Handling Temporal Variables

In [None]:
# Replace each year column by the number of years between that year and the sale.
# Running this cell twice would apply the subtraction twice.
year_sold = df['YrSold']
for year_column in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    df[year_column] = year_sold - df[year_column]

In [None]:
# Check the three columns after they were replaced by (YrSold - original value).
df[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

In [None]:
# Box plots of the log-transformed continuous features.
for feature in continous_df:
    # Features containing a zero are skipped (the logarithm of 0 is not finite).
    if 0 in df[feature].unique():
        continue
    # Transform only this column; the working frame is neither copied in full
    # nor modified.
    np.log(df[[feature]]).boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

In [None]:
import numpy as np

# Columns that are replaced by their natural logarithm.
num_features=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']

# Compute all transformed columns first, then write them back one by one.
# Running this cell twice would apply the logarithm twice.
log_columns = {column: np.log(df[column]) for column in num_features}
for column, log_values in log_columns.items():
    df[column] = log_values

In [None]:
# Density of each log-transformed feature.
# The working frame `df` holds the transformed values; `df_train` still holds the
# raw ones, so plotting it would not show the effect of the transform.
kde_source = df.reset_index(drop=True)  # unique row labels for the plotting library
for feature in num_features:
    sns.kdeplot(data=kde_source, x=feature, legend=True)
    plt.show()

In [None]:
# Columns for which outlier limits are computed (four features plus the target).
num_feature=['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

def iqr_feature(features, df):
    """Compute 1.5 * IQR outlier limits for each requested column.

    Parameters
    ----------
    features : iterable of str
        Column names of ``df`` to compute limits for.
    df : pandas.DataFrame
        Frame that holds those columns.

    Returns
    -------
    tuple of (list, list)
        ``(max_lmt, min_lmt)``. ``max_lmt[i]`` is ``Q3 + 1.5 * IQR`` and
        ``min_lmt[i]`` is ``Q1 - 1.5 * IQR`` for ``features[i]``, where Q1 and Q3
        are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.
    """
    max_lmt = []
    min_lmt = []
    for feature in features:
        q1 = df[feature].quantile(0.25)
        q3 = df[feature].quantile(0.75)
        iqr = q3 - q1
        max_lmt.append(q3 + (1.5 * iqr))
        min_lmt.append(q1 - (1.5 * iqr))
    return max_lmt, min_lmt
    
# Upper and lower outlier limits, in the same order as `num_feature`.
max_lmt, min_lmt= iqr_feature(num_feature, df)
min_lmt

In [None]:
# Tabulate the limits next to the feature they belong to.
s1, s2 = pd.Series(max_lmt), pd.Series(min_lmt)

df_min_max = pd.DataFrame(
    {
        'num_features': num_feature,
        'max_lmt': s1,
        'min_lmt': s2,
    }
)
df_min_max
    

In [None]:
# (rows, columns) before the outlier handling below.
df.shape

In [None]:
# Cap LotFrontage outliers: values below the lower limit are replaced by the 5th
# percentile, values above the upper limit by the 95th percentile.
# The limits are read from the lists returned by iqr_feature() instead of being
# pasted in as literals, so they always match the data they were computed from.
lot_frontage_pos = num_feature.index("LotFrontage")
lower_fence, upper_fence = min_lmt[lot_frontage_pos], max_lmt[lot_frontage_pos]
df["LotFrontage"] = np.where(df["LotFrontage"] < lower_fence, df["LotFrontage"].quantile(0.05), df["LotFrontage"])
df["LotFrontage"] = np.where(df["LotFrontage"] > upper_fence, df["LotFrontage"].quantile(0.95), df["LotFrontage"])
df.shape

In [None]:
# Box plot of LotFrontage in the working frame.
lot_frontage = df["LotFrontage"]
sns.boxplot(y=lot_frontage)
plt.show()

In [None]:
# Cap LotArea outliers: values below the lower limit are replaced by the 5th
# percentile, values above the upper limit by the 95th percentile.
# The limits are read from the lists returned by iqr_feature() instead of being
# pasted in as literals, so they always match the data they were computed from.
lot_area_pos = num_feature.index("LotArea")
lower_fence, upper_fence = min_lmt[lot_area_pos], max_lmt[lot_area_pos]
df["LotArea"] = np.where(df["LotArea"] < lower_fence, df["LotArea"].quantile(0.05), df["LotArea"])
df["LotArea"] = np.where(df["LotArea"] > upper_fence, df["LotArea"].quantile(0.95), df["LotArea"])
sns.boxplot(y=df["LotArea"])
plt.show()

In [None]:
# Cap 1stFlrSF outliers: values below the lower limit are replaced by the 5th
# percentile, values above the upper limit by the 95th percentile.
# The limits are read from the lists returned by iqr_feature() instead of being
# pasted in as literals, so they always match the data they were computed from.
first_floor_pos = num_feature.index("1stFlrSF")
lower_fence, upper_fence = min_lmt[first_floor_pos], max_lmt[first_floor_pos]
df["1stFlrSF"] = np.where(df["1stFlrSF"] < lower_fence, df["1stFlrSF"].quantile(0.05), df["1stFlrSF"])
df["1stFlrSF"] = np.where(df["1stFlrSF"] > upper_fence, df["1stFlrSF"].quantile(0.95), df["1stFlrSF"])
df.shape

In [None]:
# Cap GrLivArea outliers: values below the lower limit are replaced by the 5th
# percentile, values above the upper limit by the 95th percentile.
# The limits are read from the lists returned by iqr_feature() instead of being
# pasted in as literals, so they always match the data they were computed from.
gr_liv_area_pos = num_feature.index("GrLivArea")
lower_fence, upper_fence = min_lmt[gr_liv_area_pos], max_lmt[gr_liv_area_pos]
df["GrLivArea"] = np.where(df["GrLivArea"] < lower_fence, df["GrLivArea"].quantile(0.05), df["GrLivArea"])
df["GrLivArea"] = np.where(df["GrLivArea"] > upper_fence, df["GrLivArea"].quantile(0.95), df["GrLivArea"])
df.shape

In [None]:
# Cap SalePrice outliers: values below the lower limit are replaced by the 5th
# percentile, values above the upper limit by the 95th percentile.
# The limits are read from the lists returned by iqr_feature() instead of being
# pasted in as literals, so they always match the data they were computed from.
# Missing SalePrice values fail both comparisons and therefore stay missing.
sale_price_pos = num_feature.index("SalePrice")
lower_fence, upper_fence = min_lmt[sale_price_pos], max_lmt[sale_price_pos]
df["SalePrice"] = np.where(df["SalePrice"] < lower_fence, df["SalePrice"].quantile(0.05), df["SalePrice"])
df["SalePrice"] = np.where(df["SalePrice"] > upper_fence, df["SalePrice"].quantile(0.95), df["SalePrice"])
df.shape

In [None]:
# Last rows of the target column after capping.
df['SalePrice'].tail()

# Handling Rare Categorical Feature

We will group the levels of each categorical variable that occur in less than 1% of the observations into a single `Rare_var` label.

In [None]:
# Object-dtype columns, without the train/test marker column.
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
categorical_features.remove('ind')

In [None]:
# Frequency of every level, one table per categorical feature.
for column in categorical_features:
    level_counts = df[column].value_counts()
    print(level_counts)

In [None]:
for feature in categorical_features:
    # Share of all rows taken by each level. Counting the levels themselves keeps
    # numerator and denominator on the same population; counting non-null SalePrice
    # values per level would skip every row without a SalePrice while still dividing
    # by the full frame length.
    level_share = df[feature].value_counts() / len(df)
    frequent_levels = level_share[level_share > 0.01].index
    # Levels at or below 1%, and missing values (never part of the index above),
    # are replaced by 'Rare_var'.
    df[feature] = np.where(df[feature].isin(frequent_levels), df[feature], 'Rare_var')
    

In [None]:
# from sklearn.preprocessing import OrdinalEncoder

# Target Guided Ordinal Encoding
1. Ordering the labels according to the target
2. Replace each label with its rank (0, 1, 2, …) in that ordering

In [None]:
# Encode every categorical level by its rank when levels are sorted by mean SalePrice.
for feature in categorical_features:
    mean_price_by_level = df.groupby([feature])['SalePrice'].mean()
    levels_by_price = mean_price_by_level.sort_values().index
    rank_of_level = {level: rank for rank, level in enumerate(levels_by_price, 0)}
    df[feature] = df[feature].map(rank_of_level)

In [None]:
# Preview the frame after the categorical columns were mapped to integer ranks.
df.head()

In [None]:
# Separate the combined frame again using the origin marker.
# .copy() makes both parts independent frames, so the column drops and fills applied
# to them afterwards do not operate on slices of `df`.
test = df[df["ind"].eq("test")].copy()
train = df[df["ind"].eq("train")].copy()

In [None]:
# The test part has no target. Rebind the result instead of dropping in place on
# a frame that was obtained by filtering `df`.
test = test.drop(columns='SalePrice')

In [None]:
# The origin marker is no longer needed. Rebind the result instead of dropping
# in place on a frame that was obtained by filtering `df`.
train = train.drop(columns='ind')

In [None]:
# Test columns that still contain missing values; GarageCars is taken out of the
# list because it is filled separately.
test_null_counts = test.isnull().sum()
nan_feature = test_null_counts[test_null_counts >= 1].index.tolist()
nan_feature.remove('GarageCars')

In [None]:
for feature in nan_feature:
    ## We will replace by using median since there are outliers
    # The median is taken from the combined frame `df`.
    median_value = df[feature].median()
    # Assign the filled column back. `test[feature].fillna(..., inplace=True)` acts
    # on an intermediate Series and does not update `test` under pandas Copy-on-Write.
    test[feature] = test[feature].fillna(median_value)

# Every listed column should now report zero missing values.
test[nan_feature].isnull().sum()

In [None]:
# Most frequent GarageCars value(s) in the test part.
md = test['GarageCars'].mode()
print(md)

In [None]:
# Fill missing GarageCars with the column's most frequent value, computed here
# rather than hard-coded, and assign the result back (a chained
# `fillna(..., inplace=True)` does not update `test` under pandas Copy-on-Write).
garage_cars_mode = test['GarageCars'].mode().iloc[0]
test['GarageCars'] = test['GarageCars'].fillna(garage_cars_mode)

In [None]:
# Remaining missing values per column in the test part.
test.isnull().sum()

In [None]:
# The origin marker is no longer needed. Rebind the result instead of dropping
# in place on a frame that was obtained by filtering `df`.
test = test.drop(columns='ind')

In [None]:
# Target and predictors. `Id` is only a row identifier, so it is left out of the
# predictors together with the target itself.
y_data = train['SalePrice']
x_data = train.drop(columns=['SalePrice', 'Id'])

In [None]:
# Compare the shapes of the predictor frame and the test part.
x_data.shape, test.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the training rows for evaluation. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=35
)
x_train.isnull().sum()

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectPercentile

In [None]:
from sklearn.feature_selection import mutual_info_regression
# determine the mutual information
# The estimator is randomised; a fixed random_state keeps the scores stable across runs.
mutual_info = mutual_info_regression(x_train, y_train, random_state=35)
mutual_info

In [None]:
# Label each score with its feature name and list the highest scores first.
mutual_info = pd.Series(mutual_info, index=x_train.columns)
mutual_info.sort_values(ascending=False)

In [None]:
## Keep the 80% of features with the highest mutual information scores
from functools import partial

# The scoring function is randomised; binding random_state keeps the selected
# feature set stable across runs.
seeded_mutual_info = partial(mutual_info_regression, random_state=35)
selected_top_columns = SelectPercentile(seeded_mutual_info, percentile=80)
selected_top_columns.fit(x_train, y_train)

In [None]:
# Names of the features retained by the selector.
support_mask = selected_top_columns.get_support()
X_train_top_80 = x_train.columns[support_mask].tolist()
len(X_train_top_80)

In [None]:
# Restrict the training, hold-out and submission frames to the selected features.
X_train_feature = x_train.loc[:, x_train.columns.isin(X_train_top_80)]
X_test_feature = x_test.loc[:, x_test.columns.isin(X_train_top_80)]
df_test_feature = test.loc[:, test.columns.isin(X_train_top_80)]

In [None]:
# df_test_scaled = df_test_scaled.drop(df_test_scaled.index[-1])

# Model Training

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline, evaluated on the hold-out split.
model = LinearRegression().fit(X_train_feature, y_train)
predictions = model.predict(X_test_feature)

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score, r2_score

In [None]:
# Hold-out metrics for the linear regression baseline.
linear_mse = metrics.mean_squared_error(y_test, predictions)
print('Mean Absolute Error(MAE):', metrics.mean_absolute_error(y_test, predictions))
print('Mean Squared Error(MSE):', linear_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(linear_mse))
print('Explained Variance Score (EVS):',explained_variance_score(y_test,predictions))
print('R2:',metrics.r2_score(y_test, predictions))

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with default settings, evaluated on the hold-out split.
model_ridge = Ridge().fit(X_train_feature, y_train)
ridge_predictions = model_ridge.predict(X_test_feature)

In [None]:
# Hold-out metrics for the ridge model. Each metric is computed once and reused.
ridge_mse = metrics.mean_squared_error(y_test, ridge_predictions)
r2 = r2_score(y_test, ridge_predictions)
# Built-in round() is used so the code does not depend on the score object having
# a `.round` method (plain Python floats do not have one).
r2_rounded = round(r2, 2)
print('Mean Absolute Error(MAE):', metrics.mean_absolute_error(y_test, ridge_predictions))
print('Mean Squared Error(MSE):', ridge_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(ridge_mse))
print('Explained Variance Score (EVS):',explained_variance_score(y_test,ridge_predictions))
print('R2:', r2)
print('R2 rounded:', r2_rounded)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# A fixed random_state makes the fitted tree, and therefore its scores,
# reproducible across runs.
model_DecisionTree = DecisionTreeRegressor(random_state=35)
model_DecisionTree.fit(X_train_feature , y_train)
DecisionTree_predictions = model_DecisionTree.predict(X_test_feature)

In [None]:
# Hold-out metrics for the decision tree. Each metric is computed once and reused.
tree_mse = metrics.mean_squared_error(y_test, DecisionTree_predictions)
r2 = r2_score(y_test, DecisionTree_predictions)
# Built-in round() is used so the code does not depend on the score object having
# a `.round` method (plain Python floats do not have one).
r2_rounded = round(r2, 2)
print('Mean Absolute Error(MAE):', metrics.mean_absolute_error(y_test, DecisionTree_predictions))
print('Mean Squared Error(MSE):', tree_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(tree_mse))
print('Explained Variance Score (EVS):',explained_variance_score(y_test,DecisionTree_predictions))
print('R2:', r2)
print('R2 rounded:', r2_rounded)

In [None]:
# Search space for the random forest hyper-parameter search.
n_estimators = [5,20,50,100] # number of trees in the random forest
# Number of features considered at every split. 1.0 means "all features", which is
# what 'auto' meant for regressors; current scikit-learn rejects the string 'auto'.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Seed the forest as well as the search: with only the search seeded, the sampled
# parameter sets repeat across runs but the fitted forests do not.
rf = RandomForestRegressor(random_state=35)
# 100 sampled parameter sets, each evaluated with 5-fold cross-validation.
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
               n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

In [None]:
# Run the search (fit() returns the search object itself), then score the hold-out split.
fitted_search = rf_random.fit(X_train_feature, y_train)
RandomForest_predictions = fitted_search.predict(X_test_feature)

In [None]:
# Hold-out metrics for the tuned random forest. Each metric is computed once and reused.
forest_mse = metrics.mean_squared_error(y_test, RandomForest_predictions)
r2 = r2_score(y_test, RandomForest_predictions)
# Built-in round() is used so the code does not depend on the score object having
# a `.round` method (plain Python floats do not have one).
r2_rounded = round(r2, 2)
print('Mean Absolute Error(MAE):', metrics.mean_absolute_error(y_test, RandomForest_predictions))
print('Mean Squared Error(MSE):', forest_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(forest_mse))
print('Explained Variance Score (EVS):',explained_variance_score(y_test,RandomForest_predictions))
print('R2:', r2)
print('R2 rounded:', r2_rounded)

In [None]:
# Predict the submission rows with the fitted search object.
# Note: this rebinds `predictions`, which earlier held the linear regression output.
predictions = rf_random.predict(df_test_feature)

In [None]:
# Pair every test identifier with its predicted sale price.
submission = pd.DataFrame(
    {
        'Id': test['Id'],
        'SalePrice': predictions,
    }
)

In [None]:
# Write the submission file as comma-separated values without the row index.
submission.to_csv('submission.csv', sep=',', index=False)

In [None]:
# Display the submission frame.
submission