# HOUSE PRICES: ADVANCE REGRESSION TECHNIQUES

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

## Importing test and train files
### To remove skewness in SalePrice I took log

In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df_y=np.log(df['SalePrice'])

## Dividing all features into either numerical or categorical feature group

In [3]:
x=df.dtypes
x=pd.DataFrame(data=x,index=None).reset_index()
categorical_features=x[x.iloc[:,1]=='object']['index']
numerical_feature=df.columns.drop(categorical_features).drop(['SalePrice'])

## Handling Missing values
   ### For categorical features, I assumed most of the values that are missing tells us that those features are not present in house. Therefore i replaced them with 'None'
   ### For numerical features, I replaced missing values with the mean of column

In [5]:
combine = [df,test]
for data in combine:
    data[categorical_features]=data[categorical_features].fillna('None')
    for column in numerical_feature:
        data[column]=data[column].fillna((data[column].mean()))

## Handling Outliers
### Using IQR Method

In [6]:
for col in numerical_feature:
    IQR=df.describe().iloc[4:7,:][col][2]-df.describe().iloc[4:7,:][col][0]
    Ub=df.describe().iloc[4:7,:][col][2]+IQR*1.5
    lb=df.describe().iloc[4:7,:][col][0]-IQR*1.5
    df[df[col]>Ub][col]=Ub
    df[df[col]<lb][col]=lb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Target Encoding
### I encoded categorical feature by replacing each unique value of a features, by corresponding average of SalesPrice 

In [7]:
dict={}
for column in categorical_features:
    dict.update(df.groupby(column)['SalePrice'].mean())
    df[column]=df[column].map(dict)
    test[column]=test[column].map(dict)

## Feature Selection

## First I divide all features in two list,
## all_column contains all those features which are not correlated with any other features,
## group contains all those groups of column that have correlation above 0.4

In [8]:
x=df.corr(method='spearman')

x=pd.DataFrame(x)
group=[]
unique=[]
corr=[0,0,0]

for i in range(81):
    unique.append(i)
    for j in unique:
        
        if abs(x.iloc[i,int(j)])>0.4 and abs(x.iloc[i,int(j)])<1:
            corr=np.vstack((corr,[x.index[j],x.index[i],x.iloc[i,j]]))
            corr=np.vstack((corr,[x.index[j],x.index[j],x.iloc[j,j]]))
            
            unique.pop()
            
all_column=[x.index[i] for i in unique]
corr=pd.DataFrame(corr).drop(0)

for x in corr.iloc[:,0].unique():
    group.append(corr[corr.iloc[:,0]==x][1].unique())
all_column=[col for col in all_column if col not in corr.iloc[:,0].unique()]

## Now from every group of features in group, I took that one feature which has most influence on SalePrice, and termed that as imp_col

In [9]:
imp_col=[]
for groups in group:
    
    mse=-1
    columns='none'
    for col in groups:
        x_train=df[col].fillna(0)
        clf = DecisionTreeRegressor(random_state=0)
        clf.fit(x_train.to_frame(),df_y)
        y_pred= clf.predict(x_train.to_frame())
        if mean_squared_error(df['SalePrice'], y_pred)> mse:
            columns=col
            mse=mean_squared_error(df['SalePrice'], y_pred)
    imp_col.append(columns)

## Now I added both all_column and imp_col

In [10]:
all_column=[col for col in all_column if col not in corr.iloc[:,0].unique()]
all_column=all_column + imp_col
all_column.append('SalePrice')
all_column.append('OverallQual')

## Here again I check for any correlation

In [11]:
corr=df[all_column].corr(method='spearman')['SalePrice'].sort_values()
final_col=corr.index.where(corr>0.3).dropna().drop('SalePrice')

final_col=final_col.drop('Exterior1st')


## Standard Scaling

In [12]:
X_train=df[final_col]
X_test=test[final_col]

sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)


X_test=pd.DataFrame(X_test,columns=final_col)
X_train=pd.DataFrame(X_train,columns=final_col)

X_test=X_test.fillna(0)


## Linear regression 

In [13]:
lr=LinearRegression()
lr=LinearRegression()
lr.fit(X_train,df_y)
y=lr.predict(X_test)

In [17]:
from sklearn.model_selection import cross_val_score
a=cross_val_score(lr,X_train,df_y,cv=10)
a.mean()

##  Tuning of XGBOOST

In [287]:
model = XGBRegressor()



  if getattr(data, 'base', None) is not None and \


In [366]:
params={
    "learning_rate"     : [0.05,0.1,0.15,0.2,0.3],
    "max_depth"         : [3,4,5,6,7,8,9,10],
    "min_child_weight"  : [2,2.25,2.5],
    "gamma"             : [0.05,0.1,0.15],
    "colsample_bytree"  : [0.8,0.9,0.95]
}

In [367]:
from sklearn.model_selection import RandomizedSearchCV
rc=RandomizedSearchCV(model,params,n_iter=5,n_jobs=-1,cv=5,verbose=3)
rc.fit(X_train,df_y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    4.7s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    5.0s finished




RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_st...
                                          seed=None, silent=None, subsample=1,
                                          verbosity=1),
                   iid='warn', n_iter=5, n_jobs=-1,
  

In [368]:
rc.best_params_

{'min_child_weight': 2.5,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 0.1,
 'colsample_bytree': 0.9}

In [369]:
rc.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, gamma=0.1,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=2.5, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [371]:
model=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, gamma=0.1,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=8, min_child_weight=2.5, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)
model.fit(X_train,df_y)
y=model.predict(X_test)



## Exporting submission file

In [282]:
submission = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": np.exp(y)
    })
submission.to_csv('submission5.csv', index=False)