In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder

import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp, get_dataset, info_plots
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the competition's training and test tables into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Preview the first five test rows.
test_df.head(n=5)

In [None]:
# Summary statistics of the SalePrice column in the training data.
train_df.SalePrice.describe()

In [None]:
# Distribution of SalePrice with a KDE overlay.
# `sns.distplot` is deprecated since seaborn 0.11 and removed in later releases;
# `histplot(..., kde=True, stat='density')` is its axes-level replacement and keeps
# the density-normalised y axis that distplot used.
sns.histplot(train_df['SalePrice'], kde=True, stat='density');

In [None]:
# Skewness and kurtosis of SalePrice, printed one per line.
sale_price = train_df['SalePrice']
print(sale_price.skew())
print(sale_price.kurt())

In [None]:
# Absolute pairwise correlations between the numeric training columns.
# The numeric columns are selected explicitly: on pandas >= 2.0, DataFrame.corr()
# no longer drops object columns silently and raises on the string-valued features.
corrmatrix = train_df.select_dtypes(include='number').corr().abs()

sns.heatmap(corrmatrix);

In [None]:
# Names of the columns whose absolute correlation with SalePrice is below 0.3.
weak_corr_mask = corrmatrix['SalePrice'] < 0.3
corrmatrix.loc[weak_corr_mask].index

In [None]:
# Columns that are dropped from the feature table before modelling.
# NOTE(review): this looks like the set of columns with |corr| < 0.3 against
# SalePrice, copied by hand from the correlation listing — confirm it is current.
corr_var_list = [
    'Id', 'MSSubClass', 'LotArea', 'OverallCond',
    'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF',
    'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Stack the training rows followed by the test rows so that cleaning and encoding
# are applied to both in one pass.
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it the
# test rows reuse index labels already taken by training rows, which makes
# label-based lookups and index-aligned assignments ambiguous.
data = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Dimensions of the combined frame, then a preview of its first five rows.
print(data.shape)
data.head(n=5)

In [None]:
# Per-column missing-value summary: absolute count ("Total") and share of rows in
# percent ("Percent"); display the first ten rows of the summary.
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
# Remove the listed columns from the combined frame.
# NOTE(review): presumably these are the columns with the highest share of missing
# values in the summary table — confirm before changing the list.
null_feature = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
data = data.drop(columns=null_feature)

In [None]:
# Recompute the missing-value summary (count and percentage per column) on the
# current state of `data` and show its first ten rows.
is_missing = data.isnull()
total = is_missing.sum().sort_values(ascending=False)
percent = (is_missing.sum() / is_missing.count() * 100).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(10)

In [None]:
# Column dtypes and non-null counts of the combined frame.
data.info()

In [None]:
# Categorical columns whose missing entries are replaced by the literal label 'None'.
cols_none = [
    'MSZoning', 'BsmtCond', 'Utilities', 'BsmtExposure', 'Exterior1st', 'Exterior2nd',
    'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'Electrical', 'GarageFinish',
    'GarageQual', 'GarageType', 'GarageCond', 'KitchenQual', 'SaleType', 'MasVnrType',
    'Functional',
]

# Fill all of them in one vectorised assignment instead of column by column.
data[cols_none] = data[cols_none].fillna('None')

In [None]:
# Fill missing LotFrontage values with the mean LotFrontage of the row's MSZoning group.
frontage_by_zone = data.groupby('MSZoning')['LotFrontage']
data['LotFrontage'] = frontage_by_zone.transform(lambda grp: grp.fillna(grp.mean()))

In [None]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
is_object_col = (data.dtypes == 'object').to_numpy()
cat_col = data.columns[is_object_col].to_list()
num_col = data.columns[~is_object_col].to_list()

In [None]:
# Replace each categorical column with the codes produced by OrdinalEncoder.
ordinal_encoder = OrdinalEncoder()
encoded_categories = ordinal_encoder.fit_transform(data[cat_col])
data[cat_col] = encoded_categories

In [None]:
# Fill the remaining gaps with SimpleImputer (default settings) and rebuild a
# DataFrame that carries the original column names.
# NOTE(review): the imputer is fit on every column of the combined frame, which
# includes the target column — confirm that is intended.
my_imputer = SimpleImputer()
imputed_data = pd.DataFrame(my_imputer.fit_transform(data), columns=data.columns)
imputed_data.head()

In [None]:
# Binary flag: 1 when the remodel year differs from the build year, otherwise 0.
was_remodelled = imputed_data['YearBuilt'] != imputed_data['YearRemodAdd']
imputed_data['Remodeling'] = np.where(was_remodelled, 1, 0)

In [None]:
# RemodGap: years between construction and the recorded remodel.
# HouseAge: years between construction and a fixed reference year.
# NOTE(review): the reference year 2021 is hard-coded; age at sale (YrSold - YearBuilt)
# may be the more meaningful feature — confirm the intent.
year_built = imputed_data['YearBuilt']
year_remodelled = imputed_data['YearRemodAdd']
imputed_data['RemodGap'] = year_remodelled - year_built
imputed_data['HouseAge'] = 2021 - year_built

In [None]:
# Total bathrooms: full baths count 1, half baths count 0.5 (basement and above grade).
imputed_data['TotBath'] = (
    imputed_data['BsmtFullBath']
    + .5 * imputed_data['BsmtHalfBath']
    + imputed_data['FullBath']
    + .5 * imputed_data['HalfBath']
)

# Combined basement, first-floor and second-floor square footage.
imputed_data['CarpetArea'] = (
    imputed_data['TotalBsmtSF']
    + imputed_data['1stFlrSF']
    + imputed_data['2ndFlrSF']
)

# Combined deck, porch and pool area.
imputed_data['OutSideArea'] = (
    imputed_data['WoodDeckSF']
    + imputed_data['OpenPorchSF']
    + imputed_data['EnclosedPorch']
    + imputed_data['3SsnPorch']
    + imputed_data['ScreenPorch']
    + imputed_data['PoolArea']
)

In [None]:
# Elbow plot: K-Means inertia for k = 1..5 on the three floor-area columns.
feature = ['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF']

X_cluster = pd.DataFrame(imputed_data[feature])

cluster_range = range(1, 6)
objective_function = []
for i in cluster_range:
    # A fixed seed makes the inertia curve reproducible across runs; without it the
    # k-means++ initialisation differs on every execution.
    clustering = KMeans(n_clusters=i, init='k-means++', random_state=0)
    clustering.fit(X_cluster)
    objective_function.append(clustering.inertia_)

plt.plot(cluster_range, objective_function)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
# The original `plt.show;` only referenced the function and never called it.
plt.show()

In [None]:
# Fit a seeded three-cluster K-Means on X_cluster and store each row's cluster id
# as a new feature.
kmeans = KMeans(n_clusters=3, random_state=0)
cluster_labels = kmeans.fit_predict(X_cluster)
imputed_data['Cluster'] = cluster_labels

In [None]:
# Categorical columns used for K-Modes clustering, followed by the number of
# missing values each still has in the imputed frame.
Cat_cluster_var = [
    'MSZoning',
    'Neighborhood',
    'Foundation',
    'SaleType',
    'SaleCondition',
]
imputed_data.loc[:, Cat_cluster_var].isnull().sum()

In [None]:
# Stand-alone frame holding only the categorical clustering columns.
# NOTE(review): this is taken from `data`, not `imputed_data` — confirm that is intended.
cat_data = data.loc[:, Cat_cluster_var].copy()

In [None]:
# Elbow plot for K-Modes: clustering cost for k = 1..9 on the categorical subset.
cluster_counts = range(1, 10)
housecat = []
for n_clusters in cluster_counts:
    kmodes = KModes(n_clusters=n_clusters, random_state=0)
    kmodes.fit(cat_data)
    housecat.append(kmodes.cost_)

plt.plot(cluster_counts, housecat)
plt.show()

In [None]:
# Fit a seeded four-cluster K-Modes on the categorical subset and store each row's
# cluster id as a new feature.
kmodes = KModes(n_clusters=4, random_state=0)
house_cluster_labels = kmodes.fit_predict(cat_data)
imputed_data['House_Cluster'] = house_cluster_labels

In [None]:
# Remove the columns named in corr_var_list, then preview the remaining features.
imputed_data = imputed_data.drop(columns=corr_var_list)
imputed_data.head()

In [None]:
# Split the combined frame back into its training and test rows.
# The boundary is taken from the training frame's length instead of a hard-coded
# 1460, so the split stays correct if the input changes. Both parts are explicit
# copies, so later edits neither raise SettingWithCopyWarning nor act on a
# temporary slice of `imputed_data`.
n_train_rows = len(train_df)
train = imputed_data.iloc[:n_train_rows, :].copy()
test = imputed_data.iloc[n_train_rows:, :].copy()
print(train.shape)
print(test.shape)
# The test rows' SalePrice is not a real target, so drop it from the test features.
test = test.drop(columns=['SalePrice'])
print(train.shape)
print(test.shape)

In [None]:
# Separate the predictors (X) from the target column (y).
X = train.drop(columns=['SalePrice'])
y = train['SalePrice']

In [None]:
# Seeded 75 % / 25 % split into training and validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Randomised hyper-parameter search for the random forest (100 sampled
# configurations, 5-fold CV, seeded).
n_estimators = [100, 200, 250, 300, 400, 500, 600]
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every
# sampled candidate using it fail. For regressors it meant "use all features",
# which is spelled 1.0 and is also accepted by older releases.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 120, num=12)]
max_depth.append(None)  # None lets the trees grow without a depth limit
min_samples_split = [2, 6, 10]
min_samples_leaf = [1, 3, 5]
bootstrap = [True, False]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

rf = RandomForestRegressor(random_state=1)

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, cv=5, n_iter=100,
                               random_state=1, n_jobs=-1)

rf_random.fit(X_train, y_train)

print(rf_random.best_params_)

In [None]:
# Independent copy of the test features, used as the input for the final predictions.
test_model_data = test.copy(deep=True)

In [None]:
# Random forest with fixed hyper-parameters, scored by MAE on the validation split.
# NOTE(review): these values are hard-coded, presumably copied from an earlier
# randomised-search run — confirm they still match `rf_random.best_params_`.
rf_params = dict(
    n_estimators=250,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=100,
    bootstrap=False,
    random_state=1,
)
test_model = RandomForestRegressor(**rf_params)

test_model.fit(X_train, y_train)
pred = test_model.predict(X_valid)

score = mean_absolute_error(y_valid, pred)
print(score)

In [None]:
# Randomised hyper-parameter search for XGBoost (15 sampled configurations,
# 5-fold CV, seeded, verbose progress output).
parameters = {
    'objective': ['reg:squarederror'],
    'booster': ['gbtree', 'gblinear'],
    'learning_rate': [0.1],
    'max_depth': [7, 10, 15, 20],
    'min_child_weight': [10, 15, 20, 25],
    'colsample_bytree': [0.8, 0.9, 1],
    'n_estimators': [50, 100, 200, 300, 400],
}

xgb = XGBRegressor(random_state=1)

xgb_random = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=parameters,
    cv=5,
    n_iter=15,
    random_state=1,
    verbose=5,
    n_jobs=-1,
)

xgb_random.fit(X_train, y_train)

print(xgb_random.best_params_)

In [None]:
# XGBoost regressor with fixed hyper-parameters, scored by MAE on the validation split.
# NOTE(review): values are hard-coded, presumably taken from an earlier search run — confirm.
xgb_params = dict(
    n_estimators=100,
    min_child_weight=20,
    max_depth=10,
    learning_rate=.1,
    colsample_bytree=.8,
    booster='gbtree',
)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
pred = xgb_model.predict(X_valid)

score = mean_absolute_error(y_valid, pred)
print(score)

In [None]:
# Permutation importance of the already-fitted random forest.
# It is computed on the held-out validation split rather than the training rows:
# scoring on the data the forest was fit on measures what it memorised, whereas
# held-out data shows which features matter for generalisation.
perm = PermutationImportance(test_model, random_state=1).fit(X_valid, y_valid)
eli5.show_weights(perm, feature_names=X_valid.columns.tolist())

In [None]:
# Partial-dependence plot of the random forest's prediction on CarpetArea,
# evaluated on the validation split.
pdp_carpetarea = pdp.pdp_isolate(
    model=test_model,
    dataset=X_valid,
    model_features=list(X_valid.columns),
    feature="CarpetArea",
)

pdp.pdp_plot(pdp_carpetarea, "CarpetArea")
plt.show()

In [None]:
# Partial-dependence plot of the random forest's prediction on GrLivArea,
# evaluated on the validation split.
pdp_GrLivArea = pdp.pdp_isolate(
    model=test_model,
    dataset=X_valid,
    model_features=list(X_valid.columns),
    feature="GrLivArea",
)

pdp.pdp_plot(pdp_GrLivArea, "GrLivArea")
plt.show()

In [None]:
# Partial-dependence plot of the random forest's prediction on TotRmsAbvGrd,
# evaluated on the validation split.
pdp_TotRmsAbvGrd = pdp.pdp_isolate(
    model=test_model,
    dataset=X_valid,
    model_features=list(X_valid.columns),
    feature="TotRmsAbvGrd",
)

pdp.pdp_plot(pdp_TotRmsAbvGrd, "TotRmsAbvGrd")
plt.show()

In [None]:
# Partial-dependence plot of the random forest's prediction on OverallQual,
# evaluated on the validation split.
pdp_OverallQual = pdp.pdp_isolate(model=test_model, dataset=X_valid, model_features=X_valid.columns.tolist(), feature="OverallQual")

pdp.pdp_plot(pdp_OverallQual, "OverallQual")
# The original `plt.show` lacked the call parentheses, so the cell displayed the
# function's repr instead of explicitly rendering the figure.
plt.show()

In [None]:
# Predict SalePrice for the test features with the fitted XGBoost model.
pred_test=xgb_model.predict(test_model_data)

In [None]:
# Pair each test Id with its predicted SalePrice and preview the result.
submission = dict(Id=test_df['Id'].values, SalePrice=pred_test)

solution = pd.DataFrame(data=submission)
solution.head()

In [None]:
# Write the submission to solution.csv in the working directory, omitting the DataFrame index.
solution.to_csv('solution.csv', index=False)