In [None]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
# Load the training and test CSVs from the ../input directory (paths are
# relative to the notebook's working directory).
df_train= pd.read_csv('../input/train.csv')
df_test= pd.read_csv('../input/test.csv')
# Uncomment to list the columns of each frame:
#print(df_train.columns)
#print(df_test.columns)

In [None]:
# Correlation heatmap of the raw training frame.
# Only numeric/bool columns are handed to corr(): any object (string) columns
# still present at this stage make recent pandas versions raise, whereas older
# versions silently skipped them. Selecting the dtypes explicitly gives the
# same picture on both.
plt.figure(figsize=(26,8))
sns.heatmap(df_train.select_dtypes(include=['number', 'bool']).corr(), annot=True)
plt.show()

In [None]:
# Per-column summary of missing values in the training frame:
#   Total   - number of null cells in the column
#   Percent - null cells divided by the number of rows
# Both series are ordered from most to least missing before being combined.
null_mask = df_train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(25)

In [None]:
# Drop training columns that have more than 80 missing values, using the
# summary held in `missing_data`.
# The axis is given through the `columns=` keyword: passing it positionally
# (`drop(labels, 1)`) is rejected by recent pandas versions.
df_train = df_train.drop(columns=missing_data[missing_data['Total'] > 80].index)

# Build the same missing-value summary for the test frame. `missing_data` is
# deliberately re-bound here, so after this cell it describes df_test.
total_test = df_test.isnull().sum().sort_values(ascending=False)
percent_test = (df_test.isnull().sum()/df_test.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_test, percent_test], axis=1, keys=['Total', 'Percent'])

# Drop test columns that have more than 70 missing values.
# NOTE(review): the thresholds differ between train (80) and test (70), so the
# two frames may end up with different column sets - confirm this is intended.
df_test = df_test.drop(columns=missing_data[missing_data['Total'] > 70].index)

In [None]:
# Columns stored with the `object` dtype are treated as categorical.

# Training frame: boolean mask over the columns, then the matching column
# labels (kept as a pandas Index).
categorical_feature_mask = df_train.dtypes == object
categorical_cols = df_train.columns[categorical_feature_mask]

# Test frame: same mask, but the matching labels are stored as a plain list.
categorical_feature_mask_test = df_test.dtypes == object
categorical_cols_test = list(df_test.columns[categorical_feature_mask_test])

In [None]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# Integer-encode every categorical column with ONE mapping shared by the
# training and test frames. Fitting the encoder separately on each frame would
# assign codes from each frame's own sorted set of values, so the same category
# could receive different integers in train and test whenever the two sets
# differ.
# Values are cast to str first, so missing entries become the literal category
# 'nan' rather than being left as NaN.
train_categorical = set(categorical_cols)
test_categorical = set(categorical_cols_test)
for col in sorted(train_categorical | test_categorical):
    # A column may be categorical in only one of the frames (for example
    # because it was dropped from the other); encode whatever is available.
    train_values = df_train[col].astype(str) if col in train_categorical else None
    test_values = df_test[col].astype(str) if col in test_categorical else None
    observed = pd.concat(
        [values for values in (train_values, test_values) if values is not None],
        ignore_index=True,
    )
    labelencoder.fit(observed)
    if train_values is not None:
        df_train[col] = labelencoder.transform(train_values)
    if test_values is not None:
        df_test[col] = labelencoder.transform(test_values)




In [None]:
# The 20 training columns with the most remaining null cells, largest first.
(
    df_train
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Replace missing MasVnrArea values in the training frame with the column mean.
# (The same mean-imputation for GarageYrBlt and LotFrontage is disabled.)
masvnr_area_mean = df_train['MasVnrArea'].mean()
df_train['MasVnrArea'] = df_train['MasVnrArea'].fillna(masvnr_area_mean)

In [None]:
# Heatmap of the k columns most positively correlated with SalePrice.
k = 15  # number of columns shown (SalePrice itself is one of them)
plt.figure(figsize=(16, 8))

# Rank columns by their correlation with SalePrice and keep the top k labels.
corrmat = df_train.corr()
cols = corrmat.nlargest(k, 'SalePrice').index

# Pairwise correlation of just those columns (each column is one variable).
cm = np.corrcoef(df_train[cols].to_numpy(), rowvar=False)

sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

In [None]:
# Keep only the columns chosen by the correlation ranking (SalePrice included),
# then display the selected labels.
df_train = df_train.loc[:, cols]
cols



In [None]:
# Restrict the test frame to the selected feature columns (everything in
# `cols` except the target).
feature_cols = cols.drop('SalePrice')

# Fail with a readable message if a selected column is not available in the
# test frame, instead of a bare KeyError from the indexing below.
absent_from_test = feature_cols.difference(df_test.columns)
assert absent_from_test.empty, (
    'selected features missing from df_test: %s' % list(absent_from_test)
)

# .copy() makes df_test an independent frame, so later column assignments do
# not raise SettingWithCopyWarning.
df_test = df_test[feature_cols].copy()
print(df_test.shape)
df_test.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# Replace missing values in every numeric test column with that column's mean.
# Filling all columns at once (instead of a hand-written list of column names)
# keeps this cell valid whichever features the correlation ranking selected:
# no KeyError for a name that was not selected, and no NaNs left behind in a
# column that was not listed.
df_test = df_test.fillna(df_test.mean(numeric_only=True))

# The models cannot predict on NaNs, so check that nothing is left.
assert not df_test.isnull().values.any(), 'df_test still contains missing values'


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training frame for evaluation; the fixed random_state
# makes the split repeatable.
features = df_train.drop('SalePrice', axis=1)
target = df_train['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=101,
)

In [None]:
# StandardScaler expects 2-D input, so turn the target Series into column
# vectors of shape (n_samples, 1).
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()  # feature scaler
sc_y = StandardScaler()  # target scaler

# Each scaler is fitted ONCE, on training data only, and then merely applied to
# the hold-out data:
#   * sc_X must keep the feature statistics (refitting it on other arrays
#     would overwrite them), and
#   * sc_y must keep the training-target statistics, because it is reused to
#     map predictions back to prices via inverse_transform.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

# Peek at the first few standardised targets rather than dumping the array.
y_train[:5]

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression model with scikit-learn's default settings; it is only
# constructed here, not fitted.
lm = LinearRegression()

In [None]:
# Fit the linear model on the training split and predict the hold-out split.
lm.fit(X_train, y_train)
print(lm)
predictions = lm.predict(X_test).reshape(-1, 1)

# Actual vs. predicted hold-out targets.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(y_test, predictions)
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
from sklearn import metrics

# Mean squared error of the linear model on the hold-out split.
lm_mse = metrics.mean_squared_error(y_test, predictions)
print('MSE:', lm_mse)


In [None]:
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score

# Gradient-boosted regression trees.
# The loss is left at scikit-learn's default (squared error): the old spelling
# 'ls' is no longer accepted by recent scikit-learn releases, and omitting the
# key selects the same loss on old and new versions alike.
# random_state pins the estimator's internal randomness so reruns match.
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'random_state': 0}
clf = ensemble.GradientBoostingRegressor(**params)

# The estimator expects a 1-D target; np.ravel flattens an (n, 1) column
# vector and avoids scikit-learn's DataConversionWarning.
clf.fit(X_train, np.ravel(y_train))

In [None]:
# Hold-out predictions of the gradient-boosting model as a column vector,
# followed by its error metrics.
clf_pred = clf.predict(X_test).reshape(-1, 1)

clf_mae = metrics.mean_absolute_error(y_test, clf_pred)
clf_mse = metrics.mean_squared_error(y_test, clf_pred)
print('MAE:', clf_mae)
print('MSE:', clf_mse)
print('RMSE:', np.sqrt(clf_mse))

In [None]:
# Actual vs. predicted hold-out targets for the gradient-boosting model.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(y_test, clf_pred, c='brown')
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 500 trees; random_state makes the fit repeatable.
rfr = RandomForestRegressor(n_estimators = 500, random_state = 0)

# The forest expects a 1-D target; np.ravel flattens an (n, 1) column vector
# and avoids scikit-learn's DataConversionWarning.
rfr.fit(X_train, np.ravel(y_train))

# Hold-out predictions as a column vector, matching the other models.
rfr_pred = rfr.predict(X_test).reshape(-1, 1)

In [None]:
# Actual vs. predicted hold-out targets for the random forest.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(y_test, rfr_pred, c='orange')
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
import lightgbm as lgb

# LightGBM regressor; hyper-parameters are collected in one mapping so they can
# be read and tuned in a single place.
lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=500,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
model_lgb = lgb.LGBMRegressor(**lgb_params)
model_lgb.fit(X_train, y_train)

In [None]:
# Hold-out predictions of the LightGBM model as a column vector.
lgb_pred = model_lgb.predict(X_test).reshape(-1, 1)

# Actual vs. predicted hold-out targets.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(y_test, lgb_pred, c='orange')
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
# Hold-out MSE of every fitted model, in the order the models were trained.
# The LightGBM predictions are included so that all four models are compared.
model_names = ['LinearRegression', 'GradientBoosting', 'RandomForest', 'LightGBM']
model_preds = [predictions, clf_pred, rfr_pred, lgb_pred]
error_rate = np.array([metrics.mean_squared_error(y_test, pred) for pred in model_preds])

plt.figure(figsize=(16,5))
plt.plot(error_rate, marker='o')
# Label each point with its model so the figure can be read on its own.
plt.xticks(range(len(model_names)), model_names)
plt.xlabel('Model')
plt.ylabel('MSE on hold-out split')
plt.title('Hold-out error by model')
plt.show()

In [None]:
# Re-read only the Id column of the test CSV; `a` is the one-column frame that
# the predictions are later attached to.
a = pd.read_csv('../input/test.csv', usecols=['Id'])
test_id = a['Id']
a = pd.DataFrame(test_id, columns=['Id'])

# Standardise the test features with statistics learned from the TRAINING
# frame's feature columns, not from the test rows themselves: fitting a scaler
# on the test set would give it its own mean/variance and shift the inputs
# relative to what the models were trained on.
# NOTE(review): the scaler is fitted on all rows of df_train, while the models
# are trained on a 70% split of it, so the statistics are close to, but not
# necessarily identical to, the ones used during training - confirm this is
# acceptable.
feature_scaler = StandardScaler().fit(df_train[df_test.columns])
test = feature_scaler.transform(df_test)
test.shape

In [None]:
# Predict with the gradient-boosting model on the STANDARDISED test features
# (`test`). The model was fitted on standardised inputs, so feeding it the raw
# `df_test` values would put every feature on the wrong scale.
test_prediction_clf = clf.predict(test)
# Column vector, as required by the scaler's inverse_transform.
test_prediction_clf = test_prediction_clf.reshape(-1, 1)





In [None]:
# Undo the target standardisation so predictions are on the SalePrice scale,
# then wrap them in a single-column frame and preview the first rows.
price_scale_pred = sc_y.inverse_transform(test_prediction_clf)
test_prediction_clf = pd.DataFrame(price_scale_pred, columns=['SalePrice'])
test_prediction_clf.head()


In [None]:
# Put the Id column and the predicted SalePrice column side by side and write
# the submission file without the index.
submission_parts = [a, test_prediction_clf]
result = pd.concat(submission_parts, axis=1)
result.to_csv('submission.csv', index=False)
