In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm,skew
from xgboost import XGBRegressor

In [None]:
# Load the Kaggle "House Prices" train and test splits from the competition input directory.
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

In [None]:
# Summary statistics of the raw target column.
df_train['SalePrice'].describe()

In [None]:
# Histogram of the raw SalePrice values.
sns.histplot(df_train['SalePrice'])

In [None]:
# Histogram of log1p(SalePrice); the same transform is applied to the target column below.
sns.histplot(np.log1p(df_train['SalePrice']))

In [None]:
# Replace the target with log1p(SalePrice); predictions are mapped back with np.expm1 later on.
# NOTE(review): this overwrites the column it reads, so re-running the cell applies log1p again.
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

In [None]:
# Pairwise correlations between the numeric columns, then each column's correlation
# with the (log-transformed) target, strongest first.
# The frame still contains string columns at this point; DataFrame.corr() no longer
# drops those implicitly in pandas >= 2.0 and raises instead, so restrict to numeric
# dtypes explicitly (this also works on older pandas versions).
corr = df_train.select_dtypes(include='number').corr()
corr['SalePrice'].sort_values(ascending = False)

In [None]:
# Keep the (log-transformed) target and the test ids aside, then stack train and test
# rows so that the cleaning/encoding steps below are applied to both consistently.
y = df_train['SalePrice']
test_id = df_test['Id']
all_df = pd.concat([df_train, df_test], axis=0, sort=False)

# drop() returns a new frame. The result must be assigned back, otherwise 'Id' and
# 'SalePrice' stay in all_df and 'Id' ends up being used as a model feature.
all_df = all_df.drop(['Id', 'SalePrice'], axis=1)
all_df.head()


In [None]:
# Missing-value count per column, largest first, presented as a one-column table.
Total = all_df.isnull().sum().sort_values(ascending=False)
missing_data = Total.to_frame(name='Total')
missing_data.head(30)

In [None]:
# Remove every column that has more than five missing values, then report the
# largest remaining per-column missing count.
sparse_columns = missing_data.loc[missing_data['Total'] > 5].index
all_df = all_df.drop(columns=sparse_columns)
all_df.isnull().sum().max()

In [None]:
# Re-count missing values per column after the drop, largest first.
missing_counts = all_df.isna().sum()
total = missing_counts.sort_values(ascending=False)
total.head(30)

In [None]:
# Basement / garage numeric columns: fill the few missing entries with 0.
# NOTE(review): presumably a missing value means the house has no basement/garage — confirm.
numeric_missed = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageArea',
    'GarageCars',
]
all_df[numeric_missed] = all_df[numeric_missed].fillna(0)

In [None]:
# Categorical columns with a handful of gaps: fill each with its most frequent value.
categorical_missed = [
    'Exterior1st',
    'Exterior2nd',
    'SaleType',
    'MSZoning',
    'Electrical',
    'KitchenQual',
]
# mode()[0] picks the first mode of each column.
mode_by_column = {name: all_df[name].mode()[0] for name in categorical_missed}
all_df = all_df.fillna(value=mode_by_column)

In [None]:
# Missing 'Functional' entries are set to 'Typ'.
# NOTE(review): presumably 'Typ' is the dataset's "typical" level — confirm against the data description.
functional = all_df['Functional']
all_df['Functional'] = functional.where(functional.notna(), 'Typ')

In [None]:
# 'Utilities' is removed from the feature set entirely.
all_df = all_df.drop(columns=['Utilities'])

In [None]:
# Largest per-column missing count after all fills; 0 means no gaps remain.
all_df.isna().sum().max()

In [None]:
# One-hot encode the remaining categorical (string) columns; numeric columns pass through.
# NOTE(review): this overwrites all_df, so the un-encoded frame is no longer available afterwards.
all_df = pd.get_dummies(all_df)
# Bare expression: renders the encoded frame as the cell output.
all_df

In [None]:
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()

# Standardise every feature column over the stacked train+test frame. The result is
# wrapped in a fresh DataFrame, so it gets a 0..n-1 row index and integer column labels.
all_scaled = pd.DataFrame(Scaler.fit_transform(all_df))

# Split back into train and test by position. The boundary is taken from the
# training frame itself rather than a hard-coded row count, so the cell stays
# correct if the input files change.
n_train = len(df_train)
train_cleaned = all_scaled.iloc[:n_train]
test_cleaned = all_scaled.iloc[n_train:]

# Guard against a mismatch between the stacked frame and its source frames.
assert len(train_cleaned) == len(df_train)
assert len(test_cleaned) == len(df_test)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation; the seed keeps the split reproducible.
split_parts = train_test_split(train_cleaned, y, test_size=0.3, random_state=20)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Sanity-check the split sizes: features then target, for train and for validation.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

In [None]:
# Ordinary least-squares baseline on the log1p(SalePrice) target.
# fit() returns the estimator itself, so the chained form binds the fitted model.
reg = LinearRegression().fit(x_train, y_train)
reg

In [None]:
# Validation score of the linear model on the held-out 30% (R^2 per scikit-learn's regressor API).
reg.score(x_test,y_test)

In [None]:
# Ridge regression on the log1p(SalePrice) target.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2, so
# passing it raises a TypeError on current releases. The features were already
# standardised by the StandardScaler step, and scikit-learn's migration guidance for
# Ridge is to scale alpha by the number of training samples; on these pre-scaled
# features that approximates the penalty the old `normalize=True` applied.
rdg = Ridge(alpha = 0.7 * len(x_train))
rdg.fit(x_train,y_train)

In [None]:
# Validation score of the ridge model on the held-out 30% (R^2 per scikit-learn's regressor API).
rdg.score(x_test,y_test)

In [None]:
# Elastic-net regression with library-default penalty settings; only the seed is fixed.
# fit() returns the estimator itself, so the chained form binds the fitted model.
eln = ElasticNet(random_state=3).fit(x_train, y_train)
eln

In [None]:
# Validation score of the elastic-net model on the held-out 30% (R^2 per scikit-learn's regressor API).
eln.score(x_test,y_test)

In [None]:
# Submission 1: linear-regression predictions for the test rows.
# The model was trained on log1p(SalePrice), so expm1 maps predictions back to prices.
y_pred_reg = pd.DataFrame(np.expm1(reg.predict(test_cleaned)))
y_pred = pd.DataFrame()
y_pred['SalePrice'] = y_pred_reg[0]
y_pred['Id'] = df_test['Id']
# The extension was misspelled as '.cvs'; every other submission cell writes '.csv'.
y_pred.to_csv('attempt1.csv', index = False)

In [None]:
# Submission 2: ridge predictions for the test rows, mapped back from log1p space.
ridge_prices = np.expm1(rdg.predict(test_cleaned))
y_pred_rdg = pd.DataFrame(ridge_prices)
# Column order (SalePrice, Id) follows dict insertion order.
y1_pred = pd.DataFrame({'SalePrice': y_pred_rdg[0], 'Id': df_test['Id']})
y1_pred.to_csv('attempt2.csv', index=False)

In [None]:
# Submission 3: elastic-net predictions for the test rows, mapped back from log1p space.
elastic_prices = np.expm1(eln.predict(test_cleaned))
y_pred_eln = pd.DataFrame(elastic_prices)
# Column order (SalePrice, Id) follows dict insertion order.
y2_pred = pd.DataFrame({'SalePrice': y_pred_eln[0], 'Id': df_test['Id']})
y2_pred.to_csv('attempt3.csv', index=False)

In [None]:
import xgboost as XGB

In [None]:
# Gradient-boosted trees on the log1p(SalePrice) target.
# Three arguments from the original call are dropped:
#   * silent=True  - superseded by `verbosity` (already set to 0 here);
#   * nthread=-1   - deprecated alias of `n_jobs`, and it contradicted n_jobs=4;
#   * gpu_id=-1    - the CPU default; tree_method='exact' is a CPU method and newer
#                    xgboost releases deprecate this argument in favour of `device`.
# All remaining hyper-parameters are unchanged.
model = XGB.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4603, gamma=0.0468,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=3,
             min_child_weight=1.7817, monotone_constraints='()',
             n_estimators=2200, n_jobs=4, num_parallel_tree=1,
             random_state=7, reg_alpha=0.464, reg_lambda=0.8571,
             scale_pos_weight=1, subsample=0.5213, tree_method='exact',
             validate_parameters=1, verbosity=0)
model.fit(x_train,y_train)

In [None]:
# Validation score of the XGBoost model on the held-out 30%.
model.score(x_test,y_test)

In [None]:
# Submission 4: XGBoost predictions for the test rows, mapped back from log1p space.
xgb_prices = np.expm1(model.predict(test_cleaned))
y_pred_XGB = pd.DataFrame(xgb_prices)
# Column order (SalePrice, Id) follows dict insertion order.
y3_pred = pd.DataFrame({'SalePrice': y_pred_XGB[0], 'Id': df_test['Id']})
y3_pred.to_csv('attempt4.csv', index=False)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with tree depth capped at 15 and a fixed seed.
# fit() returns the estimator itself, so the chained form binds the fitted model.
regr = RandomForestRegressor(max_depth=15, random_state=0).fit(x_train, y_train)
regr

In [None]:
# Validation score of the random forest on the held-out 30% (R^2 per scikit-learn's regressor API).
regr.score(x_test,y_test)

In [None]:
# Submission 5: random-forest predictions for the test rows, mapped back from log1p space.
forest_prices = np.expm1(regr.predict(test_cleaned))
y_pred_regr = pd.DataFrame(forest_prices)
# Column order (SalePrice, Id) follows dict insertion order.
y4_pred = pd.DataFrame({'SalePrice': y_pred_regr[0], 'Id': df_test['Id']})
y4_pred.to_csv('attempt5.csv', index=False)