In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)


from scipy import stats
from scipy.stats import norm, skew #for some statistics


pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

In [None]:
# Load the competition's training and test sets from the Kaggle input mount.
train = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
)

In [None]:
# Let wide frames display up to 500 columns instead of being truncated.
pd.options.display.max_columns = 500

In [None]:
# Preview the first two training rows.
train.head(2)

In [None]:
# Scatter of living area against sale price, drawn before the outlier filter.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_xlabel("GrLivArea")
ax.set_ylabel("SalePrice")
plt.show()

In [None]:
# Remove the extreme-GrLivArea outliers. `.copy()` gives `train` its own data,
# so later column assignments neither raise SettingWithCopyWarning nor get
# lost on a temporary view.
train = train[train['GrLivArea'] < 4500].copy()

# Assign the filled column back: `train[col].fillna(..., inplace=True)` acts on
# an intermediate Series and does not update the frame under pandas
# copy-on-write.
train['MiscFeature'] = train['MiscFeature'].fillna('None')

In [None]:
# Histogram + KDE of the raw sale price, with a fitted normal for reference.
sns.displot(train['SalePrice'], kde=True)
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: `\m` and `\s` are invalid escape sequences in a normal string
# literal (SyntaxWarning on Python 3.12+); the rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.title('SalePrice distribution')

# Q-Q plot against the normal distribution on a separate figure.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

In [None]:
# Log-transform the target with log1p and repeat the normality diagnostics.
# `a` is kept as a notebook-level name because a later cell assigns it back
# to train['SalePrice'].
a = np.log1p(train['SalePrice'])
sns.displot(a, kde=True)
(mu, sigma) = norm.fit(a)
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: `\m` and `\s` are invalid escape sequences in a normal string
# literal (SyntaxWarning on Python 3.12+); the rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
# The plotted data is the transformed target, so the title says so.
plt.title('log1p(SalePrice) distribution')

# Q-Q plot of the transformed target on a separate figure.
fig = plt.figure()
res = stats.probplot(a, plot=plt)
plt.show()

In [None]:
# Replace SalePrice with `a`, the log1p-transformed prices computed in the
# previous cell. From here on the target lives on the log scale.
train['SalePrice'] = a

In [None]:
# Names of the training columns that contain at least one missing value.
train.columns[train.isna().any()]

In [None]:
# A missing MiscFeature is encoded as the explicit category 'None'.
# The result is assigned back instead of using `inplace=True` on the selected
# column: the in-place form acts on an intermediate Series and does not update
# the frame under pandas copy-on-write.
train['MiscFeature'] = train['MiscFeature'].fillna('None')
test['MiscFeature'] = test['MiscFeature'].fillna('None')

In [None]:
# Row counts and the training target, captured before the frames are combined.
ntrain, ntest = len(train), len(test)
y_train = train['SalePrice'].values

# Stack train on top of test with a fresh 0..n-1 index and drop the target,
# leaving a features-only frame for inspecting missing values.
all_data = pd.concat([train, test], ignore_index=True).drop(columns='SalePrice')

In [None]:
# (rows, columns) of the combined train + test feature frame.
all_data.shape

In [None]:
# Percentage of missing values per column of the combined frame.
null_pct = all_data.isnull().sum() / len(all_data) * 100

# Keep only columns that have missing values, largest ratio first, top 30.
all_data_na = null_pct[null_pct != 0].sort_values(ascending=False).head(30)
missing_data = all_data_na.to_frame(name='Missing Ratio')

In [None]:
# Column names with missing values (at most 30), ordered by descending missing ratio.
missing_data.index

In [None]:
# Fill missing LotFrontage with the median of the house's neighborhood,
# computed separately within each frame.
# `transform` returns a Series aligned to the original row index. On
# pandas >= 2.0 `groupby(...).apply` prepends the group key to the index, so
# assigning its result back to the column fails.
train['LotFrontage'] = (
    train.groupby("Neighborhood")['LotFrontage']
    .transform(lambda s: s.fillna(s.median()))
)
test['LotFrontage'] = (
    test.groupby("Neighborhood")['LotFrontage']
    .transform(lambda s: s.fillna(s.median()))
)

In [None]:
# Categorical columns whose missing values are filled with the explicit
# category "None".
columns_to_fillna = ('PoolQC', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType')
for col in columns_to_fillna:
    # Assign back rather than `fillna(..., inplace=True)` on the selected
    # column: the in-place form acts on an intermediate Series and does not
    # update the frame under pandas copy-on-write.
    train[col] = train[col].fillna("None")
    test[col] = test[col].fillna("None")

In [None]:
# Numeric garage/basement/masonry columns whose missing values are filled with 0.
columns_to_fill_zero = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                        'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea')
for col in columns_to_fill_zero:
    # Assign back rather than `fillna(..., inplace=True)` on the selected
    # column: the in-place form acts on an intermediate Series and does not
    # update the frame under pandas copy-on-write.
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)

In [None]:
# Drop the Utilities column from both frames.
train, test = [frame.drop(columns='Utilities') for frame in (train, test)]

# Columns whose gaps are filled with each frame's own most frequent value.
fill_as_mode = ('MSZoning', 'SaleType', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd')

for frame in (train, test):
    # Fixed fill values: "Typ" for Functional, "None" for MSSubClass.
    frame["Functional"] = frame["Functional"].fillna("Typ")
    frame['MSSubClass'] = frame['MSSubClass'].fillna("None")
    for col in fill_as_mode:
        frame[col] = frame[col].fillna(frame[col].mode()[0])

In [None]:
for frame in (train, test):
    # Total square footage: basement plus first and second floors.
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']

    # Cast these numeric codes to strings so they stop being numeric columns.
    frame['MSSubClass'] = frame['MSSubClass'].apply(str)
    for col in ('OverallCond', 'YrSold', 'MoSold'):
        frame[col] = frame[col].astype(str)

In [None]:
# Every non-object column of train counts as a numeric feature here.
numeric_feats = train.columns[(train.dtypes != "object").to_numpy()]

# Sample skewness per numeric column (NaNs ignored), most skewed first.
skewed_feats = (
    train[numeric_feats]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)
print("\nSkew in numerical features: \n")
skewness = skewed_feats.to_frame(name='Skew')
skewness.head(10)

In [None]:
# Preview the first two test rows after imputation and feature engineering.
test.head(2)

In [None]:
from scipy.special import boxcox1p

lam = 0.15
# `skewness.index` lists every numeric training column, which includes the
# target 'SalePrice'. The target is therefore Box-Cox transformed as well, on
# top of the earlier log1p, and predictions must be inverted through both.
skewed_features = skewness.index
for feat in skewed_features:
    train[feat] = boxcox1p(train[feat], lam)
    # The test frame has no target column, so only train is transformed for it.
    if feat != 'SalePrice':
        test[feat] = boxcox1p(test[feat], lam)

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Names of the object-dtype columns, passed to CatBoost as categorical features.
cat_features = train.columns[(train.dtypes == "object").to_numpy()]
cat_features

In [None]:

# Training matrix: every column except the target and the row identifier.
train_features = train.drop(columns=['SalePrice', 'Id'])
train_target = train['SalePrice']

model = CatBoostRegressor(
    iterations=20,
    learning_rate=0.1,
    depth=5,
    cat_features=cat_features,
)

# Fit on the transformed target, then score the test set without its Id column.
model.fit(train_features, train_target)
preds = model.predict(test.drop(columns='Id'))

In [None]:
from scipy.special import inv_boxcox1p

# The model was trained on boxcox1p(log1p(price), lam), so the predictions are
# mapped back by undoing the two transforms in reverse order.
# `inv_boxcox1p` is the inverse of `boxcox1p`. `inv_boxcox` inverts plain
# `boxcox` and returns x + 1 here, which after `expm1` inflates every predicted
# price by roughly a factor of e.
pred_inv = inv_boxcox1p(preds, lam)   # back to the log1p scale
pred_true = np.expm1(pred_inv)        # back to the original price scale

In [None]:
# Display the training target as currently stored in `train` (transformed scale).
train['SalePrice']

In [None]:
# Display the predictions after the inverse transforms, for a quick sanity check.
pred_true

In [None]:
# Re-read the untouched test file (the working `test` frame has been
# transformed) and append the predicted prices as a SalePrice column.
test_1 = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
).assign(SalePrice=pred_true)

In [None]:
# Write the two-column submission file (Id, SalePrice) without the row index.
submission = test_1.loc[:, ['Id', 'SalePrice']]
submission.to_csv("predict_ped.csv", index=False)