In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read train.csv into a DataFrame and preview its first five rows
train = pd.read_csv(filepath_or_buffer='../input/house-prices-advanced-regression-techniques/train.csv')
train.head(n=5)

In [None]:
# Scatter of SalePrice against LotArea on a 10x5-inch figure
plt.figure(figsize=(10, 5))
sns.scatterplot(x='LotArea', y='SalePrice', data=train, ax=plt.gca());

In [None]:
# Number of rows per SaleCondition value, most frequent first
train.loc[:, 'SaleCondition'].value_counts()

In [None]:
# Count each SaleCondition once and take the wedge labels from the counts' own index.
# Hand-typed labels are only matched to the wedges by position, so they would silently
# name the wrong slice if the frequency ordering of the data ever changed.
sale_condition_counts = train['SaleCondition'].value_counts()
labels = list(sale_condition_counts.index)

colors = ['gold', 'silver', 'red', 'maroon', 'grey', 'green']

fig, ax = plt.subplots(figsize=(10, 5))
ax.pie(
    sale_condition_counts,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=30,
)
ax.set_title('Share of sales by SaleCondition')
# Equal axis scaling keeps the pie circular instead of elliptical
ax.axis('equal')
plt.show()


In [None]:
# Lift pandas' column display limit so wide frames are shown without elided columns
pd.options.display.max_columns = None

In [None]:
# Preview the first rows again, after the column display limit was lifted
train.head()

## Data Cleaning

In [None]:
# Missing-value count for every column
train.isnull().sum()

In [None]:
# Column dtypes: shows which columns are numeric and which are not
train.dtypes

In [None]:
# Helper that fills missing numeric values and turns non-numeric columns into integer codes

def preprocess_data(df):
    """Return a fully numeric copy of ``df`` with no missing values in encoded columns.

    Numeric columns that contain missing values have them replaced by that
    column's median. Every non-numeric column is converted with
    ``pd.Categorical`` and replaced by its integer codes shifted up by one,
    so a missing entry becomes 0 and the observed categories become 1..k.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame is not overwritten as a side effect.
    processed = df.copy()

    for label, content in df.items():
        if pd.api.types.is_numeric_dtype(content):
            if content.isna().any():
                processed[label] = content.fillna(content.median())
        else:
            # Categorical codes use -1 for missing values; +1 moves that to 0.
            processed[label] = pd.Categorical(content).codes + 1

    return processed

In [None]:
# Swap the raw training frame for its imputed, integer-encoded version and preview it
train = preprocess_data(train)
train.head(n=5)

In [None]:
# Re-count missing values per column now that preprocessing has run
train.isnull().sum()

In [None]:
# Column dtypes after preprocessing, to confirm the non-numeric columns were encoded
train.dtypes

## Split the dataset

In [None]:
# Predictors are every column except the target; the target is SalePrice
X = train.drop(columns='SalePrice')
y = train['SalePrice']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. random_state pins the shuffle so the same
# rows land in the validation set on every run and model scores stay comparable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


## XGBoost

In [None]:
from xgboost import XGBRegressor

np.random.seed(42)

# Default-parameter XGBoost regressor fitted on the training split
reg = XGBRegressor().fit(X_train, y_train)
y_preds = reg.predict(X_val)

# Score of the fitted regressor on the validation split
reg.score(X_val, y_val)

## RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Default-parameter random forest fitted on the training split
reg2 = RandomForestRegressor().fit(X_train, y_train)
y_preds = reg2.predict(X_val)

# Score of the fitted regressor on the validation split
reg2.score(X_val, y_val)

## GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

np.random.seed(42)

# Default-parameter gradient boosting regressor fitted on the training split
reg3 = GradientBoostingRegressor().fit(X_train, y_train)
y_preds = reg3.predict(X_val)

# Score of the fitted regressor on the validation split
reg3.score(X_val, y_val)

## CatBoostRegressor

In [None]:
from catboost import CatBoostRegressor

np.random.seed(42)

# CatBoost regressor with training output silenced (verbose=0), fitted on the training split
reg4 = CatBoostRegressor(verbose=0).fit(X_train, y_train)
# Kept under its own name because the evaluation cell reuses these predictions
y_preds4 = reg4.predict(X_val)

validation_score = reg4.score(X_val, y_val)
print(validation_score)

## Evaluation metrics

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the CatBoost validation predictions.
# The root is taken explicitly because the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and later removed, which makes that call raise a TypeError.
RMSE = np.sqrt(mean_squared_error(y_val, y_preds4))
RMSE

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each CatBoost hyperparameter; the search draws combinations from these lists
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200]}

# Evaluate 10 sampled combinations with 3-fold cross-validation.
# random_state pins which combinations are drawn, so the search (and the best_params_
# it reports) does not depend on whatever global RNG state earlier cells left behind.
ideal_model = RandomizedSearchCV(CatBoostRegressor(verbose=0),
                                param_distributions=params,
                                cv=3,
                                n_iter=10,
                                random_state=42,
                                verbose=True)


In [None]:
# Run the randomized search on the training split
ideal_model.fit(X_train, y_train)

In [None]:
# Predict the validation split with the fitted search object
ideal_model.predict(X_val)

In [None]:
# Hyperparameter combination the search selected
ideal_model.best_params_

In [None]:
# Build the tuned regressor straight from the parameters the search selected. A hand-copied
# set of values only matches the search that produced it and goes stale as soon as the
# search is re-run with a different outcome.
model = CatBoostRegressor(**ideal_model.best_params_, verbose=0)
model.fit(X_train, y_train)
model.predict(X_val)

## Import Test Dataset

In [None]:
# Read test.csv into a DataFrame and preview its first five rows
test = pd.read_csv(filepath_or_buffer='../input/house-prices-advanced-regression-techniques/test.csv')
test.head(n=5)

## Data Cleaning

In [None]:
# Encode the test set with the category lists of the raw training data.
# preprocess_data derives integer codes from whatever categories the frame it is given
# contains, so encoding test on its own assigns different integers to the same label
# whenever the two files do not contain exactly the same set of values in a column.
train_raw = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
for col in test.columns:
    if not pd.api.types.is_numeric_dtype(test[col]):
        # Same category inference preprocess_data applied to the training column
        train_categories = pd.Categorical(train_raw[col]).categories
        # Values absent from the training data become missing and are encoded as 0
        test[col] = pd.Categorical(test[col], categories=train_categories)

test = preprocess_data(df=test)
test.head()

## Modelling with best model

In [None]:
# `model` is the tuned CatBoost regressor already fitted on (X_train, y_train) in the
# hyperparameter-tuning section; reuse it rather than retraining an identical copy.
predictions = model.predict(test)

In [None]:
# Predicted SalePrice values for the test rows
predictions

In [None]:
# Load the provided sample submission to see the expected file layout
sample = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample

In [None]:
# Two-column frame: each test row's Id next to its predicted SalePrice
Submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})
Submission

In [None]:
# Write the submission without the index column. The file name carries a .csv extension
# so it is recognisable as a CSV file.
Submission.to_csv('Sample_Submission.csv', index=False)