<a href="https://colab.research.google.com/github/farcryson/House_Prices_Prediction/blob/main/Adv_House_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **House Price Prediction Problem**
**Dataset Link:** https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview

# **Importing Libraries**

In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# **Importing Dataset**

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
# NOTE(review): these are absolute Kaggle paths; the notebook will not find them when run elsewhere (e.g. Colab).
dataset_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
dataset_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Preview the first rows of the training data.
dataset_train.head()

In [None]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
dataset_train.info()

In [None]:
# Distribution of the target variable. `sns.distplot` is deprecated in seaborn,
# so use `histplot` with a KDE overlay on a density scale, which draws the equivalent figure.
sns.histplot(dataset_train['SalePrice'], kde=True, stat='density');

# **Data Cleaning**

In [None]:
# Feature frame for training: the raw training data without the row identifier and the target.
# `drop` without inplace returns a new frame, so `dataset_train` itself is left untouched.
df_train_clean = dataset_train.drop(columns=['Id', 'SalePrice'])

In [None]:
# Feature frame for the test set: the raw test data without the row identifier.
# `drop` without inplace returns a new frame, so `dataset_test` keeps its 'Id' column.
df_test_clean = dataset_test.drop(columns='Id')

In [None]:
 # Threshold equal to half the number of training rows; per-column non-null counts are compared against it below.
 lim = 0.5*len(df_train_clean)

In [None]:
# Display the threshold value.
lim

# **Imputation (filling null values)**

In [None]:
# Columns whose non-null count in the training data is below `lim` are removed
# from both the training and the test feature frames.
sparse_cols = [
    name for name in df_train_clean.columns
    if df_train_clean[name].count() < lim
]
df_train_clean.drop(columns=sparse_cols, inplace=True)
df_test_clean.drop(columns=sparse_cols, inplace=True)

In [None]:
# Inspect the training features that remain after dropping sparse columns.
df_train_clean.info()

In [None]:
# Inspect the test features that remain after dropping sparse columns.
df_test_clean.info()

In [None]:
def fill(data):
  """Impute missing values of ``data`` in place.

  Numeric (non-boolean) columns are filled with the column mean; object/string
  columns are filled with the column's most frequent value. Columns of any other
  dtype, columns without missing values, and columns for which no fill value can
  be computed (e.g. an entirely missing categorical column) are left untouched.

  Args:
    data: pandas DataFrame to modify in place.

  Returns:
    None. The frame passed in is updated.
  """
  dtypes = pd.api.types
  for col in data.columns:
    column = data[col]
    if not column.isna().any():
      continue  # nothing to impute in this column

    if dtypes.is_numeric_dtype(column) and not dtypes.is_bool_dtype(column):
      fill_value = column.mean()
    elif dtypes.is_object_dtype(column) or dtypes.is_string_dtype(column):
      modes = column.mode()
      if modes.empty:
        continue  # every value is missing, so there is no mode to fill with
      fill_value = modes.iloc[0]
    else:
      continue

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `data[col]`: the chained in-place form operates on a temporary object and
    # does not update `data` when pandas Copy-on-Write is active.
    data[col] = column.fillna(fill_value)

In [None]:
# Impute missing values in the training features (modifies df_train_clean in place).
fill(df_train_clean)

In [None]:
# Impute missing values in the test features (modifies df_test_clean in place).
# NOTE(review): `fill` derives the mean/mode from the frame it is given, so the test set is
# imputed with its own statistics rather than those of the training set.
fill(df_test_clean)

In [None]:
# Re-check non-null counts of the training features after imputation.
df_train_clean.info()

In [None]:
# Re-check non-null counts of the test features after imputation.
df_test_clean.info()

# **One Hot Encoding categorical columns**

We concatenate the train and test datasets because some category values appear in the columns of only one of them. Encoding the combined frame ensures that both datasets end up with the same one-hot encoded columns.

In [None]:
# Stack the training rows on top of the test rows so both are encoded together.
# The original row indices are kept (not reset), so later splitting is done by position.
mixed_df = pd.concat([df_train_clean, df_test_clean])

In [None]:
# Names of the categorical (object-dtype) columns of the combined frame, in column order.
cat_col = [name for name in mixed_df.columns if mixed_df[name].dtype == 'object']

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to avoid redundant columns.
# `get_dummies` already returns a DataFrame, so no extra wrapper is needed. The dummy columns are
# requested as floats so the frame stays purely numeric: recent pandas versions default to bool
# dummies, which would turn `.values` of the mixed frame into an object array.
df_encoded = pd.get_dummies(mixed_df, columns=cat_col, drop_first=True, dtype=float)

In [None]:
# Preview the first rows of the encoded frame.
df_encoded.head()

Now we split the encoded data back into the train and test datasets.

In [None]:
# The encoded frame holds the training rows first, followed by the test rows.
# Split at the actual number of training rows rather than a hard-coded count,
# so the split stays correct if the training data changes.
n_train_rows = len(df_train_clean)
final_df_train = df_encoded.iloc[:n_train_rows, :]
final_df_test = df_encoded.iloc[n_train_rows:, :]

In [None]:
# Display the encoded training features.
final_df_train

In [None]:
# Display the encoded test features.
final_df_test

In [None]:
# Remove the row identifier before the correlation analysis. Reassigning the result and
# ignoring an already-missing 'Id' keeps this cell safe to re-run (an in-place drop
# raises KeyError on the second execution).
dataset_train = dataset_train.drop(columns=['Id'], errors='ignore')

In [None]:
# Names of the int64/float64 columns of the raw training data, in column order.
num_cols = [
    name for name in dataset_train.columns
    if dataset_train[name].dtype == 'int64' or dataset_train[name].dtype == 'float64'
]

In [None]:
# Display the list of numeric column names.
num_cols

In [None]:
# Pairwise correlation matrix of the numeric columns (SalePrice included).
dataset_train[num_cols].corr()

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
dataplot = sns.heatmap(
    data=dataset_train[num_cols].corr(),
    cmap='YlGnBu',
)

In [None]:
# Pairwise scatter plots of SalePrice against a hand-picked set of features.
sns.set()
cols = [
    'SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars',
    'TotalBsmtSF', 'FullBath', 'YearBuilt',
]
sns.pairplot(dataset_train[cols], height=2.5)
plt.show()

In [None]:
# Feature matrix from the encoded training frame and target vector from the raw training data,
# both as NumPy arrays.
X = final_df_train.to_numpy()
y = dataset_train['SalePrice'].to_numpy()

# **Splitting dataset into training and validation sets**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# **Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply them to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# **Training the model**

In [None]:
# import xgboost
# regressor = xgboost.XGBRegressor()
# regressor.fit(X_train, y_train)

In [None]:
# from sklearn.tree import DecisionTreeRegressor
# regressor = DecisionTreeRegressor(random_state = 0)
# regressor.fit(X_train, y_train)

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
# regressor.fit(X_train, y_train)

In [None]:
# Fit a CatBoost regressor with its default hyperparameters on the scaled training split.
from catboost import CatBoostRegressor
regressor = CatBoostRegressor()
regressor.fit(X_train, y_train)

# **Predicting Validation set**

In [None]:
# Predict sale prices for the held-out validation split.
# NOTE(review): `y_pred` is not compared against `y_test` anywhere in the cells shown here.
y_pred = regressor.predict(X_test)

# **Applying K-fold cross validation**

`cross_val_score` has no built-in scoring option for the **log root mean squared error** used here, so we create a custom scorer with the help of **make_scorer**.

In [None]:
def log_RMSE(y_true, y_pred):
    """Root mean squared error between the natural logs of targets and predictions.

    Args:
        y_true: observed values (array-like of positive numbers).
        y_pred: predicted values (array-like of positive numbers), same length as ``y_true``.

    Returns:
        The square root of the mean squared difference of ``log(y_true)`` and ``log(y_pred)``.
    """
    actual_log = np.log(y_true)
    squared_gap = (actual_log - np.log(y_pred)) ** 2
    return (np.sum(squared_gap) / len(actual_log)) ** 0.5

In [None]:
from sklearn.metrics import make_scorer

# Wrap log_RMSE as a scikit-learn scorer. With greater_is_better=False the scorer
# sign-flips the metric, so the scores it reports are negative log-RMSE values.
scorer = make_scorer(log_RMSE, greater_is_better = False)

In [None]:
# Imported here because no earlier cell brings cross_val_score into scope.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the regressor on the training split.
scores = cross_val_score(regressor, X = X_train, y = y_train, cv=10, scoring=scorer)

# The scorer was built with greater_is_better=False, so `scores` holds negated log-RMSE
# values. Flip the sign back and report the error itself: it is neither an accuracy nor a percentage.
cv_log_rmse = -scores
print("Mean log RMSE: {:.4f}".format(cv_log_rmse.mean()))
print("Standard Deviation: {:.4f}".format(cv_log_rmse.std()))

# **Repeating the training-set steps on the test dataset**

In [None]:
# Encoded test features as a NumPy array.
test_X = final_df_test.to_numpy()

In [None]:
# Scale the test features with the scaler fitted on the training split.
# NOTE: this overwrites test_X, so re-running only this cell would scale the data a second time.
test_X = sc.transform(test_X)

In [None]:
# Predict sale prices for the competition test set.
y_predict = regressor.predict(test_X)

# **Getting the submission file**

In [None]:
# Read the sample submission (used for its 'Id' column) from the same Kaggle input
# directory as train.csv and test.csv, rather than from the current working directory.
temp = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

In [None]:
# Wrap the test-set predictions in a DataFrame so they can be joined with the Ids.
pred = pd.DataFrame(y_predict)

In [None]:
# Place the Ids and the predictions side by side; concat with axis=1 aligns the two on their row index.
# NOTE(review): assumes the sample submission lists Ids in the same order as test.csv — confirm.
output = pd.concat([temp['Id'],pred],axis=1)

In [None]:
# Name the two columns of the submission frame.
output.columns = ['Id', 'SalePrice']

In [None]:
# Display the submission frame before writing it out.
output

In [None]:
# Write the submission file to the working directory without the DataFrame index column.
output.to_csv('output_catboost.csv', index= False)