In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file under the Kaggle input directory,
# then print one path per line.
kaggle_input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition CSVs, relative to the notebook's working directory.
file_path_train = "../input/house-price-prediction-challenge/train.csv"
file_path_test = "../input/house-price-prediction-challenge/test.csv"

# Read both splits into DataFrames.
train_data = pd.read_csv(file_path_train)
test_data = pd.read_csv(file_path_test)

# Report the training set's (rows, columns) shape, then preview its first rows.
print(train_data.shape)
train_data.head()

In [None]:
# Report the (rows, columns) shape of the test set, then preview its first rows.
print(test_data.shape)
test_data.head()

In [None]:
# Pairwise correlation of the numeric training columns.
# pandas >= 2.0 no longer drops non-numeric (object) columns inside corr() — it
# raises instead — so the numeric/boolean columns are selected explicitly first.
# This form also works on older pandas versions.
train_data.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Show the dtype of each training column (as inferred by pd.read_csv).
train_data.dtypes

In [None]:
# Count the missing values in each training column.
train_data.isnull().sum()

In [None]:
# List the column labels of the training frame.
train_data.columns

In [None]:
# Discard rows whose price label is missing, then separate the label from the features.
target_col = "TARGET(PRICE_IN_LACS)"
X_full = train_data.dropna(subset=[target_col])
y = X_full[target_col]
X = X_full.drop(columns=[target_col])

In [None]:
#help(train_data.dropna)

In [None]:
# Object-dtype columns with at most 10 distinct values: few enough to one-hot encode.
categorical_cols = [
    col for col in X.columns
    if X[col].dtype == "object" and X[col].nunique() <= 10
]

# Columns stored as 64-bit integers or floats.
numerical_cols = [
    col for col in X.columns
    if X[col].dtype in ("int64", "float64")
]

# Restrict the training features and the test set to the same selected columns.
my_cols = categorical_cols + numerical_cols
X = X[my_cols]
print(X.shape)
X_test = test_data[my_cols]
print(X_test.shape)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Correlation heatmap of the numerical features only. X also carries the
# object-dtype categorical columns, which DataFrame.corr() rejects on
# pandas >= 2.0 (older versions silently dropped them), so correlate just the
# columns listed in numerical_cols.
plt.figure(figsize=(14,7))
corr = X[numerical_cols].corr()
ax = sns.heatmap(data=corr, annot=True)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_title("Correlation between numerical features");

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)

# Label each shape with the variable it actually describes (the second line
# reports the validation split, not a test set).
print("Shape of X_train : ",X_train.shape)
print("Shape of X_valid : ",X_valid.shape)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor

In [None]:
# Numerical columns: replace missing values with a constant
# (no fill_value is given, so the imputer's default applies).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' is passed so categories not seen during fit do not raise
# at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group through its own transformer: (name, transformer, columns).
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
def get_scores(learning_rate, n_estimators = 250):
    """Return the mean cross-validated MAE of an XGBoost pipeline.

    Builds a pipeline from the notebook-level ``preprocessor`` and an
    ``XGBRegressor`` with the given settings, then scores it with 5-fold
    cross-validation on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    learning_rate : float
        Passed to ``XGBRegressor(learning_rate=...)``.
    n_estimators : int, default 250
        Passed to ``XGBRegressor(n_estimators=...)``.

    Returns
    -------
    float
        Mean over the folds of the mean absolute error (sign flipped back to
        positive, so lower is better).
    """
    regressor = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', regressor),
    ])
    # The 'neg_mean_absolute_error' scorer yields negated MAE; negate it to get MAE.
    neg_mae_per_fold = cross_val_score(
        candidate, X_train, y_train,
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    return (-neg_mae_per_fold).mean()

In [None]:
# Mean CV error for learning rates 1*.01 through 10*.01, keyed by the learning rate.
results = {
    step * .01: get_scores(step * .01)
    for step in range(1, 11)
}

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the mean cross-validated MAE against the learning rate. The dict views are
# converted to lists so matplotlib receives plain sequences, and the axes are
# labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(list(results.keys()), list(results.values()), marker="o")
ax.set_xlabel("learning_rate")
ax.set_ylabel("Mean absolute error (cross-validated)")
ax.set_title("XGBRegressor: CV error vs. learning rate")
plt.show()

In [None]:
# Hyperparameters used for the final model.
# NOTE(review): these are hard-coded rather than derived from `results` — confirm
# they still match the outcome of the learning-rate sweep.
n_estimator_best = 250
learning_rate_best = .1

In [None]:
# Final model: the shared preprocessing followed by XGBoost with the chosen hyperparameters.
final_model = XGBRegressor(n_estimators=n_estimator_best, learning_rate=learning_rate_best)
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', final_model),
])

# Fit on all labelled rows (X, y) rather than only the X_train split.
my_pipeline.fit(X,y)

In [None]:
# Predict the target for the test features with the fitted pipeline.
y_pred_final = my_pipeline.predict(X_test)

In [None]:
# Write the test-set predictions to submission.csv as a single target column,
# without the row index.
submission_columns = {'TARGET(PRICE_IN_LACS)': y_pred_final}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)