# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the competition's train and test CSV files into DataFrames
train_data_path = '../input/house-price-prediction-challenge/train.csv'
test_data_path = '../input/house-price-prediction-challenge/test.csv'

train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Loading the Datasets
This cell loads the training and test datasets from their respective CSV files using pandas.

# Displaying Basic Information about the Datasets
This cell displays basic information about the training and test datasets, such as the number of rows, columns, and the data types of each column.

In [3]:
# Displaying basic information about the datasets
# NOTE(review): DataFrame.info() prints its report and is documented to return
# None, so `train_info` / `test_info` are expected to hold None rather than the
# summary itself -- confirm nothing downstream relies on them.
train_info = train_data.info()
test_info = test_data.info()

In [4]:
# Peek at the leading rows of the training set to sanity-check columns and values
train_head = train_data.head(n=5)
train_head


# Visualization Setup
This cell sets up the visual preferences for the seaborn library, which will be used for data visualization throughout the notebook.

# Preview of Training Data
This cell provides a quick preview of the first few rows of the training dataset, giving an initial sense of the data structure and values.

In [5]:
# Setting up the visualisation preferences
# Applies seaborn's "whitegrid" style; the plotting cells below rely on this
# global setting rather than styling each figure individually.
sns.set(style="whitegrid")

# Exploratory Data Analysis: Target Variable Distribution
This cell performs an exploratory data analysis (EDA) on the target variable, 'TARGET(PRICE_IN_LACS)', by plotting its distribution. The histogram helps in understanding the spread and skewness of house prices.

In [6]:
# Exploratory Data Analysis (EDA)

# 1. Target Variable Distribution
# Histogram (50 bins) with a KDE overlay of the sale price column
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train_data['TARGET(PRICE_IN_LACS)'], kde=True, bins=50, ax=ax)
ax.set_title('Distribution of House Prices')
ax.set_xlabel('Price (in Lacs)')
ax.set_ylabel('Frequency')
plt.show()



# Exploratory Data Analysis: Numerical Features Distribution
This cell visualizes the distribution of key numerical features in the dataset, such as 'SQUARE_FT', 'LONGITUDE', and 'LATITUDE'. Histograms for these features provide insights into their range, central tendency, and spread.

In [7]:
# 2. Numerical Features Distribution
numerical_features = ['SQUARE_FT', 'LONGITUDE', 'LATITUDE']

# One histogram panel per feature, laid out side by side on a shared figure
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, feature in zip(axes, numerical_features):
    sns.histplot(train_data[feature], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()



# Exploratory Data Analysis: Categorical Features Distribution
This cell focuses on the distribution of various categorical features in the dataset, including 'POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', and others. Count plots for these features help in understanding their frequency and distribution in the dataset.

In [8]:
# 3. Categorical Features Distribution
categorical_features = ['POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK', 'READY_TO_MOVE', 'RESALE']

# Seven count plots placed on a 3x3 grid; the last two grid slots stay unused
fig = plt.figure(figsize=(15, 10))
for slot, feature in enumerate(categorical_features, start=1):
    ax = fig.add_subplot(3, 3, slot)
    sns.countplot(x=feature, data=train_data, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    # The freshly added subplot is the current axes, so this rotates its labels
    plt.xticks(rotation=45)

fig.tight_layout()
plt.show()



# Correlation Matrix Analysis
This cell creates a heatmap to visualize the correlation matrix of the numeric features in the dataset. The correlation matrix is crucial for identifying relationships between variables, which can inform feature selection and model building.

In [9]:
# 4. Correlation Matrix
# Only int64/float64 columns take part; text columns are excluded up front
numeric_data = train_data.select_dtypes(include=['int64', 'float64'])
correlations = numeric_data.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


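# Data Preprocessing: Feature Engineering
The next cell does not impute or drop missing values. Instead it engineers a new categorical feature, `SIZE_CATEGORY`, by bucketing `SQUARE_FT` into 'Small', 'Medium' and 'Large' using the 33rd and 66th percentiles of the training data. If the data turns out to contain missing values, strategies such as imputation or removal of the affected rows/columns should be applied before this step, depending on the nature and extent of the missing data.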

In [10]:
# Creating the Price Per Square Foot feature (disabled)
# train_data['PRICE_PER_SQFT'] = train_data['TARGET(PRICE_IN_LACS)'] * 100000 / train_data['SQUARE_FT']

# Bucket 'SQUARE_FT' into three size brackets whose cut points are the
# 33rd and 66th percentiles of the training distribution
square_ft_quantiles = train_data['SQUARE_FT'].quantile([0.33, 0.66])
small_threshold, medium_threshold = square_ft_quantiles[0.33], square_ft_quantiles[0.66]


def categorize_size(sqft):
    """Return 'Small', 'Medium' or 'Large' for a square-footage value.

    Values up to ``small_threshold`` are 'Small', values up to
    ``medium_threshold`` are 'Medium', and everything else is 'Large'.
    Both thresholds are read from the module-level globals at call time.
    """
    if sqft <= small_threshold:
        return 'Small'
    if sqft <= medium_threshold:
        return 'Medium'
    return 'Large'


train_data['SIZE_CATEGORY'] = train_data['SQUARE_FT'].map(categorize_size)

# Combining 'BHK_NO.' and 'BHK_OR_RK' into a single categorical feature (disabled)
# train_data['ROOM_TYPE'] = train_data['BHK_NO.'].astype(str) + '_' + train_data['BHK_OR_RK']

# Displaying the new features (disabled)
# train_data[['PRICE_PER_SQFT', 'SIZE_CATEGORY', 'ROOM_TYPE']].head()


In [11]:
# Separate the price target from the model inputs; the free-text 'ADDRESS'
# column is left out of the inputs, while engineered columns are kept
target = train_data['TARGET(PRICE_IN_LACS)']
features = train_data.drop(columns=['TARGET(PRICE_IN_LACS)', 'ADDRESS'])

In [12]:
# Collect the names of the object-typed (non-numeric) feature columns
object_typed_features = features.select_dtypes(include=['object'])
categorical_cols = object_typed_features.columns

# Show which columns were picked up
categorical_cols


In [13]:
# from sklearn.model_selection import GridSearchCV

# # Defining the parameter grid for hyperparameter tuning
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'min_samples_split': [2, 4, 6],
#     'min_samples_leaf': [1, 2, 3]
# }

# # Initializing the Grid Search with the Gradient Boosting Regressor and the parameter grid
# grid_search = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42), 
#                            param_grid=param_grid, 
#                            cv=3, 
#                            n_jobs=-1, 
#                            verbose=2)

# # Fitting the grid search to the data
# grid_search.fit(X_train, y_train)

# # Getting the best parameters
# best_params = grid_search.best_params_

# best_params


In [14]:
# Derive neighborhood (text before the first comma) and city (text after the
# last comma) from the 'ADDRESS' field
def _address_part(address, index):
    """Return the stripped comma-separated piece of *address* at *index*.

    Addresses that contain no comma yield 'Unknown'.
    """
    if ',' not in address:
        return 'Unknown'
    return address.split(',')[index].strip()


train_data['NEIGHBORHOOD'] = train_data['ADDRESS'].map(lambda address: _address_part(address, 0))
train_data['CITY'] = train_data['ADDRESS'].map(lambda address: _address_part(address, -1))

# Frequency tables used to eyeball inconsistencies in the extracted values
neighborhood_counts = train_data['NEIGHBORHOOD'].value_counts()
city_counts = train_data['CITY'].value_counts()

neighborhood_counts.head(10), city_counts.head(10)  # top 10 of each for a quick overview


In [15]:
# Clean and standardize the extracted 'CITY' and 'NEIGHBORHOOD' columns

# 'Maharashtra' is a state, not a city, so such rows are grouped under 'Other'
train_data['CITY'] = train_data['CITY'].replace('Maharashtra', 'Other')

# Trim leading/trailing whitespace from both location columns
for location_column in ('CITY', 'NEIGHBORHOOD'):
    train_data[location_column] = train_data[location_column].str.strip()

# Recount after cleaning to check for remaining oddities
cleaned_city_counts = train_data['CITY'].value_counts()
cleaned_neighborhood_counts = train_data['NEIGHBORHOOD'].value_counts()

cleaned_city_counts.head(10), cleaned_neighborhood_counts.head(10)  # top 10 of each for a quick overview


In [16]:
# EDA for the location-based features 'CITY' and 'NEIGHBORHOOD'
import matplotlib.pyplot as plt
import seaborn as sns

# Setting up visualization style
sns.set(style="whitegrid")


def _price_boxplot(frame, column, title, x_label):
    """Draw and show a box plot of house price grouped by *column* of *frame*."""
    plt.figure(figsize=(15, 6))
    sns.boxplot(x=column, y='TARGET(PRICE_IN_LACS)', data=frame)
    plt.xticks(rotation=45)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Price (in Lacs)')
    plt.show()


# Price spread per city
_price_boxplot(train_data, 'CITY', 'Distribution of House Prices Across Different Cities', 'City')

# Neighborhoods are numerous, so only the 10 most frequent ones are plotted
top_neighborhoods = train_data['NEIGHBORHOOD'].value_counts().head(10).index
filtered_data = train_data[train_data['NEIGHBORHOOD'].isin(top_neighborhoods)]

_price_boxplot(filtered_data, 'NEIGHBORHOOD', 'Distribution of House Prices in Top 10 Neighborhoods', 'Neighborhood')


In [17]:
# Derive neighborhood and city for the test set using the same rules as for the
# training set: first / last comma-separated piece, 'Unknown' when no comma
test_data['NEIGHBORHOOD'] = test_data['ADDRESS'].map(
    lambda address: 'Unknown' if ',' not in address else address.split(',')[0].strip()
)
test_data['CITY'] = test_data['ADDRESS'].map(
    lambda address: 'Unknown' if ',' not in address else address.split(',')[-1].strip()
)

# Frequency tables for the test set (these rebind the names used for the training counts)
neighborhood_counts = test_data['NEIGHBORHOOD'].value_counts()
city_counts = test_data['CITY'].value_counts()

neighborhood_counts.head(10), city_counts.head(10)  # top 10 of each for a quick overview


# Clean and standardize the extracted 'CITY' and 'NEIGHBORHOOD' columns

# 'Maharashtra' is a state, not a city, so such rows are grouped under 'Other'
test_data['CITY'] = test_data['CITY'].replace('Maharashtra', 'Other')

# Trim leading/trailing whitespace from both location columns
for location_column in ('CITY', 'NEIGHBORHOOD'):
    test_data[location_column] = test_data[location_column].str.strip()

# Recount after cleaning to check for remaining oddities
cleaned_city_counts = test_data['CITY'].value_counts()
cleaned_neighborhood_counts = test_data['NEIGHBORHOOD'].value_counts()


In [18]:
from category_encoders import TargetEncoder

# Target-encode the two location columns: each encoder is fitted on the
# training rows and then reused, unchanged, on the test rows.
# NOTE(review): the encoders are fitted on every training row, and the
# train/validation split happens only afterwards, so the validation rows'
# own prices feed into their encoded values. Validation scores are likely
# optimistic; consider fitting the encoders on the training split only.
training_prices = train_data['TARGET(PRICE_IN_LACS)']
city_encoder, neighborhood_encoder = TargetEncoder(), TargetEncoder()

train_data['CITY_ENCODED'] = city_encoder.fit_transform(train_data['CITY'], training_prices)
train_data['NEIGHBORHOOD_ENCODED'] = neighborhood_encoder.fit_transform(train_data['NEIGHBORHOOD'], training_prices)

# Check the encoded columns
encoded_columns = ['CITY_ENCODED', 'NEIGHBORHOOD_ENCODED']
print(train_data[encoded_columns].head())

# Apply the already-fitted encoders to the test data (no refitting)
test_data['CITY_ENCODED'] = city_encoder.transform(test_data['CITY'])
test_data['NEIGHBORHOOD_ENCODED'] = neighborhood_encoder.transform(test_data['NEIGHBORHOOD'])

# Check the encoded columns in test data
print(test_data[encoded_columns].head())


In [19]:
# Build the modelling feature set around the encoded location columns and
# split it into training and validation parts

# Target variable
target = train_data['TARGET(PRICE_IN_LACS)']

# Raw text location columns and the target are removed from the inputs
features = train_data.drop(columns=['CITY', 'NEIGHBORHOOD', 'ADDRESS', 'TARGET(PRICE_IN_LACS)'])

# Make sure the encoded city and neighborhood features are part of the inputs
for encoded_column in ('CITY_ENCODED', 'NEIGHBORHOOD_ENCODED'):
    features[encoded_column] = train_data[encoded_column]

# 80/20 train/validation split with a fixed seed for reproducibility
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Verifying the shapes of the resulting data splits
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))


In [20]:
# One-hot encode every column listed in `categorical_cols` (the object-typed
# feature columns identified earlier) on the training split
one_hot_encoded_data = pd.get_dummies(X_train, columns=categorical_cols)

# Encode the validation split the same way, then force it onto the training
# column layout: dummy columns missing from validation are filled with 0 and
# columns unseen in training are discarded
one_hot_encoded_valid_data = pd.get_dummies(X_valid, columns=categorical_cols).reindex(
    columns=one_hot_encoded_data.columns,
    fill_value=0,
)

# Verifying the transformation
one_hot_encoded_data.head(), one_hot_encoded_valid_data.head()


In [21]:
# Flag 'SQUARE_FT' outliers with the Interquartile Range (IQR) rule

square_ft = train_data['SQUARE_FT']

# Quartiles and their spread
Q1 = square_ft.quantile(0.25)
Q3 = square_ft.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR outside the quartiles
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows falling outside either fence
below_fence = square_ft < lower_bound
above_fence = square_ft > upper_bound
outliers = train_data[below_fence | above_fence]

# Share of the training rows flagged as outliers, in percent
outlier_percentage = len(outliers) / len(train_data) * 100

# Displaying the results
lower_bound, upper_bound, outlier_percentage, outliers.head()


In [22]:
# Replace 'SQUARE_FT' with its natural log to dampen the influence of outliers

# Small offset so that a zero square footage does not produce log(0)
constant = 1e-3
train_data['SQUARE_FT_LOG'] = np.log(train_data['SQUARE_FT'] + constant)

# In the model matrices the raw column is swapped for the log column:
# pop() removes 'SQUARE_FT' and hands it to np.log in one step, so the
# redundant original feature does not remain alongside the new one
one_hot_encoded_data['SQUARE_FT_LOG'] = np.log(one_hot_encoded_data.pop('SQUARE_FT') + constant)
one_hot_encoded_valid_data['SQUARE_FT_LOG'] = np.log(one_hot_encoded_valid_data.pop('SQUARE_FT') + constant)

# Displaying the first few rows of the transformed data
one_hot_encoded_data[['SQUARE_FT_LOG']].head(), one_hot_encoded_valid_data[['SQUARE_FT_LOG']].head()


In [23]:
# Post-processing to handle negative predictions
# The lowest price seen in training serves as the floor for model predictions
observed_prices = train_data['TARGET(PRICE_IN_LACS)']
min_price = observed_prices.min()
print("Minimum price in training set:", min_price)

def handle_negative_predictions(predictions, floor=None):
    """Clamp predictions from below so that no predicted price is implausibly low.

    The original cell defined this function twice; the second definition
    shadowed the first and rescanned the training target on every call.
    This single definition uses the precomputed module-level ``min_price``.

    Parameters
    ----------
    predictions : array-like
        Raw model predictions.
    floor : float, optional
        Lowest value allowed in the result. Defaults to the module-level
        ``min_price`` (the minimum price observed in the training set).

    Returns
    -------
    numpy.ndarray
        ``predictions`` with every element below ``floor`` raised to ``floor``.
    """
    if floor is None:
        floor = min_price
    return np.maximum(predictions, floor)

# Example usage
# predictions = model.predict(X_test)
# corrected_predictions = handle_negative_predictions(predictions)

# Model Training: Gradient Boosting Regressor
The cell below trains a Gradient Boosting Regressor on the preprocessed training data and then evaluates its performance on the validation set.

# Model Evaluation: MSE and R2 Score
This cell evaluates the performance of the trained Gradient Boosting Regressor model using metrics such as Mean Squared Error (MSE) and R2 Score. These metrics provide insights into the accuracy and predictive power of the model.

In [25]:
# Retraining the Gradient Boosting Regressor with the optimized hyperparameters

# Hyperparameters for the final model, gathered in one place
optimal_gb_params = {
    'learning_rate': 0.2,
    'max_depth': 3,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300,
    'random_state': 42,
}
optimal_gb_model = GradientBoostingRegressor(**optimal_gb_params)

# Fit on the training split
optimal_gb_model.fit(one_hot_encoded_data, y_train)

# Predict on the validation split and clamp the predictions from below
optimal_y_pred = handle_negative_predictions(
    optimal_gb_model.predict(one_hot_encoded_valid_data)
)

# Validation metrics
optimal_mse = mean_squared_error(y_valid, optimal_y_pred)
optimal_r2 = r2_score(y_valid, optimal_y_pred)

(optimal_mse, optimal_r2)


In [26]:
# Creating the Price Per Square Foot feature (disabled)
# test_data['PRICE_PER_SQFT'] = test_data['TARGET(PRICE_IN_LACS)'] * 100000 / test_data['SQUARE_FT']

# Categorizing the 'SQUARE_FT' feature into size brackets.
# The brackets must mean the same thing at prediction time as they did when the
# model was trained, so the test rows are labelled with the `categorize_size`
# function and the `small_threshold` / `medium_threshold` cut points that were
# computed from the training data earlier in the notebook. Recomputing the
# quantiles on the test set would shift the bracket boundaries and would also
# overwrite those training-time globals.
test_data['SIZE_CATEGORY'] = test_data['SQUARE_FT'].apply(categorize_size)

# Combining 'BHK_NO.' and 'BHK_OR_RK' into a single categorical feature (disabled)
# test_data['ROOM_TYPE'] = test_data['BHK_NO.'].astype(str) + '_' + test_data['BHK_OR_RK']

# Displaying the new features (disabled)
# test_data[[ 'SIZE_CATEGORY', 'ROOM_TYPE']].head()


In [27]:
# Test-set model inputs: the raw text location columns are removed, mirroring
# what was done for the training features
features_test = test_data.drop(columns=['CITY', 'NEIGHBORHOOD', 'ADDRESS'])

# Names of the object-typed (non-numeric) columns that still need encoding
object_typed_test_features = features_test.select_dtypes(include=['object'])
categorical_cols1 = object_typed_test_features.columns

# Show which columns were picked up
categorical_cols1


In [28]:
# One-hot encode the object-typed columns of the test features.
# NOTE(review): the dummy columns here are derived from the test rows alone and
# are not reindexed against the training columns (unlike the validation set);
# confirm the resulting layout matches what the model was fitted on.
one_hot_encoded_data_test = pd.get_dummies(features_test, columns=categorical_cols1)

In [29]:
# Swap 'SQUARE_FT' for its natural log in the test matrix, using the same small
# offset as for the training data so that log(0) cannot occur
constant = 1e-3
# pop() removes the raw column and returns it for the transformation
one_hot_encoded_data_test['SQUARE_FT_LOG'] = np.log(one_hot_encoded_data_test.pop('SQUARE_FT') + constant)

# Saving Predictions to CSV
This cell demonstrates how to save the model's predictions into a CSV file. This is useful for further analysis or submission in case of a machine learning competition.

In [30]:
# Put the test matrix on exactly the column layout the model was trained on:
# dummy columns absent from the test set are added as 0, columns the model
# never saw are discarded, and the order follows the training matrix.
aligned_test_features = one_hot_encoded_data_test.reindex(
    columns=one_hot_encoded_data.columns,
    fill_value=0,
)

# Predict on the test set and apply the same lower price bound that was used
# for the validation predictions
optimal_y_pred_test = handle_negative_predictions(
    optimal_gb_model.predict(aligned_test_features)
)

# Save the TEST predictions; the original wrote `optimal_y_pred`, which holds
# the validation-set predictions, so the file did not contain test results.
optimal_y_pred_df = pd.DataFrame(optimal_y_pred_test, columns=['Predicted_Price'])
optimal_y_pred_df.to_csv('optimal_y_pred.csv', index=False)
print('Predictions saved to optimal_y_pred.csv')