In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the training split and preview 15 randomly chosen rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/house-price-prediction-challenge/train.csv')
df.sample(n=15)


In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the ADDRESS column before analysis.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell safe
# to re-run: a second execution no longer raises KeyError on the missing column.
df = df.drop(columns=["ADDRESS"], errors="ignore")

In [None]:
# Summary statistics of the training frame (after ADDRESS was dropped).
df.describe()

In [None]:
# Disabled code: the profiling report below is wrapped in a string literal, so
# nothing is imported or executed; the cell only evaluates to that string.
"""from pandas_profiling import ProfileReport
profile = ProfileReport(df,title='Pandas Profiling Report',explorative=True)
profile.to_widgets()"""

In [None]:
# Columns that the plotting loops below iterate over.
features = [
    'POSTED_BY',
    'UNDER_CONSTRUCTION',
    'RERA',
    'BHK_OR_RK',
    'READY_TO_MOVE',
    'RESALE',
]

In [None]:
# One bar chart per feature showing how many rows fall into each category.
sns.set_style('white')

for feature in features:
    # Count once and reuse for both axes.
    counts = df[feature].value_counts()

    plt.figure(figsize=(10,7))
    sns.barplot(x=counts.index, y=counts.values)

    plt.ylabel('Number of Samples')
    # Fixed: the original f-string had no braces, so every plot was labelled
    # with the literal text "feature" instead of the column name.
    plt.xlabel(f'{feature}', style = 'normal', size = 24)

    plt.xticks(rotation = 45, size = 12)
    plt.yticks(rotation = 45, size = 12)

    plt.title(f'Distribution of {feature}',color = 'black',fontsize=15)
    plt.show()

In [None]:
# One pie chart per feature showing the share of each category.
sns.set(style='whitegrid')

for feature in features:
    shares = df[feature].value_counts()
    plt.figure(figsize=(10, 5))
    plt.pie(
        shares,
        labels=shares.index,
        autopct='%1.2f%%',
        colors=['#E37383', '#FFC0CB'],
    )
    plt.title(f'Distribution of {feature}', color='black', fontsize=15)
    plt.axis('equal')
    plt.show()

In [None]:
# Violin plot of the target price for each category of every feature.
sns.set_style('darkgrid')
for feature in features:
    plt.figure(figsize=(10, 10))
    sns.violinplot(data=df, x=feature, y='TARGET(PRICE_IN_LACS)')

In [None]:
# Box plot of the target price for each category of every feature.
sns.set_style('darkgrid')
for feature in features:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df, x=feature, y='TARGET(PRICE_IN_LACS)')

In [None]:
# Distribution of the target price.
plt.figure(figsize=(20,15))
sns.set_style('white')
# Fixed: the title was missing its closing parenthesis ('TARGET(PRICE_IN_LACS').
sns.distplot(df['TARGET(PRICE_IN_LACS)'], color="#ff9999").set_title('TARGET(PRICE_IN_LACS)')

In [None]:
# Target price against square footage, coloured by UNDER_CONSTRUCTION.
sns.set_style('darkgrid')
plt.figure(figsize=(10, 5))

sns.scatterplot(
    data=df,
    x='TARGET(PRICE_IN_LACS)',
    y='SQUARE_FT',
    hue='UNDER_CONSTRUCTION',
    palette='tab20',
    linewidth=0,
)

In [None]:
# Target price against BHK_NO., coloured by UNDER_CONSTRUCTION.
sns.set_style('darkgrid')
plt.figure(figsize=(10, 5))

sns.scatterplot(
    data=df,
    x='TARGET(PRICE_IN_LACS)',
    y='BHK_NO.',
    hue='UNDER_CONSTRUCTION',
    palette='tab20',
    linewidth=0,
)

In [None]:
# Longitude/latitude scatter of the listings, coloured by target price.
sns.set_style('darkgrid')
df.plot.scatter(
    x='LONGITUDE',
    y='LATITUDE',
    alpha=0.4,
    c="TARGET(PRICE_IN_LACS)",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

In [None]:
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(12,10))
# Fixed: at this point df still holds non-numeric columns (they are only
# one-hot encoded in a later cell), and DataFrame.corr() raises on those in
# pandas >= 2.0. Selecting numeric/bool columns explicitly works on every
# pandas version and matches what older versions silently did.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True, linewidths = 2)
plt.show()

In [None]:
# One-hot encode the remaining non-numeric columns, dropping the first level of each.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Display the encoded training frame.
df

In [None]:
# 25th and 75th percentiles of the target price.
quantile1, quantile3 = np.percentile(a=df['TARGET(PRICE_IN_LACS)'], q=[25, 75])

In [None]:
# Show the first and third quartile of the target price.
print(quantile1,quantile3)

In [None]:

# Interquartile range of the target price (Q3 - Q1).
iqr = quantile3 - quantile1
print(iqr)

In [None]:
# Outlier fences: 1.5 * IQR below the first quartile and above the third.
lower_bound_val, upper_bound_val = (
    quantile1 - (1.5 * iqr),
    quantile3 + (1.5 * iqr),
)

In [None]:
# Show the lower and upper outlier fences computed from the IQR.
print(lower_bound_val,upper_bound_val)

In [None]:
  # Keep only rows whose target price is not outside the IQR fences.
  df_out = df.loc[
      ~(
          df['TARGET(PRICE_IN_LACS)'].gt(upper_bound_val)
          | df['TARGET(PRICE_IN_LACS)'].lt(lower_bound_val)
      )
  ]

In [None]:
# Display the frame that remains after the outlier filter.
df_out

In [None]:
# Separate the target column (y) from the predictor columns (X).
y = df_out['TARGET(PRICE_IN_LACS)']
X = df_out.drop('TARGET(PRICE_IN_LACS)', axis=1)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
# Fit an extra-trees model whose feature importances are inspected below.
# A fixed random_state makes the importances reproducible between runs; a later
# cell drops a hard-coded list of columns chosen from this ranking, so the
# ranking should not change from one execution to the next.
model = ExtraTreesRegressor(random_state=0)
model.fit(X,y)

In [None]:
# Raw importance scores of the fitted model, in the column order of X.
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the nine largest feature importances.
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
feat_importances.nlargest(9).plot.barh()
plt.show()

In [None]:
# Drop the target plus the six predictors excluded from the final model.
# Building X from df_out (rather than from X itself) makes this cell
# idempotent: the original `X = X.drop(...)` raised KeyError when re-run
# because the columns were already gone.
X = df_out.drop(columns=[
    'TARGET(PRICE_IN_LACS)',
    'POSTED_BY_Owner',
    'RERA',
    'UNDER_CONSTRUCTION',
    'READY_TO_MOVE',
    'RESALE',
    'BHK_OR_RK_RK',
])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Default random forest, plus the candidate tree counts 100, 200, ..., 1200.
regressor = RandomForestRegressor()

n_estimators = np.linspace(start=100, stop=1200, num=12).astype(int).tolist()
print(n_estimators)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
 #Randomized Search CV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)



In [None]:
# Randomized hyper-parameter search over `random_grid`:
# 10 sampled configurations, 5-fold CV, scored by negative mean squared error.
# Fixed: `rf` was created here but never used, while the search depended on
# `regressor` from an earlier cell. The search now uses the estimator built in
# this cell (an identical default RandomForestRegressor), so the cell is
# self-contained.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized search on the training split (target passed as a flat array).
rf_random.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Hyper-parameter combination selected by the randomized search.
rf_random.best_params_

In [None]:
# Best cross-validated score. The search was configured with
# scoring='neg_mean_squared_error', so this is a negated MSE (closer to 0 is better).
rf_random.best_score_

In [None]:
# Predict prices for the held-out split produced by train_test_split.
predictions=rf_random.predict(X_test)



In [None]:
# Distribution of the residuals (actual minus predicted) on the held-out split.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm the
# installed version still provides it.
sns.distplot(y_test-predictions)

In [None]:
# Actual vs. predicted prices on the held-out split.
# Axis labels added so the figure can be read without looking at the code.
plt.scatter(y_test,predictions)
plt.xlabel('Actual TARGET(PRICE_IN_LACS)')
plt.ylabel('Predicted TARGET(PRICE_IN_LACS)')

In [None]:
from sklearn import metrics
# Error metrics on the held-out split.
for label, score in (
    ('MAE:', metrics.mean_absolute_error(y_test, predictions)),
    ('MSE:', metrics.mean_squared_error(y_test, predictions)),
    ('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions))),
):
    print(label, score)

In [None]:
# Error metrics on the training split (to compare against the held-out metrics).
# The predictions and the MSE are computed once and reused; the original ran
# rf_random.predict(X_train) three times and mean_squared_error twice.
train_predictions = rf_random.predict(X_train)
train_mse = metrics.mean_squared_error(y_train, train_predictions)

print('MAE:', metrics.mean_absolute_error(y_train, train_predictions))
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))

In [None]:
# RMSE of three constant predictors on the training target: 0, 1 and the mean.
mean = np.mean(y_train)

# Fixed: the messages said "RMSLE", but the value is the square root of
# mean_squared_error on the raw target, i.e. RMSE (no logarithm is involved).
print(f"RMSE for predicting only 0: {round(np.sqrt(metrics.mean_squared_error(y_train, np.zeros(len(y_train)))), 5)}")
print(f"RMSE for predicting only 1: {round(np.sqrt(metrics.mean_squared_error(y_train, np.ones(len(y_train)))), 5)}")
print(f"RMSE for predicting the mean ({round(mean, 2)}): {round(np.sqrt(metrics.mean_squared_error(y_train, np.full(len(y_train), mean))), 5)}")

In [None]:
# RMSE of predicting each even constant 0, 2, ..., 138 on the training target.
const_rmses = {
    const: round(np.sqrt(metrics.mean_squared_error(y_train, np.full(len(y_train), const))), 5)
    for const in range(0, 140, 2)
}

xs = list(const_rmses)
ys = list(const_rmses.values())

# Line plot of RMSE per constant, with the minimum highlighted in red.
pd.DataFrame(ys, index=xs).plot(figsize=(15, 10), legend=None)
plt.scatter(min(const_rmses, key=const_rmses.get), min(ys), color='red')
plt.title("RMSE scores for constant predictions", fontsize=18, weight='bold')
plt.xticks(fontsize=14)
plt.xlabel("Constant", fontsize=14)
plt.ylabel("RMSE", rotation=0, fontsize=14)

In [None]:
# Zoomed-in scan: RMSE of predicting each even constant 60, 62, ..., 72.
const_rmses = {
    const: round(np.sqrt(metrics.mean_squared_error(y_train, np.full(len(y_train), const))), 5)
    for const in range(60, 74, 2)
}

xs = list(const_rmses)
ys = list(const_rmses.values())

# Line plot of RMSE per constant, with the minimum highlighted in red.
pd.DataFrame(ys, index=xs).plot(figsize=(15, 10), legend=None)
plt.scatter(min(const_rmses, key=const_rmses.get), min(ys), color='red')
plt.title("RMSE scores for constant predictions", fontsize=18, weight='bold')
plt.xticks(fontsize=14)
plt.xlabel("Constant", fontsize=14)
plt.ylabel("RMSE", rotation=0, fontsize=14);

The scan above shows that the constant prediction with the lowest RMSE is 66.

In [None]:
# Constant with the lowest RMSE in the scan above. Taking it from const_rmses
# (instead of hard-coding the value read off the plot) keeps it correct if the
# data or the train/test split changes.
best_const = min(const_rmses, key=const_rmses.get)

In [None]:
# RMSE obtained by always predicting `best_const` on the training target.
baseline_rmse = round(
    np.sqrt(metrics.mean_squared_error(y_train, np.full(len(y_train), best_const))),
    5,
)
print(f"RMSE for predicting the best possible constant on our data: {baseline_rmse}\n")

This is the best RMSE score that we can get with only a constant prediction on the training data (among the constants scanned above).
We therefore call it the best 'naive baseline'.
A model should at least perform better than this RMSE score.

In [None]:
# Load the competition test split and remove the ADDRESS column, as was done
# for the training data.
# Fixed: the original chained assignment `test_df=df = ...` also rebound `df`,
# silently replacing the training frame with the test data.
test_df = pd.read_csv('/kaggle/input/house-price-prediction-challenge/test.csv')
test_df = test_df.drop(columns=['ADDRESS'])

In [None]:
# One-hot encode the test frame with the same settings used for the training frame.
test_df = pd.get_dummies(data=test_df, drop_first=True)

In [None]:
# Display the encoded test frame.
test_df

In [None]:
# Keep exactly the columns the model was trained on, in the same order.
# Reindexing against X.columns replaces the hard-coded inplace drop: it is
# safe to re-run (the drop raised KeyError the second time), and it guarantees
# the test matrix matches the training matrix even if a dummy column is
# missing from the test data (such a column is filled with 0).
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [None]:
# Predict prices for the competition test set and wrap them in a DataFrame.
predictions_new = pd.DataFrame(data=rf_random.predict(test_df))

In [None]:
# Write the predictions to the submission file without the index column.
predictions_new.to_csv(path_or_buf='submission_rf.csv', index=False)

In [None]:
import pickle

In [None]:
# Serialize the fitted search object to disk.
# Fixed: the original opened the file and never closed it; the context manager
# closes the handle (and flushes the pickle) even if dump() raises.
with open('rf_regression_model.pkl', 'wb') as file:
    pickle.dump(rf_random, file)

In [None]:
# Disabled code: the model reload below is wrapped in a string literal, so it
# does not run. If enabled, only unpickle files from a trusted source.
"""with open('rf_regression_model.pkl', 'rb') as file:
    rf_random= pickle.load(file)"""