# **Import the required packages**

In [None]:
#For data reading and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import matplotlib
#matplotlib.rcParams["figure.figsize"] = (20,10)

#For model building
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

#For evaluation
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


# **Read and analyze data from csv file**

In [None]:
# Load the Bengaluru house-price CSV from the Kaggle input directory into `df`.
df=pd.read_csv('/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv')
# Bare expression: displayed as the cell output when run in a notebook.
df

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

# **Checking NaN values**

In [None]:
# Number of missing values in each column.
df.isna().sum()

# **Visualizing data**

In [None]:
# Bar chart of the number of rows per area_type value.
sns.countplot( x='area_type', data=df, palette="icefire");

In [None]:
# Bar chart of price against area_type. Every row is passed as its own bar, so
# bars that share an area_type value are drawn on top of one another.
plt.bar(df.area_type,df.price,color=['salmon'])

In [None]:
# Scatter plot of price against society. x and y are passed by keyword because
# seaborn 0.12+ no longer accepts them as positional arguments.
sns.scatterplot(x=df['society'], y=df['price'])

In [None]:
# Correlation matrix of the numeric columns only: DataFrame.corr() raises on
# string columns in pandas 2.x, so non-numeric columns are filtered out first.
corr_mat=df.select_dtypes(include='number').corr()
# Annotated heatmap of the pairwise correlations.
sns.heatmap(corr_mat,annot=True)

# **Dropping the columns which won't influence our model much**

In [None]:
# (rows, columns) before any column is dropped.
df.shape


In [None]:
# Discard the three columns that are not used as model inputs.
df = df.drop(columns=['area_type', 'availability', 'society'])
df

In [None]:
# Missing values per column in the remaining columns.
df.isna().sum()

In [None]:
# Remove every row that has at least one missing value.
df=df.dropna()

In [None]:
# Re-check the per-column missing counts after dropna().
df.isna().sum()

In [None]:
# (rows, columns) after dropping incomplete rows.
df.shape

In [None]:
df.head()

# **Creating a column named BHK**

In [None]:
# BHK is the integer before the first space of the 'size' text.
df['BHK'] = df['size'].map(lambda size_label: int(size_label.split(' ')[0]))
df.head()

In [None]:
# 'size' is dropped once BHK has been derived from it.
df=df.drop('size',axis=1)

In [None]:
df.shape

In [None]:
# Distinct raw total_sqft values, to see which formats need cleaning.
df['total_sqft'].unique()

# **Creating a function for checking float values**

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, otherwise False.

    Only the conversion errors raised by ``float()`` are treated as "not a
    float"; anything else (e.g. ``KeyboardInterrupt``) propagates.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

#df['total_sqft']=df['total_sqft'].apply(is_float)

In [None]:
# First 10 rows whose total_sqft cannot be converted by float().
df[~df['total_sqft'].apply(is_float)].head(10)

# **Preprocessing total sq.feet column**

In [None]:
def preprocess_sqft(x):
    """Convert a raw ``total_sqft`` value to a float.

    Args:
        x: Raw value, typically a string such as ``"1200"`` or a range such
            as ``"1133 - 1384"``.

    Returns:
        The midpoint for a ``"low - high"`` range, ``float(x)`` for a plain
        number, or ``None`` when the value cannot be parsed (for example
        ``"34.46Sq. Meter"``).
    """
    parts = str(x).split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. "-5", "1e-3" or text with units);
            # fall through and try the value as a single number.
            pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
        
        

# Replace the raw total_sqft values with the result of preprocess_sqft.
df['total_sqft']=df['total_sqft'].apply(preprocess_sqft)

In [None]:
df.shape

In [None]:
df.total_sqft.head()


In [None]:
# Rows whose converted total_sqft still fails is_float.
df[~df['total_sqft'].apply(is_float)].head(10)

In [None]:
# Rows where total_sqft is null after conversion.
df[~df.total_sqft.notnull()]

In [None]:
# Number of null total_sqft values.
df.total_sqft.isnull().sum() 

In [None]:
# Drop the rows that contain any missing value.
df=df.dropna()

In [None]:
# Check which rows still have a null total_sqft after dropna().
df[~df.total_sqft.notnull()]

In [None]:
df.shape

# **Creating a column for price per sq.feet**

In [None]:
# Price per square foot: price scaled by 100000, then divided by the area.
# NOTE(review): the factor presumably converts price from lakhs to rupees - confirm.
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])
df.head()

In [None]:
# (rows, columns) after adding price_per_sqft.
df.shape

# **Preprocessing Location column**

In [None]:
# Trim surrounding whitespace so identical location names are counted together.
df.location = df.location.map(str.strip)
# Rows per location, most frequent first (value_counts' default order).
loc_stats = df.location.value_counts()
loc_stats

In [None]:
# Number of locations that have more than 10 rows.
len(loc_stats[loc_stats>10])

In [None]:
# Locations with 10 or fewer rows. The comparison must be <=10: the index of
# this Series is what the next step tests membership against when relabelling
# locations as 'other', so it has to hold the rare locations, not the frequent ones.
loc_stats_less_than_10=loc_stats[loc_stats<=10]

In [None]:
# Relabel as 'other' every location that appears in the index of
# loc_stats_less_than_10 (`in` on a Series tests its index labels).
df.location = df.location.map(
    lambda name: 'other' if name in loc_stats_less_than_10.index else name
)

In [None]:
# Location values that were not relabelled as 'other'.
df.location[df['location']!='other']

In [None]:
# Distinct location labels after relabelling.
df.location.unique()

In [None]:
# Sample of rows with less than 300 sq. ft. per BHK.
df[df.total_sqft/df.BHK<300].head()

In [None]:
# Remove the rows with less than 300 sq. ft. per BHK.
df = df[~(df.total_sqft/df.BHK<300)]

In [None]:
df

In [None]:
df.shape

In [None]:
# Distribution of price_per_sqft before outlier removal.
df.price_per_sqft.describe()

# **Removing outliers**

In [None]:
def remove_pps_outliers(df):
    """Keep, per location, the rows within one std of the mean price_per_sqft.

    For each ``location`` group, a row is kept when its ``price_per_sqft`` is
    greater than ``mean - std`` and at most ``mean + std``. The lower bound is
    exclusive, so a group whose std is 0 (e.g. a single row) is dropped whole.

    Args:
        df: DataFrame with ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame of the kept rows, ordered by location group, with a
        fresh 0..n-1 index. An input without groups yields an empty frame
        that keeps the input's columns.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        prices = subdf.price_per_sqft
        m = np.mean(prices)
        # np.std on a Series is the population standard deviation (ddof=0).
        st = np.std(prices)
        within_band = (prices > (m - st)) & (prices <= (m + st))
        kept_groups.append(subdf[within_band])
    if not kept_groups:
        # pd.concat rejects an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once rather than growing a DataFrame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
    
# Replace df with the per-location filtered rows and show the new shape.
df = remove_pps_outliers(df)
df.shape

In [None]:
def remove_bhk_outliers(df):
    """Drop rows priced below the mean of the next-smaller BHK in the same location.

    Within each ``location``, a row with ``BHK == n`` is dropped when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of that location's
    ``BHK == n - 1`` rows, provided that smaller group has more than 5 rows.

    Args:
        df: DataFrame with ``location``, ``BHK`` and ``price_per_sqft`` columns.

    Returns:
        A copy of ``df`` without the flagged rows; index labels are preserved.
    """
    # Collect labels in a list so they keep their original dtype (an empty
    # float ndarray grown with np.append would coerce them to float).
    labels_to_drop = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            smaller = bhk_stats.get(bhk - 1)
            # Only compare against a smaller-BHK group with enough samples.
            if smaller and smaller['count'] > 5:
                underpriced = bhk_df[bhk_df.price_per_sqft < smaller['mean']]
                labels_to_drop.extend(underpriced.index)
    return df.drop(index=labels_to_drop)

# Replace df with the BHK-filtered rows and show the new shape.
df = remove_bhk_outliers(df)
df.shape

In [None]:


# Histogram of price_per_sqft on an 8x6 inch figure.
plt.subplots(figsize=(8,6))
# rwidth=0.8 draws each bar at 80% of its bin width, leaving gaps between bars.
plt.hist(df.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [None]:
# Keep only rows whose bathroom count is less than BHK + 2.
df = df[df.bath<df.BHK+2]

In [None]:
df.shape

In [None]:
# Histogram of the bathroom counts that remain, using 20 bins.
plt.hist(df.bath,bins=20)
plt.xlabel('No. of bathrooms in a house')
plt.ylabel('No. of houses')

# **Converting 'Object' values of Location into Numerical form using One Hot Encoding**

In [None]:
# One indicator column per distinct location value.
dummies = pd.get_dummies(df.location)
dummies

In [None]:
# Append the indicator columns, leaving out the 'other' column.
df=pd.concat([df,dummies.drop('other',axis='columns')],axis='columns')

In [None]:
df.head()

In [None]:
# Drop the text column now that its indicator columns have been added.
df=df.drop('location',axis='columns')

In [None]:
df.shape

In [None]:
df.head()

# **Model building: Taking input as X and Y**

In [None]:
# Features: every column except the target and price_per_sqft. price_per_sqft
# is computed from price (price*100000/total_sqft), so keeping it alongside
# total_sqft would leak the target into the model inputs.
X=df.drop(["price","price_per_sqft"],axis=1)
# Target: the price column.
Y=df.price

# **Splitting the data into train and test sets**

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

# **Applying Random Forest Regressor model**

In [None]:
# Baseline: random forest regressor with scikit-learn's default hyperparameters.
# No random_state is passed here, so the fitted model can differ between runs.
rf_clf=RandomForestRegressor()
rf_clf.fit(X_train,Y_train)
rf_Y_pred=rf_clf.predict(X_test)
# score() on a regressor returns the R^2 of its predictions on the given data.
rf_score=rf_clf.score(X_test,Y_test)
print(rf_score)

In [None]:
# Mean score over 5 random 80/20 shuffle splits of the full X, Y.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
np.mean(cross_val_score(rf_clf, X, Y, cv=cv))

In [None]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
# Sweep i = 1..39: each step trains a forest with 10*i trees, max_depth=i and
# random_state=i, then prints its score on the held-out test split.
for i in range(1,40,1):
    print(f"With {i*10} estimators:")
    clf2=RandomForestRegressor(n_estimators=i*10,max_depth=i,random_state=i).fit(X_train,Y_train)
    # score() returns R^2. ".2f" prints two decimals; the previous ":2f" only
    # set a minimum field width and printed six decimals.
    print(f"Accuracy: {clf2.score(X_test,Y_test)*100:.2f}%")

In [None]:
# Refit with one fixed configuration: 110 trees, max_depth 11, random_state 11.
RF_clf2=RandomForestRegressor(n_estimators=110,max_depth=11,random_state=11)
RF_clf2.fit(X_train,Y_train)
RF2_Y_pred=RF_clf2.predict(X_test)
# R^2 of this model on the held-out test split.
RF2_score=RF_clf2.score(X_test,Y_test)
RF2_score

In [None]:
# Mean score over 5 random 80/20 shuffle splits of the full X, Y.
cv2 = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
np.mean(cross_val_score(RF_clf2, X, Y, cv=cv2))

In [None]:
# Mean squared error of the test-set predictions.
mean_squared_error(Y_test,RF2_Y_pred)

In [None]:
# Side-by-side table of the true test prices and the model's predictions.
df1=pd.DataFrame({'Actual': Y_test,'Predicted': RF2_Y_pred })
df1.head()

In [None]:
# Line plot of both columns against the DataFrame index.
df1.plot(figsize=(20,8),kind='line')

In [None]:
# Scatter of predicted vs. actual price with a fitted regression line.
sns.regplot(x='Actual',y='Predicted',data=df1)