# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load Dataset

In [None]:
# Read the Bengaluru house price CSV into the main working frame.
dataset_path = "../input/bengaluru-house-price-data/Bengaluru_House_Data.csv"
df = pd.read_csv(dataset_path)
df

# Missing values

## Find Null Value

In [None]:
# Per-column count of missing entries (Series indexed by column name).
df_missing_values = df.isna().sum()
df_missing_values

In [None]:
df.dtypes

## Find missing numeric columns

### select numerical columns


In [None]:
# Names of the int64/float64 columns of df.
numeric_part = df.select_dtypes(include=["int64", "float64"])
df_numeric_columns = numeric_part.columns
df_numeric_columns

### select missing numeric columns

In [None]:
# Numeric columns with at least one missing value, in their original order.
numeric_missing_mask = df_missing_values.loc[df_numeric_columns] > 0
columns_numeric_missing = numeric_missing_mask[numeric_missing_mask].index.tolist()
columns_numeric_missing

## Find missing categorical columns

### select categorical columns

In [None]:
# Names of the object-dtype (text) columns of df.
categorical_part = df.select_dtypes(include=["object"])
df_categorical_columns = categorical_part.columns
df_categorical_columns

### select missing categorical columns

In [None]:
# Categorical columns with at least one missing value, in their original order.
categorical_missing_mask = df_missing_values.loc[df_categorical_columns] > 0
columns_categorical_missing = categorical_missing_mask[categorical_missing_mask].index.tolist()
columns_categorical_missing

## Fill missing value

### make pipeline

In [None]:
# Numeric gaps are filled with the column mean.
mean_steps = [("imputer", SimpleImputer(strategy="mean"))]
numeric_value_mean_imputer = Pipeline(steps=mean_steps)

# Categorical gaps are filled with the most frequent value of the column.
mode_steps = [("imputer", SimpleImputer(strategy="most_frequent"))]
categorical_value_mode_imputer = Pipeline(steps=mode_steps)

### columns transform

In [None]:
# Route each group of incomplete columns to its imputer. Columns that are not
# listed here are not part of the transformer's output.
imputer_routes = [
    ("mean_imputer", numeric_value_mean_imputer, columns_numeric_missing),
    ("mode_imputer", categorical_value_mode_imputer, columns_categorical_missing),
]
preprocessing = ColumnTransformer(transformers=imputer_routes)

preprocessing

### clean missing value

In [None]:
# Fit both imputers on df and return the imputed values for the selected
# columns: numeric-missing columns first, then categorical-missing columns
# (the order of the transformers in `preprocessing`).
df_clean_null_value = preprocessing.fit_transform(df)
df_clean_null_value

In [None]:
# Wrap the imputed array in a DataFrame labelled with the imputed column names.
# - index=df.index keeps the rows aligned with df explicitly instead of
#   relying on df happening to carry a default 0..n-1 index.
# - infer_objects() restores numeric dtypes: mixing numeric and text columns
#   in one transformer yields an object-dtype array, which would otherwise
#   turn the numeric columns into object columns.
imputed_columns = columns_numeric_missing + columns_categorical_missing
df_missing_value_solve = pd.DataFrame(
    df_clean_null_value,
    columns=imputed_columns,
    index=df.index,
).infer_objects()
df_missing_value_solve

In [None]:
df_missing_value_solve.isnull().sum()

### update main data frame

In [None]:
# Write the imputed columns back into df in place; update() aligns on index
# and column labels and only overwrites with non-NA values.
df.update(df_missing_value_solve)

In [None]:
df.isnull().sum().sum()

In [None]:
df

# Delete garbage values

## availability - feature

In [None]:
df["availability"]

In [None]:
# Most frequent value of the availability column.
availability_counts = df["availability"].value_counts()
availability_most = availability_counts.idxmax()
availability_most

In [None]:
# Keep only the rows whose availability equals the most frequent value
# (`availability_most`) and renumber the index.
# A boolean mask replaces the earlier enumerate/drop loop, which shadowed the
# builtin `all` and only worked while the index labels matched row positions.
is_most_common = df["availability"] == availability_most
df = df[is_most_common].reset_index(drop=True)
df

In [None]:
df["availability"].value_counts()

In [None]:
df

## size - feature

In [None]:
df["size"]

In [None]:
# Keep only the rows whose size string ends in "BHK" and renumber the index.
# str.endswith is vectorised; na=False treats missing / non-string entries as
# "not BHK" (so they are dropped) instead of raising on the slice as the
# per-row loop did.
has_bhk_suffix = df["size"].str.endswith("BHK", na=False)
df = df[has_bhk_suffix].reset_index(drop=True)
df

In [None]:
le = LabelEncoder()

In [None]:
le.fit_transform(df["size"])

In [None]:
# Derive the bedroom count from the leading number of strings like "3 BHK".
# LabelEncoder is not suitable here: it sorts the labels as text
# ("1 BHK" < "10 BHK" < "2 BHK"), so its codes are neither the bedroom count
# nor correctly ordered, which misleads a linear model.
# A size value without a leading number makes astype raise instead of
# silently producing a wrong feature.
df["bhk"] = df["size"].str.extract(r"^\s*(\d+)", expand=False).astype("int64")
df = df.drop(["size"], axis=1)
df

In [None]:
# Convert the count-like columns to the smallest float dtype that fits.
for count_column in ("bath", "balcony"):
    df[count_column] = pd.to_numeric(df[count_column], downcast="float")

In [None]:
df

In [None]:
df.dtypes

In [None]:
# Remove the society column from the working frame.
df = df.drop(columns=["society"])
df

## total_sqft - feature

In [None]:
# Convert total_sqft to numbers and drop the rows that cannot be parsed as a
# single number; then renumber the index.
# errors="coerce" turns unparseable entries into NaN, which are filtered out
# (rows whose total_sqft is already missing are dropped as well).
# This replaces the chained assignment `df["total_sqft"][i] = ...`, which
# writes through a temporary object and is not guaranteed to modify df.
sqft_numeric = pd.to_numeric(df["total_sqft"], errors="coerce")
df = df.loc[sqft_numeric.notna()].assign(total_sqft=sqft_numeric)
df = df.reset_index(drop=True)
df

In [None]:
# Store total_sqft in the smallest float dtype that can hold its values.
sqft_as_float = pd.to_numeric(df["total_sqft"], downcast="float")
df["total_sqft"] = sqft_as_float
df

## Location - feature

In [None]:
location_all = df["location"].value_counts()
location_all

In [None]:
# Locations that occur in more than 20 rows.
is_frequent = location_all > 20
last_location = location_all[is_frequent].index

last_location

In [None]:
# Keep only the rows whose location is one of the frequent locations in
# `last_location`, then renumber the index.
# isin() performs the membership test directly; the earlier nested loop
# compared every row with every frequent location and counted mismatches,
# which is O(rows x locations) and hard to follow.
in_frequent_location = df["location"].isin(last_location)
loc_df = df[in_frequent_location]
loc_df = loc_df.reset_index(drop=True)
loc_df

In [None]:
df = loc_df

# Convert all features to numeric values

In [None]:
df

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
df.isnull().sum().sum()

In [None]:
# One-hot encode the remaining text columns so that every feature is numeric;
# columns that are already numeric are passed through unchanged.
dummy_df = pd.get_dummies(df)
dummy_df

In [None]:
pd.set_option('display.max_columns',None)

In [None]:
dummy_df

In [None]:
dummy_df

# Linear Regression - Start

## split data

In [None]:
# Target is the price column; features are every other column.
y = dummy_df["price"]
x = dummy_df.drop(columns="price")

for label, part in (("Shape of x: ", x), ("Shape of y: ", y)):
    print(label, part.shape)

## split data - train and test sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=51
)

for label, part in (
    ("Shape of x_train: ", x_train),
    ("Shape of x_test: ", x_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test: ", y_test),
):
    print(label, part.shape)

## Standard Scaler - feature scaling

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same scaling to the test features.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
x_train

## Linear Regression model training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()

In [None]:
lr.fit(x_train, y_train)

In [None]:
lr.coef_

In [None]:
lr.intercept_

In [None]:
x_test[0,:]

In [None]:
lr.predict([x_test[0]])

In [None]:
lr.predict(x_test)

In [None]:
y_test

## Model score (R²)

In [None]:
# For a regressor, score() returns the coefficient of determination (R²) on
# the given data, not a classification accuracy.
lr.score(x_test, y_test)

In [None]:
# The same R² score scaled by 100.
lr.score(x_test, y_test) * 100