In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
from matplotlib import pyplot as plt
from matplotlib import rcParams as rcP

In [None]:
# Read the raw housing listings and preview the first rows
df = pd.read_csv(filepath_or_buffer='../input/pune-house-data/Pune house data.csv')
df.head(n=5)

In [None]:
# Exploring the dataset: dimensions of the frame as a (rows, columns) tuple
df.shape

In [None]:
# Number of rows recorded for each area_type value
df.groupby('area_type')['area_type'].count()

In [None]:
# Number of rows recorded for each availability value
df.groupby('availability')['availability'].count()

In [None]:
# Number of rows recorded for each size value
# (bracket access is required: `df.size` is a DataFrame attribute, not the column)
df.groupby('size')['size'].count()

In [None]:
# Number of rows recorded for each site_location value
df.groupby('site_location')['site_location'].count()

In [None]:
# Drop the 'society' column, then preview the result
df = df.drop(columns=['society'])
df.head(n=5)

**Data Cleaning Process**

In [None]:
# Data cleaning
# Count the missing values in every column
df.isna().sum()

In [None]:
# Fill missing balcony / bath values with the column median, rounded down to a whole number
from math import floor

balcony_median = float(floor(df['balcony'].median()))
bath_median = float(floor(df['bath'].median()))

# One fillna call with a per-column mapping replaces the two separate assignments
df = df.fillna({'balcony': balcony_median, 'bath': bath_median})

# Count the missing values again to confirm both columns are now complete
df.isna().sum()

In [None]:
# Discard every row that still contains a missing value; the comment in the
# original cell notes these are few relative to the dataset size
df = df.dropna(axis='index', how='any')
df.isna().sum()

In [None]:
# Build the integer 'bhk' column from the leading token of each 'size' string
# (the text before the first space), then drop the original 'size' column
df = (
    df
    .assign(bhk=df['size'].map(lambda label: int(label.split(' ')[0])))
    .drop(columns=['size'])
)
df.groupby('bhk')['bhk'].count()

In [None]:

# total_sqft contains range values such as 1133-1384, so we first need a way
# to detect entries that are not a single plain number
def isFloat(x):
    """Return True if ``x`` can be converted with ``float()``, else False.

    Only the errors ``float()`` itself raises for bad input are treated as
    "not a float": ``ValueError`` (unparsable text), ``TypeError``
    (unsupported type such as ``None``) and ``OverflowError`` (an integer too
    large for a float). Anything else, e.g. ``KeyboardInterrupt``, propagates
    instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Show the rows whose total_sqft cannot be parsed as a single number
df.loc[~df['total_sqft'].map(isFloat)]

In [None]:
# Convert total_sqft entries to numbers: ranges become their midpoint,
# plain numbers are parsed directly, anything else becomes None
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``) or a ``"low - high"`` range
        (``"1133 - 1384"``). Non-string input is converted with ``str()``
        before parsing.

    Returns
    -------
    float or None
        The midpoint of a two-part numeric range, the parsed value of a plain
        number, or ``None`` when the entry cannot be interpreted.
    """
    text = str(x)
    tokens = text.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Exactly one '-' but not a clean "low - high" pair (e.g. "-5",
            # "1e-5", "abc - def"): fall through to the plain-number parse
            # instead of raising.
            pass
    try:
        return float(text)
    except ValueError:
        return None
    
# Store the converted values in 'new_total_sqft' and drop the raw 'total_sqft' column
df = (
    df
    .assign(new_total_sqft=df['total_sqft'].map(convert_sqft_to_num))
    .drop(columns=['total_sqft'])
)
df.head(n=5)

In [None]:
# Count the missing values per column (entries the conversion could not parse
# show up under new_total_sqft); the rows themselves are dropped in the next cell
df.isnull().sum()

In [None]:
# Drop the rows that contain any missing value and confirm none remain
df = df.dropna(axis='index', how='any')
df.isna().sum()

**Feature Engineering**

In [None]:
# Work on a copy so the cleaned frame `df` stays untouched
df1 = df.copy()

# The price column is in lakhs (per the dataset note in the original cell),
# so scale by 100000 before dividing by the area
df1['price_per_sqft'] = df1['price'].mul(100000).div(df1['new_total_sqft'])
df1.head(n=5)

In [None]:
# Number of distinct values in the 'site_location' column
locations = df['site_location'].unique().tolist()
print(len(locations))

In [None]:
# Strip leading/trailing whitespace from every location name
df1['site_location'] = df1['site_location'].map(str.strip)

# Row count per location, most frequent first
location_stats = (
    df1.groupby('site_location')['site_location']
    .count()
    .sort_values(ascending=False)
)
location_stats

In [None]:

# How many locations have 10 or fewer rows, next to the total number of distinct locations
print(
    len(location_stats.loc[location_stats.le(10)]),
    len(df1['site_location'].unique()),
)

In [None]:
# Relabel every location that occurs 10 times or fewer as 'other'
locations_less_than_10 = location_stats.loc[location_stats.le(10)]

# The location names are the index of the counts Series, so test membership against it
is_rare = df1['site_location'].isin(locations_less_than_10.index)
df1['site_location'] = df1['site_location'].where(~is_rare, 'other')
len(df1['site_location'].unique())

In [None]:

# Row count per 'availability' value, most frequent first
df1.groupby('availability')['availability'].count().sort_values(ascending=False)

In [None]:
# Collapse the infrequent availability values into a single 'Not Ready' label
dates = (
    df1.groupby('availability')['availability']
    .count()
    .sort_values(ascending=False)
)

# Every value with fewer than 10000 rows is relabelled.
# NOTE(review): 10000 is a hard-coded cutoff; presumably only the dominant
# "ready" label exceeds it -- verify against the counts shown above.
dates_not_ready = dates.loc[dates.lt(10000)]
is_not_ready = df1['availability'].isin(dates_not_ready.index)
df1['availability'] = df1['availability'].where(~is_not_ready, 'Not Ready')

len(df1['availability'].unique())

In [None]:
# Row count per 'area_type' value, most frequent first
df1.groupby('area_type')['area_type'].count().sort_values(ascending=False)

# The column has only a few distinct values, so it is left unchanged

In [None]:
# Independent copy of df1 without the price_per_sqft column
df2 = df1.copy().drop(columns=['price_per_sqft'])

In [None]:
# One-hot encode site_location and append the indicator columns to df2
# (no category is dropped here, unlike the availability and area_type encodings)
dummy_cols = pd.get_dummies(df2['site_location'])
df2 = pd.concat([df2, dummy_cols], axis=1)

In [None]:
# One-hot encode availability, leaving out the 'Not Ready' indicator column,
# and append the remaining indicator columns to df2
dummy_cols = pd.get_dummies(df2['availability']).drop(columns=['Not Ready'])
df2 = pd.concat([df2, dummy_cols], axis=1)

In [None]:
# One-hot encode area_type, leaving out the 'Super built-up  Area' indicator column
# (the label is spelled with two spaces and must match the data exactly)
dummy_cols = pd.get_dummies(df2['area_type']).drop(columns=['Super built-up  Area'])
df2 = pd.concat([df2, dummy_cols], axis=1)

In [None]:

# The original categorical columns are no longer needed once their indicator columns exist
df2 = df2.drop(columns=['area_type', 'availability', 'site_location'])
df2.head(n=10)

# I will release a second notebook that builds the prediction function using all algorithms; upvote this one if you like it.