In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset.
# Raw string literal: the Windows path contains backslashes that Python would
# otherwise try to interpret as (invalid) escape sequences.
df=pd.read_csv(r'F:\Machine Learning\Projects\houseRent/housing_train.csv')
df.head()

In [None]:
df.shape

In [None]:
#to check if any missing value is present or not
df.isnull().values.any()

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
## remove those rows wherever we have lat & long as missing value

df.dropna(axis='index',how='all', subset=['lat','long'],inplace=True)

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
### now we have huge no. of missing values in laundry_options & parking_options

In [None]:
df.dtypes

#### how to deal with missing values of laundry_options & parking_options
    1. Drop the rows that contain missing values; this discards information, so it is rarely the best approach.
    2. Fill missing values with a statistic such as the mean, median, or mode (mode for categorical features); when many values are missing, this distorts the distribution of the data.
    3. Fill missing values in a way that does not affect the distribution of the feature, i.e. use a more advanced approach such as Random Value Imputation.

#### Demonstrating the 2nd approach (mode imputation)

In [None]:
data=df.copy()

In [None]:
data['laundry_options'].isnull().sum()

In [None]:
data['laundry_options'].value_counts().plot(kind='bar')

In [None]:
data['laundry_options'].value_counts()

In [None]:
data['laundry_options'].mode()[0]

In [None]:
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the column selection: the in-place form acts on an intermediate object and is
# not guaranteed to update `data`.
data['laundry_options']=data['laundry_options'].fillna('w/d in unit')

In [None]:
data['laundry_options'].value_counts()

In [None]:
data['laundry_options'].value_counts().plot(kind='bar')

#### it means we cant use this approach, as it will impact distribution of data, lets use some smart approaches

### fill NA value using Random Value Imputation

#### Random Sample Imputation

Aim: Random sample imputation consists of taking random observation from the dataset and we use this observation to replace the nan values

When should it be used? 
It assumes that the data are missing completely at random(MCAR)

In [None]:
#To fetch a random sample

In [None]:
df['laundry_options'].dropna().sample()

In [None]:
df['laundry_options'].isnull().sum()

In [None]:
## draw one random observed value for every missing entry
## (the sample size is computed from the data instead of being hard-coded, so it
##  always matches the number of null rows; seeded so the draw is repeatable)
n_missing=int(df['laundry_options'].isnull().sum())
random_sample=df['laundry_options'].dropna().sample(n_missing,random_state=0)
random_sample

In [None]:
random_sample.index

In [None]:
df[df['laundry_options'].isnull()].index

In [None]:
random_sample.index=df[df['laundry_options'].isnull()].index

In [None]:
random_sample.index

In [None]:
random_sample

###  This means: wherever the 'laundry_options' column is null, assign the value of random_sample at that same index

In [None]:
df.loc[df['laundry_options'].isnull(),'laundry_options']=random_sample

In [None]:
df['laundry_options'].value_counts().plot(kind='bar')

In [None]:
#### Automate above stuffs
def impute_nan(df,variable,random_state=None):
    """Fill the missing values of one column with randomly drawn observed values.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is imputed.
    variable : str
        Name of the column to impute.
    random_state : int or None, optional
        Seed forwarded to ``Series.sample`` so the imputation can be repeated.
        ``None`` (the default) keeps the draw unseeded.

    Raises
    ------
    ValueError
        If the column has missing values but no observed value to draw from.
    """
    missing=df[variable].isnull()
    n_missing=int(missing.sum())
    if n_missing==0:
        # nothing to impute
        return

    observed=df[variable].dropna()
    if observed.empty:
        raise ValueError("column {!r} has no observed values to sample from".format(variable))

    # Sample with replacement only when there are more gaps than observed
    # values; sampling without replacement would raise in that situation.
    random_sample=observed.sample(n_missing,
                                  replace=n_missing>len(observed),
                                  random_state=random_state)

    # Assign positionally (as a plain array) so the result does not depend on
    # index alignment between the sampled rows and the missing rows.
    df.loc[missing,variable]=random_sample.to_numpy()


#### impute NA Values of parking_options

In [None]:
df['parking_options'].isnull().sum()

In [None]:
df['parking_options'].value_counts()/len(df)*100

In [None]:
## initial ratio between off-street parking & Carport
33/10

In [None]:
## imputing NaNs of parking_options
impute_nan(df,'parking_options')

In [None]:
## After imputing NaNs, checking the ratio between off-street parking & Carport
52/16

In [None]:
df['parking_options'].value_counts()/len(df)*100

In [None]:
df.isnull().sum()

In [None]:
#### now we doesn't have any missing values in our data

In [None]:
df.head()

In [None]:
df.columns

#### Perform Text Analysis on Description feature

In [None]:
# importing all necessery modules 
from wordcloud import WordCloud, STOPWORDS 

In [None]:
total_description = '' 

In [None]:
stopwords = set(STOPWORDS) 

In [None]:
## takes 1 min if considered first 10000 rows , if entire data it will consume your more time
# Lower-case every description, collapse runs of whitespace into single spaces
# and concatenate the results, each followed by one space.
# The text is built with a single join and assigned (not appended), so the work
# is linear and re-running the cell does not duplicate the text.
total_description = "".join(
    " ".join(str(val).lower().split()) + " "
    for val in df['description'][0:10000]
)

In [None]:
### Alternative to collect entire data of description_text

### ' '.join(df['description'][0:10000])


In [None]:
### generating your WordCloud
wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopwords, 
                min_font_size = 10).generate(total_description) 

In [None]:
# plot the WordCloud image                        
plt.figure(figsize = (8, 8)) 
plt.imshow(wordcloud) 
plt.axis("off") 


In [None]:
#### Listings mainly emphasize the swimming pool, tennis court, fitness center and pet-friendliness

In [None]:
### but it is hard to estimate over here which words wins by how much?? for this u can use plots/charts

In [None]:
from nltk.corpus import RegexpTokenizer as regextoken

In [None]:
df['description'].isnull().sum()

In [None]:
# Assign the filled column back; fillna(..., inplace=True) on a column
# selection is not guaranteed to update `df`.
df['description']=df['description'].fillna('no description')

In [None]:
df['description'].isnull().sum()

In [None]:
# Converting all the text to lowercase
df['description'] = df['description'].apply(lambda x: x.lower())

In [None]:
##  Creating a regular expression tokenizer that have only alphabets , ie remove all the special characters
# This will return separate words (tokens) from the text in the form of list
tokenizer = regextoken("[a-zA-Z]+") 

In [None]:
tokenizer

In [None]:
df['description'][0]

In [None]:
print(tokenizer.tokenize(df['description'][0]))

In [None]:
# seeded so the same 10000 rows are drawn on every run
sample=df.sample(10000,random_state=0)

In [None]:
sample.head()

In [None]:
## takes 1 mins
# Applying the tokenizer to each row of the reviews
sample_tokens = sample['description'].apply(tokenizer.tokenize)

In [None]:
sample_tokens.index

In [None]:
# Examining the tokens created for the first sampled row.
# Positional access is used because the row labels depend on which rows
# df.sample() happened to draw.
print(sample_tokens.iloc[0])

In [None]:
### now from this above list,we will figure out we have some stopwords, it means we have to remove these stopwords like an,and,it etc

In [None]:
from nltk.corpus import stopwords

In [None]:
# These are common words defined by Python developers that typically don't add meaning to the text and can be removed
stop = stopwords.words("english")
print(stop)

In [None]:
### with respect to very first row, how to remove stopwords
### (first row by position; a fixed row label is not guaranteed to be in the sample)
rev=sample_tokens.iloc[0]
print(rev)

In [None]:
print([token for token in rev if token not in stop])

In [None]:
len(sample_tokens)

In [None]:
## takes 3 mins
## remove stopwords from my entire data
##sample_tokens = sample_tokens.apply(lambda x: [token for token in x if token not in stop])

In [None]:
### using function

In [None]:
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words, in original order.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stop`` list.

    Returns
    -------
    list of str
        The remaining tokens.
    """
    if stop_words is None:
        stop_words = stop
    # set membership is O(1) per token; scanning the list for every token is
    # what makes the list-based version slow on long documents
    lookup = set(stop_words)
    return [token for token in text if token not in lookup]

In [None]:

sample_tokens=sample_tokens.apply(remove_stopwords)

In [None]:
type(sample_tokens)

In [None]:
len(sample_tokens)

In [None]:
indices=[i for i in range(0,10000)]

In [None]:
rev=pd.Series(data=sample_tokens.values,index=indices)
rev

In [None]:
# Concatenating all the reviews as I have to count frequency of each word as I have to plot which word has highest count
all_reviews = sample_tokens.astype(str).str.cat()


In [None]:
type(all_reviews)

In [None]:
### now I have to convert this string into list as u will figure out , data inside is in the form of list actually bcz our main
### goal is to compute frequency of each word,so to obtain your goal, very first u have to convert your data in the form of list

In [None]:
len(all_reviews)

In [None]:
## takes 1 min
## perform tokenization to convert your string(all_reviews) into list,so that we will count frequency of words
cleaned_reviews = tokenizer.tokenize(all_reviews)

In [None]:
len(cleaned_reviews)

In [None]:
type(cleaned_reviews)

In [None]:
# obtain the frequency of individual words in the reviews, for this u have to use FreqDist

In [None]:
from nltk import FreqDist, bigrams, trigrams

In [None]:
fd = FreqDist()

In [None]:
## takes 1 min
## checkout documentation by pressing Shift+Tab
# add one count per occurrence of every token to the running tally in `fd`
fd.update(cleaned_reviews)

In [None]:
# Examining the top 5 most frequent words
fd.most_common(5)

In [None]:
## takes 1 min
# Plotting the 20 most frequent words
plt.figure(figsize = (15, 8))
fd.plot(20)

In [None]:
#### Bi-grams

In [None]:
from nltk import bigrams

In [None]:
# Generating bigrams from the reviews.
# The nltk function is imported under an alias so that binding the result to
# `bigrams` does not stop this cell from being re-run, and the generator is
# materialised so the pairs can be iterated more than once.
from nltk import bigrams as nltk_bigrams
bigrams = [*nltk_bigrams(cleaned_reviews)]

In [None]:
## takes 
# Getting the bigram frequency distribution
fd_bigrams = FreqDist()
for bigram in bigrams:
    fd_bigrams[bigram]=fd_bigrams[bigram] + 1
# Examining the top 5 most frequent bigrams
fd_bigrams.most_common(5)

In [None]:
# Plotting the top 50 most frequent bigrams
plt.figure(figsize = (15, 8))
fd_bigrams.plot(50)


In [None]:
from nltk import trigrams

In [None]:
# Generating trigrams from the reviews.
# The nltk function is imported under an alias so that binding the result to
# `trigrams` does not stop this cell from being re-run, and the generator is
# materialised so the triples can be iterated more than once.
from nltk import trigrams as nltk_trigrams
trigrams = [*nltk_trigrams(cleaned_reviews)]

In [None]:
## takes 
fd_trigrams = FreqDist()
for trigram in trigrams:
    fd_trigrams[trigram] += 1

In [None]:
fd_trigrams.most_common(5)

In [None]:
plt.figure(figsize = (10, 5))
fd_trigrams.plot(50)


### perform Spatial Analysis to get a clear cut of where exactly higher priced houses are situated

In [None]:
import folium

In [None]:
from folium.plugins import HeatMap

In [None]:
# Create map with overall cases registered
m = folium.Map(zoom_start=2)
m

In [None]:
HeatMap(data=df[['lat', 'long','price']], radius=15).add_to(m)
# Show the map
m

In [None]:
df.columns

### Analyse Label distribution of data

In [None]:
# Features whose value counts are plotted in the cells below.
# NOTE(review): this assignment rebinds the builtin name `list` for the rest of
# the session; renaming it also requires updating the loop that iterates over it.
list=['beds',
       'baths', 'cats_allowed', 'dogs_allowed', 'smoking_allowed',
       'wheelchair_access', 'electric_vehicle_charge', 'comes_furnished']

In [None]:
def label_distribution(feature):
    """Draw a count plot of one column of the notebook-level ``df``.

    Parameters
    ----------
    feature : str
        Column of ``df`` whose value counts are plotted.

    Returns
    -------
    The object returned by ``sns.countplot``.
    """
    # Pass the values by keyword: a positional Series is taken as `data` by
    # current seaborn releases, which does not produce one bar per category.
    return sns.countplot(x=df[feature])

In [None]:
for i in list:
    #in this case,we have to first mention figure and then draw distribution
    plt.figure(figsize=(15,5))
    label_distribution(i)

In [None]:
df.columns

In [None]:
# Density of price, split by whether dogs are allowed (raw data, before outlier handling).
# kdeplot replaces the deprecated distplot(hist=False); the title now names the
# plotted variable (price, not income).
plt.figure(figsize=(12,8))
sns.kdeplot(x=df[df['dogs_allowed']==0]['price'],label="Price where pets are not allowed")
sns.kdeplot(x=df[df['dogs_allowed']==1]['price'],label="Price where  pets are allowed")
plt.legend()
plt.title("Price Distribution")

### dealing with outliers from data

In [None]:
imp_features=['price',
 'sqfeet',
 'beds',
 'baths']

#### Detect outliers using BoxPlot Approach

In [None]:
sns.boxplot(df['price'])

In [None]:
sns.stripplot(df['price'])
sns.boxplot(df['price'])

In [None]:

for feature in imp_features:
    plt.figure()
    sns.stripplot(df[feature])
    sns.boxplot(df[feature])

#### Detect Outliers using some statistical approaches

In [None]:
### using Q-Q plot, we will figure out whether we have outliers in our data or not

In [None]:
import statsmodels.api as sm 

In [None]:
sm.qqplot(df['price'],line='45')

In [None]:
## Automating stuffs
import statsmodels.api as sm 
def qq_plots(df,col):
    """Draw a normal Q-Q plot of one column on its own 10x4 figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : str
        Name of the column to plot.
    """
    # Create the axes explicitly and hand them to qqplot. Without `ax`, qqplot
    # opens a figure of its own and the figure created here stays empty.
    fig, ax = plt.subplots(figsize=(10, 4))
    sm.qqplot(df[col],line='45',ax=ax)
    ax.set_title("Normal QQPlot of {} ".format(col))
    


In [None]:
for feature in imp_features:    
    qq_plots(df,feature)

### WHAT NEXT??
    After detecting the outlier we should remove\treat the outlier 
    Outliers badly affect mean and standard deviation of the dataset. 
    It increases the error variance and reduces the power of statistical tests.
    Most machine learning algorithms do not work well in the presence of outlier. So it is desirable to detect and remove outliers.
    With all these reasons we must be careful about outlier and treat them before build a ML model. 
    There are some techniques used to deal with outliers.
     1. Deleting observations. This is usually not the preferred approach, because it loses information from the data.
        We delete outlier values only if they are due to a data entry error.
        
     2. Transforming values.
        Transforming variables can also eliminate outliers. These transformed values reduces the variation caused by extreme values.
        1. Scaling
        2. Log transformation
        3. Cube Root Normalization
        4. Box-Cox transformation

        * These techniques convert large values of the data into smaller values.
        * If the data has too many extreme values or is skewed, these methods help to make the data closer to normal.
        * However, these techniques do not always give the best results.
        * No data is lost when using these methods.
        * Among these methods, the Box-Cox transformation gives the best result.
        
        
     3. Imputation by using some statistical techniques to deal with outliers like Median , Z-Score , IQR , Robust Z-score
  

### Imputing Outliers using Statistical techniques

In [None]:
df.shape

In [None]:
data=df.copy()

In [None]:
data['price'].nlargest(400)

In [None]:
data['price'].median()

In [None]:
data['price'].mean()

In [None]:
### wherever price is > 5000, replace it with the median, because the median is not affected by outliers
data['price']=np.where(data['price']>5000,data['price'].median(),data['price'])

In [None]:
### Automate stuffs using function 
def deal_with_outliers(feature,threshold):
    """Overwrite values of ``data[feature]`` above ``threshold`` with the column median.

    Works on the notebook-level ``data`` frame and stores the result back in
    the same column; nothing is returned.
    """
    column = data[feature]
    is_outlier = column > threshold
    data[feature] = np.where(is_outlier, column.median(), column)

In [None]:
data['price'].mean()

In [None]:
data['price'].median()

In [None]:
##distrbution of price before Dealing with outliers
sns.distplot(df['price'])

In [None]:
## as it is almost Normally Distributed data, this data is suitable for your ML algo
sns.distplot(data['price'])

In [None]:
#### little bit right skewed

In [None]:
#as this is a right skewed,so we can perform log normal distribution

### Now for sqfeet 

In [None]:
data['sqfeet'].nlargest(200)

In [None]:
deal_with_outliers('sqfeet',5000)

In [None]:
sns.distplot(df['sqfeet'])

In [None]:
sns.distplot(data['sqfeet'])

In [None]:
#now for beds

In [None]:
deal_with_outliers('beds',999)

In [None]:
sns.boxplot(df['beds'])

In [None]:
sns.boxplot(data['beds'])

In [None]:
data['baths'].nlargest(50)

In [None]:
## before dealing with outliers
sns.boxplot(df['baths'])

In [None]:
## imputing your outliers
deal_with_outliers('baths',10)

In [None]:
## after dealing with outliers
sns.boxplot(data['baths'])

In [None]:
#now getting distribution of Each features
for feature in imp_features:
    plt.figure()#in this case,we have to first mention figure and then draw distribution
    sns.distplot(data[feature])

### Analyse Distribution of price where pets are allowed & pets are not allowed

In [None]:
# Density of price, split by whether dogs are allowed (after outlier handling).
# kdeplot replaces the deprecated distplot(hist=False); the title now names the
# plotted variable (price, not income).
plt.figure(figsize=(12,8))
sns.kdeplot(x=data[data['dogs_allowed']==0]['price'],label="Price where pets are not allowed")
sns.kdeplot(x=data[data['dogs_allowed']==1]['price'],label="Price where  pets are allowed")
plt.legend()
plt.title("Price Distribution")

### Automate above stuffs

In [None]:
def price_distribution(feature,label):
    """Plot the density of ``price`` for rows where ``feature`` is 0 and where it is 1.

    Reads the notebook-level ``data`` frame and draws both curves on a new
    12x8 figure with a legend.

    Parameters
    ----------
    feature : str
        Column of ``data`` compared against 0 and 1.
    label : str
        Text inserted into the two legend entries.
    """
    plt.figure(figsize=(12,8))
    # kdeplot replaces the deprecated distplot(hist=False)
    sns.kdeplot(x=data[data[feature]==0]['price'],label="Price where {} are not allowed".format(label))
    sns.kdeplot(x=data[data[feature]==1]['price'],label="Price where {} are allowed".format(label))
    plt.legend()
    # the plotted variable is price, not income
    plt.title("Price Distribution")

In [None]:
price_distribution('cats_allowed','pets')

#### From the above plot we could say that price of house is not varying in case of having pets or not !

In [None]:
df.columns

In [None]:
df['electric_vehicle_charge'].unique()

In [None]:
# legend label now matches the feature being compared (comes_furnished)
price_distribution('comes_furnished','furnished units')

#### From the above plot we could say that price of house is not varying in case of furnished & not furnished houses!

In [None]:
price_distribution('electric_vehicle_charge','electric_vehicle_charge')

#### From the above plot we could say that price of house is higher if it has a facility of charging electric_vehicle

#### Relationship between area & Price

In [None]:
import plotly.express as px
fig=px.scatter(data, x="price", y="sqfeet")
fig.show()

In [None]:
#### There is a complex relationship between Price & sqfeet it means your Linear Regression algo doesnt perform better
### u have to use some ensemble algos to predict price that will definitely perform better !

In [None]:
# Correlate numeric/boolean columns only; `data` still holds text columns,
# on which DataFrame.corr() raises in recent pandas releases.
data.select_dtypes(include=['number','bool']).corr()

In [None]:
## highlighting results
## (numeric/boolean columns only; text columns make DataFrame.corr() raise in recent pandas)
data.select_dtypes(include=['number','bool']).corr().style.background_gradient(cmap='Reds')

In [None]:
##Higher Co-relation group

## sqfeet-- beds
## sqfeet-- baths
## dogs_allowed-- cats_allowed 

## it means we can drop beds, baths, cats_allowed

In [None]:
data.columns

In [None]:
dataframe=data.copy()

In [None]:
dataframe.drop(['id','url','region_url','beds','baths','cats_allowed','image_url','description','lat','long'],axis=1,inplace=True)

In [None]:
dataframe.dtypes

## get all the categorical features

In [None]:
# read the dtype from the same frame whose columns are being listed
cat_features=[feature for feature in dataframe.columns if dataframe[feature].dtype=='O']
cat_features

### check all the sub-categories in categorical features to check wht encoding technique we can apply


In [None]:
# count sub-categories in `dataframe`, the frame that is actually encoded below
for feature in cat_features:
    print('total diff features in {} are {}'.format(feature,len(dataframe[feature].unique())))

In [None]:
dataframe.shape

In [None]:
region_count=dataframe['region'].value_counts()
region_count

In [None]:
pd.set_option('display.max_rows',298)

In [None]:
region_count=dataframe['region'].value_counts()
region_count

In [None]:
### from above stats we will figure out location less than 500 count are higher in number , it means we will ignore these 
### entries as they are very less in number

In [None]:
len(region_count[region_count>500])

In [None]:
### now instead of 298 categories , we will consider only 141 locations

In [None]:
important=region_count[region_count>500].index
important

In [None]:
def remove(x):
    """Return ``x`` if it is one of the ``important`` labels, otherwise ``'other'``."""
    return x if x in important else 'other'
    
## alternative using lambda
## dataframe['region']=dataframe['region'].apply(lambda x:'other' if x not in code else x)

In [None]:
dataframe['region'].tail(100)

In [None]:
dataframe['region']=dataframe['region'].apply(remove)

In [None]:
dataframe['region'].tail(50)

In [None]:
len(dataframe['region'].unique())

In [None]:
#df['region']=np.where(df['region'].isin(important),df['region'],'other')   #it means
#if df[feature] is in code then we consider it otherwise we assign it a new label'other'

### lets Automate above stuffs

In [None]:
def get_stats(feature):
    """Return the value counts of ``dataframe[feature]``.

    As a side effect, pandas' ``display.max_rows`` option is raised or lowered
    to the number of rows in the result, so that the returned Series is shown
    in full.

    Parameters
    ----------
    feature : str
        Column of the notebook-level ``dataframe``.

    Returns
    -------
    pandas.Series
        Counts per distinct value, as produced by ``value_counts``.
    """
    count=dataframe[feature].value_counts()
    # Size the display limit from the Series being returned (not from a
    # different frame), keeping it at least 1.
    pd.set_option('display.max_rows',max(len(count),1))
    return count

In [None]:
get_stats('state')

In [None]:
dataframe.shape

In [None]:
def extract_imp_sub_categories(feature,threshold):
    """Return the labels of ``dataframe[feature]`` that occur more than ``threshold`` times.

    The result is the index of the filtered value counts.
    """
    frequencies = dataframe[feature].value_counts()
    keep_mask = frequencies > threshold
    return frequencies[keep_mask].index

In [None]:
sub_cat=extract_imp_sub_categories('state',2000)

In [None]:
sub_cat

In [None]:
dataframe['state']=dataframe['state'].apply(lambda x:'other' if x not in sub_cat else x)

In [None]:
dataframe['state'].nunique()

In [None]:
get_stats('type')

In [None]:
imp2=extract_imp_sub_categories('type',3000)
imp2

In [None]:
dataframe['type']=dataframe['type'].apply(lambda x:'other' if x not in imp2 else x)

In [None]:
for feature in cat_features:
    print('total diff features in {} are {}'.format(feature,len(dataframe[feature].unique())))

### apply Frequency or count encoding on Region col as still have many sub-categories in this col

In [None]:
dictionary=dict(dataframe['region'].value_counts())
dictionary

In [None]:
dataframe['region']=dataframe['region'].map(dictionary)

In [None]:
dataframe['region']

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le=LabelEncoder()

In [None]:
# One fitted encoder per column. Refitting a single shared encoder would keep
# only the last column's mapping, so earlier columns could not be decoded.
label_encoders={}
for feature in ['laundry_options','parking_options','type']:
    label_encoders[feature]=LabelEncoder()
    dataframe[feature]=label_encoders[feature].fit_transform(dataframe[feature])

### Perform frequency encoding on the state feature, as it has multiple sub-categories, using the CountEncoder class

In [None]:
##!pip install category_encoders

In [None]:
from category_encoders import CountEncoder

In [None]:
dataframe['state'].head()

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
dataframe['state'].value_counts()

In [None]:
pd.set_option('display.max_rows',33)

In [None]:
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
dataframe['state'].value_counts()

In [None]:
ce=CountEncoder()

In [None]:
dataframe['state']=ce.fit_transform(dataframe['state'])

In [None]:
dataframe['state'].head()

In [None]:
dataframe.dtypes

In [None]:
dataframe.head()

In [None]:
y=dataframe['price']
x=dataframe.drop('price',axis=1)

In [None]:

#split dataset into train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=0)

In [None]:
from sklearn.tree import DecisionTreeRegressor
dt=DecisionTreeRegressor(random_state=0)
dt.fit(x_train,y_train)



In [None]:
y_pred=dt.predict(x_test)

In [None]:
y_pred

In [None]:
#predict how our model is
from sklearn.metrics import r2_score
r2=r2_score(y_test,y_pred)
r2

In [None]:
#fit Regression models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
### regression models to compare
# the tree-based models are seeded so repeated runs give the same scores
models = [
    ('LinearRegression', LinearRegression()),
    ('RandomForest', RandomForestRegressor(random_state=0)),
    ('Decision Tree', DecisionTreeRegressor(random_state=0)),
    ('KNN', KNeighborsRegressor(n_neighbors = 5)),
]

In [None]:
# Fit every model and report its R^2 on the x_test dataset (takes 3 min)
from sklearn.metrics import r2_score

for name, model in models:
    print(name)
    model.fit(x_train, y_train)

    # Make predictions.
    predictions = model.predict(x_test)

    # Compute the score. r2_score expects (y_true, y_pred); the metric is not
    # symmetric, so swapping the arguments reports a different number.
    print(r2_score(y_test, predictions))

    print('\n')

In [None]:
## RF Performs best

In [None]:
dataframe.shape

In [None]:
final=dataframe[0:10000]

In [None]:
dep=final['price']
ind=final.drop('price',axis=1)

In [None]:
#split dataset into train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(ind,dep,train_size=0.8,random_state=0)

In [None]:
## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 4)]
# Number of features to consider at every split.
# 1.0 means "all features"; it replaces the string 'auto', which newer
# scikit-learn releases no longer accept for random forests.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 3)]
# Minimum number of samples required to split a node
min_samples_split = [ 5, 15, 100]
# Minimum number of samples required at each leaf node
#min_samples_leaf = [1, 5, 10]

In [None]:
# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}

In [None]:
random_grid

In [None]:
4*2*3*3

In [None]:
reg_rf=RandomForestRegressor()

In [None]:
# Random search of parameters, using 3 fold cross validation.
# n_iter is not set, so the search tries only the library-default number of
# sampled combinations rather than the whole grid; seeded so the same
# combinations are drawn on every run.
rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid, cv = 3, verbose=2, n_jobs = -1, random_state = 0)

In [None]:
### takes more than 15 mins if entire data , takes 4 min if sample
rf_random.fit(x_train,y_train)

In [None]:
rf_random.best_params_

In [None]:
prediction = rf_random.predict(x_test)

In [None]:
sns.distplot(y_test-prediction)

In [None]:
r2_score(y_test,prediction)