# **Predicting Location of rental property**

**Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report,confusion_matrix

**Reading in data**

In [None]:
# Load the raw rental listings (path is relative to the notebook's working directory)
data = pd.read_csv("../input/brasilian-houses-to-rent/houses_to_rent.csv")

# Preview the first rows to see how the columns were parsed
data.head()

**Data Cleaning**

In [None]:
# Inspect the raw labels of the two text-coded flag columns before recoding them
for flag_column in ("animal", "furniture"):
    print(data[flag_column].value_counts())

In [None]:
def data_cleanse(df):
    """Return a cleaned, fully numeric copy of the raw rental listings.

    Steps, in order:
      1. Recode the text flags ("furniture", "animal") and the placeholder
         values in "floor", "hoa" and "property tax" to digit strings.
      2. Strip the currency prefix (everything up to the last "$") and the
         thousands separators from the last five columns.
      3. Cast every remaining text column to ``int64``.
      4. Add ``PctTax`` = property tax / total.
      5. Drop the leftover CSV index column "Unnamed: 0" if it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings as read from the CSV. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the extra ``PctTax`` column.

    Raises
    ------
    ValueError
        If a text column still holds a value that cannot be parsed as an
        integer after the replacements above.
    """
    df = df.copy()

    # Replacing values with values that are easier to interpret and model friendly.
    # The result is assigned back to the column rather than calling
    # `df[col].replace(..., inplace=True)`: that pattern mutates a temporary
    # Series, which pandas warns about and which leaves `df` unchanged when
    # Copy-on-Write is enabled.
    replacements = {
        "furniture": {"not furnished": "0", "furnished": "1"},
        "animal": {"not acept": "0", "acept": "1"},
        "floor": {"-": "0"},
        "hoa": {"Sem info": "0", "Incluso": "0"},
        "property tax": {"Incluso": "0"},
    }
    for column, mapping in replacements.items():
        df[column] = df[column].replace(mapping)

    # Removing unnecessary characters.
    # NOTE(review): assumes the last five columns are the currency-formatted
    # text columns -- confirm against the CSV layout.
    for column in df.columns[-5:]:
        df[column] = df[column].apply(lambda x: "".join(x.split("$")[-1].split(",")))

    # Changing text columns to numeric. Both `object` and the dedicated string
    # dtype are checked so the conversion does not depend on how pandas chose
    # to store the text.
    for column in df.columns:
        is_text = (pd.api.types.is_object_dtype(df[column])
                   or pd.api.types.is_string_dtype(df[column]))
        if is_text:
            df[column] = df[column].astype("int64")

    # New feature detailing property tax proportion of total
    df["PctTax"] = df["property tax"] / df["total"]

    # Dropping unhelpful column (tolerate its absence, e.g. when the CSV was
    # read with index_col=0)
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    return df

In [None]:
# Build the cleaned frame from the raw data; `data` itself is left untouched
# because data_cleanse works on a copy
cleaned_df = data_cleanse(data)

# Preview the cleaned rows
cleaned_df.head()

In [None]:
# Summary statistics for every column of the cleaned frame
cleaned_df.describe()

**EDA**

In [None]:
sns.set_style("whitegrid")

# Visualizing rent distribution; the x-axis is clipped at 16,000
rent_ax = sns.histplot(data=cleaned_df, x="rent amount", kde=True)
rent_ax.set_xlim(0, 16000)
rent_ax.set_title("Rent Distribution")

In [None]:
# Visualizing numeric feature distributions by location
# One entry per panel: (column to plot, upper x-axis limit, panel title)
kde_panels = [
    ("rent amount", 20000, "Rent amount by location"),
    ("total", 30000, "Total amount by location"),
    ("fire insurance", 300, "Fire insurance by location"),
    ("property tax", 6000, "Property tax by location"),
]

plt.figure(figsize=(14,12))
for position, (column, x_max, title) in enumerate(kde_panels, start=1):
    plt.subplot(2,2,position)
    # `fill=True` is the supported spelling of the deprecated `shade=True`
    sns.kdeplot(x=cleaned_df[cleaned_df["city"] == 1][column],fill=True,label="City")
    sns.kdeplot(x=cleaned_df[cleaned_df["city"] == 0][column],fill=True,label="Out of City")
    plt.xlim(0,x_max)
    plt.title(title)
    plt.legend()

In [None]:
# Visualizing relationship between total price and area, one point cloud per location group
plt.figure(figsize=(10,8))
for city_flag, group_label in ((1, "City"), (0, "Out of City")):
    group_rows = cleaned_df[cleaned_df["city"]==city_flag]
    plt.scatter(x="total",y="area",alpha=0.4,
                edgecolors="white",data=group_rows,label=group_label)
# Clip both axes so the bulk of the listings stays readable
plt.xlim(0,20000)
plt.ylim(0,1000)
plt.xlabel("Total")
plt.ylabel("Area")
plt.legend()

In [None]:
# Compare group means (city vs. not city) for three features, one panel each
plt.figure(figsize=(12,4))

bar_panels = (
    ("PctTax", "Mean tax %"),
    ("total", "Mean total"),
    ("area", "Mean area"),
)
for panel_no, (feature, panel_title) in enumerate(bar_panels, start=1):
    plt.subplot(1,3,panel_no)
    sns.barplot(x="city",y=feature,data=cleaned_df)
    plt.title(panel_title)

plt.tight_layout()

**Modelling**

In [None]:
# Check column dtypes and non-null counts before modelling
cleaned_df.info()

Since each observation will be classified by whether or not it is in a city, we should first look at how balanced the target class is:

In [None]:
# Target vector: the `city` column (compared against 1 and 0 throughout the notebook)
y = cleaned_df["city"]

In [None]:
# Derive the slice labels from the value_counts index instead of hard-coding
# their order, so a label can never end up on the wrong class.
class_counts = y.value_counts()
class_names = {1: "City", 0: "Not City"}

plt.figure(figsize=(8,8))
plt.pie(class_counts.values,autopct="%.1f%%",
       explode=[0,0.1],
       labels=[class_names.get(label, str(label)) for label in class_counts.index],
       )
plt.title("Visualizing Class Imbalance")
plt.show()

Almost 90% of our observations are within a city. This imbalance will likely bias any model trained on this data towards the majority class and skew our results.

I will first train a model on the data in its current form. Then, I will compare the results to those obtained on over- and undersampled data.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Function to preprocess data
def prepro(df):
    """Split ``df`` into train/test sets and min-max scale the features.

    The target is the "city" column; every other column is a feature.
    The split is 70/30 with a fixed ``random_state`` so it is reproducible.

    The scaler is fitted on the training features only and then applied to
    the test features. Fitting it on the full data before splitting would
    leak the test set's min/max into training. As a consequence, scaled test
    values may fall slightly outside [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame containing a "city" column.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features, indexed like the matching target Series.
    y_train, y_test : pandas.Series
        Target values.
    """
    X = df.drop("city",axis=1)
    y = df["city"]

    # Split first so the scaler never sees the test rows
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=101)

    scaler = MinMaxScaler(feature_range=(0,1))
    # Keep the original row labels so X and y stay aligned by index as well
    # as by position
    X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X.columns,index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test),columns=X.columns,index=X_test.index)

    return X_train,X_test,y_train,y_test

In [None]:
# Baseline split: the full cleaned data with its original class balance
X_train,X_test,y_train,y_test = prepro(cleaned_df)

In [None]:
# Function to evaluate input data with a Logistic Regression model
def model_eval(X_train,X_test,y_train,y_test,balance):
    """Fit a logistic regression and report how it performs on the test split.

    Prints ``balance`` as a heading, prints the classification report, and
    draws the confusion matrix as a heatmap on the current matplotlib axes.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Matching target vectors.
    balance : str
        Short description of the dataset variant. Used as the printed heading
        and in the plot title.

    Returns
    -------
    None
    """
    logmod = LogisticRegression(solver="liblinear")
    logmod.fit(X_train, y_train)

    pred = logmod.predict(X_test)

    print(balance,":")
    cm = confusion_matrix(y_test,pred)
    # zero_division=0 reports 0 for a class that is never predicted (the same
    # value the default yields) without emitting an undefined-metric warning
    cr = classification_report(y_test,pred,zero_division=0)
    print(cr)
    ax = sns.heatmap(cm,annot=True,vmin=0,cmap="Blues",fmt="g",cbar=False)
    # confusion_matrix puts the true classes on rows and predictions on columns
    ax.set_xlabel("Predicted class")
    ax.set_ylabel("Actual class")
    ax.set_title(f"Confusion matrix ({balance})")

In [None]:
# Baseline: fit and evaluate the Logistic Regression model on the original, imbalanced split
model_eval(X_train,X_test,y_train,y_test,"Unbalanced")

As we can see above, our model has an accuracy of 87%. This sounds fairly good until we see that every prediction is for one class: our overrepresented city class. We will now try balancing the classes.

**Dealing with class imbalances**

I will first use undersampling:

In [None]:
# Size of the smallest class, i.e. how many rows to draw from each class when undersampling
min_class = cleaned_df["city"].value_counts().min()

In [None]:
# Display the minority-class size that will be drawn from each class
min_class

In [None]:
# Create list of two dataframes, one where city = 1 and one where city = 0
under_sample = [cleaned_df.query("city == 1"), cleaned_df.query("city == 0")]

# Extract equal number of samples from the list for each class.
# A fixed random_state makes the draw (and every metric computed from it)
# reproducible across kernel restarts.
equal_samples = [f.sample(min_class, random_state=101) for f in under_sample]

# Concatenate dataframes together
eqdf = pd.concat(equal_samples,axis=0)

In [None]:
# Shuffle the rows so the two classes are interleaved; seeded for reproducibility
eqdf = eqdf.sample(frac=1,random_state=101).reset_index(drop=True)

# Label each slice from the value_counts index so the chart is readable on its own
balanced_counts = eqdf["city"].value_counts()
class_names = {1: "City", 0: "Not City"}

plt.figure(figsize=(8,8))
plt.pie(balanced_counts,autopct="%.1f%%",explode=[0,0.03],
        labels=[class_names.get(label, str(label)) for label in balanced_counts.index])
plt.title("Classes Balanced With Undersampling")
plt.show()

Now we have balanced classes

In [None]:
# Preview the shuffled, undersampled frame
eqdf.head()

In [None]:
# Split and scale the undersampled data; this rebinds the train/test variables
# that previously held the unbalanced split
X_train,X_test,y_train,y_test = prepro(eqdf)

In [None]:
# Fit and evaluate the Logistic Regression model on the undersampled split
model_eval(X_train,X_test,y_train,y_test,"balanced undersampled")

Now let's try oversampling our minority class

In [None]:
# Class sizes in the full cleaned data (reference for how many minority rows to draw)
cleaned_df["city"].value_counts()

In [None]:
# Majority class (city == 1) is kept as-is
reg_samp = cleaned_df[cleaned_df["city"] == 1]

# Draw minority rows with replacement until they match the majority count.
# The count is derived from the data rather than hard-coded, and the draw is
# seeded for reproducibility.
oversample_min = cleaned_df[cleaned_df["city"] == 0].sample(len(reg_samp),replace=True,random_state=101)

# NOTE(review): oversampling happens before the train/test split, so copies of
# the same minority row can land in both sets and inflate test metrics.
osdf = pd.concat([oversample_min,reg_samp],axis=0)

In [None]:
# Split and scale the oversampled data, then fit and evaluate the Logistic
# Regression model on it. The train/test variables set here are the ones the
# later model-comparison cells use.
X_train,X_test,y_train,y_test = prepro(osdf)
model_eval(X_train,X_test,y_train,y_test,"balanced oversampled")

It looks as though there is very little difference in model performance between our two balanced datasets.

Now to run a series of models on our oversampled data:

In [None]:
# Candidate classifiers, all trained on the current (oversampled) training split.
# The random forest is seeded so its scores are reproducible across runs.
models = {
        "logmod":LogisticRegression(),
         "RFClassifier":RandomForestClassifier(random_state=101),
         "KNN":KNeighborsClassifier(),
         "SVM":SVC()
         }

# Fit each model in place; the fitted estimators stay in `models` for later cells
for key,value in models.items():
    value.fit(X_train,y_train)
    print(key+(" trained"))

In [None]:
# Print a per-class classification report on the test split for every fitted model
for model_name, fitted_model in models.items():
    test_predictions = fitted_model.predict(X_test)
    print(model_name)
    print("")
    print(classification_report(y_test,test_predictions))


In [None]:
# Random Forest Confusion Matrix
# Reuse the forest already fitted in `models` instead of training a second,
# independently randomised one, so this matrix describes the same model as
# the classification report printed for "RFClassifier".
rfm = models["RFClassifier"]

preds = rfm.predict(X_test)

ax = sns.heatmap(confusion_matrix(y_test,preds),vmin=0,cbar=False,fmt="g",cmap="Blues",annot=True)
# confusion_matrix puts the true classes on rows and predictions on columns
ax.set_xlabel("Predicted class")
ax.set_ylabel("Actual class")
ax.set_title("Random Forest confusion matrix (oversampled data)")