In [2]:
# Importing Libraries : 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [4]:
# Load dataset 
df = pd.read_csv('houses_to_rent.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,0,1,240,3,3,4,-,acept,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,1,0,64,2,1,1,10,acept,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,2,1,443,5,5,4,3,acept,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"


In [10]:
# Let's drop Unnamed:0 because we have twice index. 
df = df.drop('Unnamed: 0', axis=1)

In [12]:
# Understanding the basic overview of data : 
print(f"No.of Rows = {df.shape[0]}, No of Columns = {df.shape[1]}")
print(f"Basic Information of the columns :\n {df.info()}")
print(f"Descriptive statistics of numerical columns :\n {df.describe()}")
print(f"Null values : {df.isnull().sum()}")
print(f"Duplicated values : {df.duplicated().sum()}")

No.of Rows = 6080, No of Columns = 13
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6080 entries, 0 to 6079
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   city            6080 non-null   int64 
 1   area            6080 non-null   int64 
 2   rooms           6080 non-null   int64 
 3   bathroom        6080 non-null   int64 
 4   parking spaces  6080 non-null   int64 
 5   floor           6080 non-null   object
 6   animal          6080 non-null   object
 7   furniture       6080 non-null   object
 8   hoa             6080 non-null   object
 9   rent amount     6080 non-null   object
 10  property tax    6080 non-null   object
 11  fire insurance  6080 non-null   object
 12  total           6080 non-null   object
dtypes: int64(5), object(8)
memory usage: 617.6+ KB
Basic Information of the columns :
 None
Descriptive statistics of numerical columns :
               city          area        rooms     bathr

###### From the basic information of the columns,we can conclude that we have to do
###### a lot of cleaning.
###### Some of the object columns has to be converted to numerical columns.
###### Encoding has to be done 
###### Dealing with duplicates


In [18]:
# Dealing with Duplicates : 
df = df.drop_duplicates()

In [20]:
df.shape

(5882, 13)

In [22]:
df.duplicated().sum()

0

In [25]:
# Encoding Categorical variables : animal	furniture
df['animal'].value_counts()

animal
acept        4535
not acept    1347
Name: count, dtype: int64

In [29]:
# Encoding animal : acept to 1, not accept to 0. 
df['animal'] = df['animal'].replace({'acept':1, 'not acept':0})

In [33]:
df['animal'].dtype

dtype('int64')

In [35]:
# Encoding Furniture : 
df['furniture'].value_counts()

furniture
not furnished    4349
furnished        1533
Name: count, dtype: int64

In [39]:
# Encoding furniture : furnished  to 1, not furnished to 0 
df['furniture'] = df['furniture'].replace({'furnished':1, 'not furnished':0})

In [41]:
df['furniture'].dtype

dtype('int64')

In [43]:
df

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,-,1,1,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,0,64,2,1,1,10,1,0,R$540,R$820,R$122,R$11,"R$1,493"
2,1,443,5,5,4,3,1,1,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,1,73,2,2,1,12,1,0,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,1,19,1,1,0,-,0,0,R$0,"R$1,200",R$41,R$16,"R$1,257"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,1,50,2,1,1,2,1,0,R$420,"R$1,150",R$0,R$15,"R$1,585"
6076,1,84,2,2,1,16,0,1,R$768,"R$2,900",R$63,R$37,"R$3,768"
6077,0,48,1,1,0,13,1,0,R$250,R$950,R$42,R$13,"R$1,255"
6078,1,160,3,2,2,-,0,0,R$0,"R$3,500",R$250,R$53,"R$3,803"


In [47]:
# We can see that floor column have '-' representing the floor 0. 
# Let's replace that. 
df['floor'] = df['floor'].replace({'-':0})

In [49]:
df

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,1,1,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,0,64,2,1,1,10,1,0,R$540,R$820,R$122,R$11,"R$1,493"
2,1,443,5,5,4,3,1,1,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,1,73,2,2,1,12,1,0,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,1,19,1,1,0,0,0,0,R$0,"R$1,200",R$41,R$16,"R$1,257"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,1,50,2,1,1,2,1,0,R$420,"R$1,150",R$0,R$15,"R$1,585"
6076,1,84,2,2,1,16,0,1,R$768,"R$2,900",R$63,R$37,"R$3,768"
6077,0,48,1,1,0,13,1,0,R$250,R$950,R$42,R$13,"R$1,255"
6078,1,160,3,2,2,0,0,0,R$0,"R$3,500",R$250,R$53,"R$3,803"


In [53]:
# for hoa, rent amount, property tax, fire insurance, total we can see some pattern 
# following
# We just have to remove 'R$' and ',' 
# The same task is to do, so let's use for loop. 
columns = df.columns[-5:]
columns

Index(['hoa', 'rent amount', 'property tax', 'fire insurance', 'total'], dtype='object')

In [55]:
df['hoa'].replace({'R\$':'', ',':''}, regex=True)

0          0
1        540
2       4172
3        700
4          0
        ... 
6075     420
6076     768
6077     250
6078       0
6079     489
Name: hoa, Length: 5882, dtype: object

In [57]:
for col in columns:
    df[col].replace({'R\$':'', ',':''}, regex=True, inplace=True)

In [59]:
df

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,1,1,0,8000,1000,121,9121
1,0,64,2,1,1,10,1,0,540,820,122,11,1493
2,1,443,5,5,4,3,1,1,4172,7000,1417,89,12680
3,1,73,2,2,1,12,1,0,700,1250,150,16,2116
4,1,19,1,1,0,0,0,0,0,1200,41,16,1257
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,1,50,2,1,1,2,1,0,420,1150,0,15,1585
6076,1,84,2,2,1,16,0,1,768,2900,63,37,3768
6077,0,48,1,1,0,13,1,0,250,950,42,13,1255
6078,1,160,3,2,2,0,0,0,0,3500,250,53,3803


In [61]:
df.dtypes

city               int64
area               int64
rooms              int64
bathroom           int64
parking spaces     int64
floor             object
animal             int64
furniture          int64
hoa               object
rent amount       object
property tax      object
fire insurance    object
total             object
dtype: object

In [65]:
# Let's convert these object columns data types to numerical data types. 
# As we want the whole dataset to be of int type. 
df = df. astype(dtype=np.int64)

ValueError: invalid literal for int() with base 10: 'Sem info'

In [69]:
# So, there is 'sem info' in our dataset. Let's see where it is : 
df.isin(['Sem info']).any()

city              False
area              False
rooms             False
bathroom          False
parking spaces    False
floor             False
animal            False
furniture         False
hoa                True
rent amount       False
property tax      False
fire insurance    False
total             False
dtype: bool

In [73]:
# hoa column has it. Let's replace 'Sem info' with 0 
df['hoa'] = df['hoa'].replace({'Sem info':0})

In [75]:
# As we want the whole dataset to be of int type.
df = df. astype(dtype=np.int64)

ValueError: invalid literal for int() with base 10: 'Incluso'

In [77]:
# There exist another string in our dataset called 'Incluso'. Let's check where it is : 
df.isin(['Incluso']).any()

city              False
area              False
rooms             False
bathroom          False
parking spaces    False
floor             False
animal            False
furniture         False
hoa                True
rent amount       False
property tax       True
fire insurance    False
total             False
dtype: bool

In [81]:
# hoa and property tax column has it. 
df['hoa'] = df['hoa'].replace({'Incluso':0})
df['property tax'] = df['property tax'].replace({'Incluso':0})

In [83]:
# As we want the whole dataset to be of int type.
df = df. astype(dtype=np.int64)

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5882 entries, 0 to 6079
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   city            5882 non-null   int64
 1   area            5882 non-null   int64
 2   rooms           5882 non-null   int64
 3   bathroom        5882 non-null   int64
 4   parking spaces  5882 non-null   int64
 5   floor           5882 non-null   int64
 6   animal          5882 non-null   int64
 7   furniture       5882 non-null   int64
 8   hoa             5882 non-null   int64
 9   rent amount     5882 non-null   int64
 10  property tax    5882 non-null   int64
 11  fire insurance  5882 non-null   int64
 12  total           5882 non-null   int64
dtypes: int64(13)
memory usage: 643.3 KB


In [87]:
df

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,1,1,0,8000,1000,121,9121
1,0,64,2,1,1,10,1,0,540,820,122,11,1493
2,1,443,5,5,4,3,1,1,4172,7000,1417,89,12680
3,1,73,2,2,1,12,1,0,700,1250,150,16,2116
4,1,19,1,1,0,0,0,0,0,1200,41,16,1257
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,1,50,2,1,1,2,1,0,420,1150,0,15,1585
6076,1,84,2,2,1,16,0,1,768,2900,63,37,3768
6077,0,48,1,1,0,13,1,0,250,950,42,13,1255
6078,1,160,3,2,2,0,0,0,0,3500,250,53,3803


In [89]:
# Our data set is ready to feed the model. 
X = df.drop('city', axis=1)
y = df['city']

In [91]:
# Lets split train test data 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [93]:
X_train

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
3181,202,3,2,4,0,1,0,0,3500,608,53,4161
3978,55,1,1,1,4,0,1,812,3200,0,41,4053
5050,220,4,2,3,0,1,1,0,5200,705,79,5984
3611,126,3,2,0,1,1,0,150,2210,125,28,2513
2283,136,3,3,3,15,1,1,1450,7900,490,101,9941
...,...,...,...,...,...,...,...,...,...,...,...,...
3858,50,1,1,1,10,0,1,1436,7800,359,99,9694
5354,70,3,2,1,3,1,1,650,1300,55,17,2022
5392,130,2,2,2,5,1,1,1500,12000,45,153,13700
5561,65,2,2,1,1,1,0,643,2300,63,30,3036


In [97]:
# Our x_train and x_test needs scaling 
from sklearn.preprocessing import MinMaxScaler 
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [99]:
X_train_scaled

array([[1.50145429e-02, 2.22222222e-01, 1.11111111e-01, ...,
        1.93815748e-03, 7.41839763e-02, 1.10707058e-02],
       [3.45884757e-03, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 5.63798220e-02, 1.07291930e-02],
       [1.64295260e-02, 3.33333333e-01, 1.11111111e-01, ...,
        2.24737010e-03, 1.12759644e-01, 1.68353150e-02],
       ...,
       [9.35461049e-03, 1.11111111e-01, 1.11111111e-01, ...,
        1.43449155e-04, 2.22551929e-01, 4.12345054e-02],
       [4.24494930e-03, 1.11111111e-01, 1.11111111e-01, ...,
        2.00828817e-04, 4.00593472e-02, 7.51328105e-03],
       [1.64295260e-02, 2.22222222e-01, 2.22222222e-01, ...,
        1.49187121e-03, 1.14243323e-01, 2.43074880e-02]])

In [101]:
X_test_scaled

array([[2.76467718e-03, 1.66666667e-01, 0.00000000e+00, ...,
        4.44990445e-04, 7.28476821e-02, 2.77461123e-03],
       [2.35810701e-03, 1.66666667e-01, 0.00000000e+00, ...,
        2.02020202e-04, 4.30463576e-02, 2.83375992e-03],
       [4.06570174e-04, 0.00000000e+00, 0.00000000e+00, ...,
        1.82910183e-04, 1.22516556e-01, 7.10590842e-03],
       ...,
       [9.75768418e-03, 5.00000000e-01, 3.75000000e-01, ...,
        0.00000000e+00, 1.42384106e-01, 1.47602865e-02],
       [1.54496666e-03, 1.66666667e-01, 0.00000000e+00, ...,
        2.45700246e-04, 3.97350993e-02, 1.58357172e-03],
       [5.32606928e-03, 3.33333333e-01, 2.50000000e-01, ...,
        1.11657112e-03, 1.92052980e-01, 9.98537414e-03]])

In [103]:
# Model Training : 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 

In [105]:
log_model = LogisticRegression(verbose=1)
svm_model = SVC(verbose=1)

In [107]:
log_model.fit(X_train_scaled, y_train)
svm_model.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibSVM]

In [109]:
print(log_model.score(X_test_scaled, y_test))
print(svm_model.score(X_test_scaled, y_test))

0.8564146134239592
0.8564146134239592


In [111]:
# This is because of the IMBALANCED DATA 
df['city'].value_counts()

city
1    5090
0     792
Name: count, dtype: int64