# Data Cleaning

This notebook focuses on preprocessing the datasets before they are used in the content-based recommendation system that I will develop later on.

In [2]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Apartments Dataset

Keys:
- Apartment_id: Unique id identifying an apartment
- Posted On: Date on which the entry is made
- BHK: Number of Bedrooms, Hall, Kitchen.
- Rent: Rent of the Houses/Apartments/Flats.
- Size: Size of the Houses/Apartments/Flats in Square Feet.
- Floor: The floor on which the House/Apartment/Flat is situated and the total number of floors in the building (Example: Ground out of 2, 3 out of 5, etc.)
- Area Type: Size of the Houses/Apartments/Flats calculated on either Super Area or Carpet Area or Build Area.
- Area Locality: Locality of the Houses/Apartments/Flats.
- City: City where the Houses/Apartments/Flats are Located.
- Furnishing Status: Furnishing Status of the Houses/Apartments/Flats, either it is Furnished or Semi-Furnished or Unfurnished.
- Tenant Preferred: Type of Tenant Preferred by the Owner or Agent.
- Bathroom: Number of Bathrooms.
- Point of Contact: Whom should you contact for more information regarding the Houses/Apartments/Flats.

In [3]:
# Load the apartment listings dataset.
# The directory is spelled "Datasets/" to match the paths used by the tenant
# load cell and the export cell; the previous lowercase "datasets/" could only
# resolve alongside them on a case-insensitive filesystem.
apartments_df = pd.read_csv("Datasets/House_Rent_Dataset.csv")
apartments_df.head()

Unnamed: 0,Apartment_id,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,1,18-05-2022,2,10000,1100.0,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2,13-05-2022,2,20000,800.0,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,3,16-05-2022,2,17000,1000.0,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,4,04-07-2022,2,10000,800.0,1 out of 2,Super Area,Dumdum Park,Kolkata,,Bachelors/Family,1,Contact Owner
4,5,09-05-2022,2,7500,850.0,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [4]:
# Size of the apartments dataframe before any cleaning, as (rows, columns)
apartments_df.shape

(4746, 13)

In [5]:
# Inspect the dtype pandas inferred for each column to check whether it suits the data
apartments_df.dtypes

Apartment_id           int64
Posted On             object
BHK                    int64
Rent                   int64
Size                 float64
Floor                 object
Area Type             object
Area Locality         object
City                  object
Furnishing Status     object
Tenant Preferred      object
Bathroom               int64
Point of Contact      object
dtype: object

In [6]:
# Treat the literal 'unknown' placeholder in 'Tenant Preferred' as a missing
# value, so the null checks and dropna below pick it up.
unknown_to_nan = {'Tenant Preferred': {'unknown': np.nan}}
apartments_df = apartments_df.replace(unknown_to_nan)

In [7]:
# Flag, per column, whether at least one value is missing
apartments_df.isna().any()

Apartment_id         False
Posted On            False
BHK                  False
Rent                 False
Size                  True
Floor                False
Area Type             True
Area Locality        False
City                 False
Furnishing Status     True
Tenant Preferred      True
Bathroom             False
Point of Contact     False
dtype: bool

In [8]:
# Count the missing values in each column, alongside the overall frame size
print('size of dataframe:', apartments_df.shape)
apartments_df.isna().sum()

size of dataframe: (4746, 13)


Apartment_id           0
Posted On              0
BHK                    0
Rent                   0
Size                 171
Floor                  0
Area Type             40
Area Locality          0
City                   0
Furnishing Status     52
Tenant Preferred      95
Bathroom               0
Point of Contact       0
dtype: int64

In [9]:
# Discard every row that contains at least one missing value, then report the new size
apartments_df = apartments_df.dropna()
apartments_df.shape

(4397, 13)

In [10]:
# Re-count missing values per column after the drop; every count should now be zero
print('size of dataframe:', apartments_df.shape)
apartments_df.isna().sum()

size of dataframe: (4397, 13)


Apartment_id         0
Posted On            0
BHK                  0
Rent                 0
Size                 0
Floor                0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
dtype: int64

### Tenant Review Dataset

Keys:
- tenant_id - non identifiable randomly generated user id.
- Apartment_id - apartment that the tenant previously stayed in.
- rating - rating out of 10 this tenant has assigned.

In [11]:
# Read the tenant reviews CSV into a dataframe and preview the first rows
tenants_csv = "Datasets/Tenants_Review_Dataset.csv"
tenant_df = pd.read_csv(tenants_csv)
tenant_df.head()

Unnamed: 0,tenant_id,Apartment_id,rating
0,1,2731,10.0
1,1,2762,9.0
2,1,2479,6.0
3,1,43,10.0
4,2,2652,9.0


In [12]:
# Size of the tenant reviews dataframe before any cleaning, as (rows, columns)
tenant_df.shape

(1727, 3)

In [13]:
# List the distinct values that occur in the rating column
# (np.unique returns them sorted)
rating_arr = np.unique(tenant_df['rating'].to_numpy())
rating_arr

array([ 6.,  7.,  8.,  9., 10., nan])

In [14]:
# Flag, per column, whether at least one value is missing
tenant_df.isna().any()

tenant_id       False
Apartment_id    False
rating           True
dtype: bool

In [15]:
# Count the missing values in each column
tenant_df.isna().sum()

tenant_id        0
Apartment_id     0
rating          49
dtype: int64

In [16]:
# Discard every row that contains at least one missing value, then report the new size
tenant_df = tenant_df.dropna()
tenant_df.shape

(1678, 3)

In [17]:
# Re-count missing values per column after the drop; every count should now be zero
print('size of dataframe:', tenant_df.shape)
tenant_df.isna().sum()

size of dataframe: (1678, 3)


tenant_id       0
Apartment_id    0
rating          0
dtype: int64

### Export Dataframes to CSV

In [18]:
# Write both cleaned dataframes to CSV, leaving out the row index
for frame, out_path in (
    (apartments_df, "Datasets/Cleaned_House_Rent_Dataset.csv"),
    (tenant_df, "Datasets/Cleaned_Tenants_Review_Dataset.csv"),
):
    frame.to_csv(out_path, index=False)