In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the listings CSV into a DataFrame. The path is relative, so the file
# must be present in the notebook's working directory.
data = pd.read_csv("AB_NYC_2019.csv")

In [3]:
# Preview the first rows to get a feel for the columns and their values.
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# Column dtypes and non-null counts; columns with fewer non-null entries than
# the row count contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

## **Encoding**

In [5]:
from sklearn.preprocessing import LabelEncoder , OneHotEncoder

In [6]:
# Frequency of each neighbourhood group before encoding.
data['neighbourhood_group'].value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

# **Label Encoder**

In [8]:
# Replace the neighbourhood group names with integer codes. The code assigned
# to each name is its position in le.classes_.
# NOTE(review): the original string column is overwritten, so re-running this
# cell would re-fit the encoder on the integer codes instead of the names.
le = LabelEncoder()
data['neighbourhood_group'] = le.fit_transform(data['neighbourhood_group'])

In [9]:
# Same frequency table after encoding: the counts are unchanged, only the
# labels are now integer codes.
data['neighbourhood_group'].value_counts()

2    21661
1    20104
3     5666
0     1091
4      373
Name: neighbourhood_group, dtype: int64

In [10]:
# Original labels learned by the encoder; index i is the label encoded as i.
le.classes_

array(['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
      dtype=object)

# **OneHot Encoder**

In [11]:
# Frequency of each room type; this is the column that gets one-hot encoded.
data['room_type'].value_counts()

Entire home/apt    25409
Private room       22326
Shared room         1160
Name: room_type, dtype: int64

In [16]:
# One-hot encode room_type. The encoder expects a 2-D input, so the column is
# selected as a single-column frame and converted to an (n_rows, 1) array.
# .toarray() turns the encoder's sparse result into a dense NumPy array.
one_hot = OneHotEncoder()
transformed_data = one_hot.fit_transform(data[['room_type']].to_numpy()).toarray()

In [17]:
# Categories learned per input feature; their order is the column order of the
# encoded array.
one_hot.categories_

[array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)]

In [18]:
# Wrap the dense one-hot array in a DataFrame. The column labels are taken from
# the fitted encoder instead of a hand-typed list, so they always match the
# categories (and the column order) the encoder actually produced.
transformed_data = pd.DataFrame(transformed_data, columns=one_hot.categories_[0])

In [19]:
# Preview the one-hot columns: exactly one indicator is 1.0 in each row.
transformed_data.head()

Unnamed: 0,Entire home/apt,Private room,Shared room
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


In [20]:
# Spot check: the encoded indicators for the row at position 90.
transformed_data.iloc[90]

Entire home/apt    0.0
Private room       1.0
Shared room        0.0
Name: 90, dtype: float64

In [21]:
# Spot check: the raw room type for the row labelled 90, to compare with the
# encoded row above.
data.loc[90, 'room_type']

'Private room'

# **Normalization & Standardization**

In [23]:
# Consider only numerical columns.
# select_dtypes(include='number') keeps integer and float columns only. The
# earlier "dtype is not object" test would also have accepted bool, datetime or
# categorical columns, none of which belong in a scaler.
# The result is a plain Python list of column names.
numeric_columns = data.select_dtypes(include='number').columns.tolist()

In [24]:
# (number of numeric columns, total number of columns)
(len(numeric_columns), data.shape[1])

(11, 16)

In [39]:
# Leave reviews_per_month out of the numeric set (presumably because it has
# missing values — see the null counts further down).
# Filtering instead of list.remove() keeps the cell re-runnable: a second
# execution is a no-op rather than a ValueError.
numeric_columns = [col for col in numeric_columns if col != 'reviews_per_month']


In [40]:
# Work on an explicit copy of the numeric columns so that later operations on
# temp_data are independent of `data` and cannot raise SettingWithCopyWarning.
temp_data = data[numeric_columns].copy()


In [41]:
# Display the numeric working frame (pandas truncates the rendering).
temp_data

Unnamed: 0,id,host_id,neighbourhood_group,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,2539,2787,1,149,1,9,6,365
1,2595,2845,2,225,1,45,2,355
2,3647,4632,2,150,3,0,1,365
3,3831,4869,1,89,1,270,1,194
4,5022,7192,2,80,10,9,1,0
...,...,...,...,...,...,...,...,...
48890,36484665,8232441,1,70,2,0,2,9
48891,36485057,6570630,1,40,4,0,2,36
48892,36485431,23492952,2,115,10,0,1,27
48893,36485609,30985759,2,55,1,0,6,2


In [42]:
from sklearn.preprocessing import StandardScaler , MinMaxScaler

# **Normalization**

In [43]:
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas' SettingWithCopyWarning and library deprecation
# notices. Consider narrowing the filter to the specific warning that was noisy.
warnings.filterwarnings('ignore')

In [44]:
# Min-max scaler with default settings: each column is rescaled to [0, 1].
normalizer = MinMaxScaler()

In [45]:
# Drop any column that still contains missing values. The result is reassigned
# instead of using inplace=True: temp_data was selected from `data`, and
# mutating such a selection in place is what raises SettingWithCopyWarning.
temp_data = temp_data.dropna(axis=1)

In [47]:
# Learn each column's min and max from temp_data and rescale it in one step.
# The result is a plain array, so the column names are re-attached below.
normalized_data = normalizer.fit_transform(temp_data)

In [48]:
# Show the normalized values with the original column names re-attached.
pd.DataFrame(data=normalized_data, columns=temp_data.columns)

Unnamed: 0,id,host_id,neighbourhood_group,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,0.000000,0.000001,0.25,0.0149,0.000000,0.014308,0.015337,1.000000
1,0.000002,0.000001,0.50,0.0225,0.000000,0.071542,0.003067,0.972603
2,0.000030,0.000008,0.50,0.0150,0.001601,0.000000,0.000000,1.000000
3,0.000035,0.000009,0.25,0.0089,0.000000,0.429253,0.000000,0.531507
4,0.000068,0.000017,0.50,0.0080,0.007206,0.014308,0.000000,0.000000
...,...,...,...,...,...,...,...,...
48890,0.999929,0.030002,0.25,0.0070,0.000801,0.000000,0.003067,0.024658
48891,0.999940,0.023944,0.25,0.0040,0.002402,0.000000,0.003067,0.098630
48892,0.999950,0.085632,0.50,0.0115,0.007206,0.000000,0.000000,0.073973
48893,0.999955,0.112946,0.50,0.0055,0.000000,0.000000,0.015337,0.005479


# **Standardization**

In [49]:
# Standard scaler with default settings: each column is centred on its mean
# and divided by its standard deviation.
standard_scaler = StandardScaler()

In [50]:
# Learn each column's mean and standard deviation from temp_data and
# standardize it in one step. The result is a plain array.
standardized_data = standard_scaler.fit_transform(temp_data)

In [51]:
# Show the standardized values with the original column names re-attached.
pd.DataFrame(data=standardized_data, columns=temp_data.columns)

Unnamed: 0,id,host_id,neighbourhood_group,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,-1.731277,-0.860159,-0.917828,-0.015493,-0.293996,-0.320414,-0.034716,1.916250
1,-1.731272,-0.860158,0.441222,0.300974,-0.293996,0.487665,-0.156104,1.840275
2,-1.731176,-0.860135,0.441222,-0.011329,-0.196484,-0.522433,-0.186451,1.916250
3,-1.731159,-0.860132,-0.917828,-0.265335,-0.293996,5.538156,-0.186451,0.617065
4,-1.731051,-0.860103,0.441222,-0.302811,0.144807,-0.320414,-0.186451,-0.856865
...,...,...,...,...,...,...,...,...
48890,1.590415,-0.755469,-0.917828,-0.344452,-0.245240,-0.522433,-0.156104,-0.788486
48891,1.590451,-0.776609,-0.917828,-0.469373,-0.147729,-0.522433,-0.156104,-0.583352
48892,1.590485,-0.561340,0.441222,-0.157070,0.144807,-0.522433,-0.186451,-0.651730
48893,1.590501,-0.466024,0.441222,-0.406912,-0.293996,-0.522433,-0.034716,-0.841669


# **Handling Missing Values**

In [52]:
# Number of missing values in every column.
data.isna().sum()


id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [59]:
# Number of missing host names.
data['host_name'].isna().sum()

21

**Simple Imputer**

In [54]:
from sklearn.impute import SimpleImputer

In [55]:
# Imputer that replaces NaN entries with the mean of the column it is fitted on.
imputer = SimpleImputer(missing_values=np.nan , strategy='mean')

In [63]:
# Impute reviews_per_month. The imputer expects a 2-D input, so the column is
# passed as an (n_rows, 1) array. The result is kept in a separate variable;
# `data` itself is not modified.
reviews_col = imputer.fit_transform(data[['reviews_per_month']].to_numpy())

In [66]:
# The imputed array should contain no missing values.
pd.DataFrame(reviews_col).isna().sum()

0    0
dtype: int64

In [68]:
# The original column is untouched: the imputed values were never written back.
data['reviews_per_month'].isna().sum()

10052

# **Discretization**

In [69]:
from sklearn.preprocessing import KBinsDiscretizer

In [70]:
# Reminder of the numeric frame that the discretizers below are fitted on.
temp_data.head()

Unnamed: 0,id,host_id,neighbourhood_group,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,2539,2787,1,149,1,9,6,365
1,2595,2845,2,225,1,45,2,355
2,3647,4632,2,150,3,0,1,365
3,3831,4869,1,89,1,270,1,194
4,5022,7192,2,80,10,9,1,0


# **Quantile Discretization Transform**

In [71]:
# Quantile binning: each column is cut into (up to) 10 bins holding roughly the
# same number of rows. encode='ordinal' returns the bin index per value.
trans = KBinsDiscretizer(
    n_bins=10,
    encode='ordinal',
    strategy='quantile',
)
new_data = trans.fit_transform(temp_data)

In [72]:
# Show the bin indices with the original column names re-attached.
pd.DataFrame(data=new_data, columns=temp_data.columns)

Unnamed: 0,id,host_id,neighbourhood_group,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,0.0,0.0,1.0,6.0,0.0,4.0,3.0,6.0
1,0.0,0.0,2.0,8.0,0.0,6.0,1.0,6.0
2,0.0,0.0,2.0,6.0,2.0,0.0,0.0,6.0
3,0.0,0.0,1.0,3.0,0.0,7.0,0.0,4.0
4,0.0,0.0,2.0,3.0,4.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...
48890,9.0,2.0,1.0,2.0,1.0,0.0,1.0,1.0
48891,9.0,2.0,1.0,0.0,3.0,0.0,1.0,1.0
48892,9.0,4.0,2.0,5.0,4.0,0.0,0.0,1.0
48893,9.0,5.0,2.0,1.0,0.0,0.0,3.0,0.0


# **Uniform Discretization Transform**

In [73]:
# Uniform binning: each column's range is split into 10 equal-width intervals.
# encode='ordinal' returns the bin index per value.
trans = KBinsDiscretizer(
    n_bins=10,
    encode='ordinal',
    strategy='uniform',
)
new_data = trans.fit_transform(temp_data)

# Show the bin indices with the original column names re-attached.
pd.DataFrame(data=new_data, columns=temp_data.columns)

Unnamed: 0,id,host_id,neighbourhood_group,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0
1,0.0,0.0,5.0,0.0,0.0,0.0,0.0,9.0
2,0.0,0.0,5.0,0.0,0.0,0.0,0.0,9.0
3,0.0,0.0,2.0,0.0,0.0,4.0,0.0,5.0
4,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
48890,9.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
48891,9.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
48892,9.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
48893,9.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0


# **KMeans Discretization Transform**

In [74]:
# K-means binning: bin edges for each column come from a 1-D k-means
# clustering of its values. encode='ordinal' returns the bin index per value.
trans = KBinsDiscretizer(
    n_bins=10,
    encode='ordinal',
    strategy='kmeans',
)
new_data = trans.fit_transform(temp_data)

# Show the bin indices with the original column names re-attached.
pd.DataFrame(data=new_data, columns=temp_data.columns)

Unnamed: 0,id,host_id,neighbourhood_group,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9.0
1,0.0,0.0,2.0,1.0,0.0,2.0,1.0,9.0
2,0.0,0.0,2.0,0.0,1.0,0.0,0.0,9.0
3,0.0,0.0,1.0,0.0,0.0,7.0,0.0,5.0
4,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
48890,9.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
48891,9.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
48892,9.0,1.0,2.0,0.0,2.0,0.0,0.0,1.0
48893,9.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0
