In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Download the AB_NYC_2019 CSV into the working directory.
# `-O` fixes the output filename, so re-running the cell overwrites the file.
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv -O AB_NYC_2019.csv

--2024-10-11 00:15:14--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘AB_NYC_2019.csv’


2024-10-11 00:15:15 (104 MB/s) - ‘AB_NYC_2019.csv’ saved [7077973/7077973]



## Data preparation

In [4]:
# Columns kept for this exercise: two categorical descriptors, the numeric
# listing attributes, and `price` (used later as the prediction target).
usecols = [
    'room_type',
    'neighbourhood_group',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Load only the selected columns from the downloaded CSV.
df = pd.read_csv('AB_NYC_2019.csv', usecols=usecols)

In [5]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(48895, 10)

In [6]:
# Preview the first rows to eyeball column types and values.
df.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,Manhattan,40.80902,-73.9419,Private room,150,3,0,,1,365
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [7]:
# Count missing values per column to see which ones need imputation.
df.isnull().sum()

neighbourhood_group                   0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [8]:
# `reviews_per_month` is the only column with missing values; replace them with 0.
# NOTE(review): presumably NaN means the listing has no reviews (the preview
# row with 0 reviews has NaN here) - confirm before relying on it.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [9]:
# Inspect the distribution of raw prices before turning them into a binary target.
df['price'].value_counts()

price
100     2051
150     2047
50      1534
60      1458
200     1401
        ... 
930        1
920        1
4100       1
3512       1
1494       1
Name: count, Length: 674, dtype: int64

In [10]:
# Turn the numeric price into a binary target: True when price >= threshold.
PRICE_THRESHOLD = 152

# Guard against accidental re-execution of this cell: once `price` is boolean,
# comparing it with the threshold again would silently set every label to False.
if not pd.api.types.is_bool_dtype(df['price']):
    df['price'] = df['price'] >= PRICE_THRESHOLD

In [11]:
# The target conversion replaces `price` in place, so the shape should be unchanged.
df.shape

(48895, 10)

In [12]:
# Preview again: `price` should now hold booleans and `reviews_per_month` no NaN.
df.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,False,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,True,1,45,0.38,2,355
2,Manhattan,40.80902,-73.9419,Private room,False,3,0,0.0,1,365
3,Brooklyn,40.68514,-73.95976,Entire home/apt,False,1,270,4.64,1,194
4,Manhattan,40.79851,-73.94399,Entire home/apt,False,10,9,0.1,1,0


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Two-stage split with a fixed seed: first hold out 20% as the test set, then
# take 25% of the remaining 80% as validation (i.e. 60/20/20 overall).
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Drop the shuffled original index so each split is numbered from 0.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Move the target out of the feature frames: `pop` returns the column and
# removes it from the frame in one step.
y_train = df_train.pop('price').values
y_val = df_val.pop('price').values
y_test = df_test.pop('price').values

In [15]:
# Confirm the training frame no longer contains the `price` column.
df_train.head()

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.7276,-73.94495,Entire home/apt,3,29,0.7,13,50
1,Manhattan,40.70847,-74.00498,Private room,1,0,0.0,1,7
2,Bronx,40.83149,-73.92766,Entire home/apt,40,0,0.0,1,0
3,Brooklyn,40.66448,-73.99407,Entire home/apt,2,3,0.08,1,0
4,Manhattan,40.74118,-74.00012,Private room,1,48,1.8,2,67


In [16]:
# Peek at the training target array.
y_train

array([False, False, False, ...,  True, False, False])

In [17]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [18]:
# Categorical feature columns.
cat = [
    'neighbourhood_group',
    'room_type',
]

# Numerical feature columns; this order defines the column order of the
# numeric matrices built from `df[num]` later on.
num = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

## Training the model

You get a convergence warning:

In [19]:
# Baseline: one dict per training row, vectorized without any feature scaling.
train_dict = df_train[cat + num].to_dict(orient='records')

# Learn the feature mapping and build the dense training matrix in one call.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Fitting on the unscaled features is what triggers the convergence warning.
model = LogisticRegression(C=1.0, solver='lbfgs')
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


We can fix this model by using a scaler. You can read more about scalers
[here](https://scikit-learn.org/stable/modules/preprocessing.html).

Also, we'll show you how to use `OneHotEncoder` instead of `DictVectorizer`.

## Feature scaling + OHE

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

First, we prepare the numerical variables. We'll use the scaler for that
and write the results to `X_train_num`:

In [44]:
# Standardize the numerical columns. The scaler is fitted on the training
# split, and the fitted object is reused to transform other splits.
# (MinMaxScaler() can be swapped in here as an alternative.)
scaler = StandardScaler()

X_train_num = scaler.fit_transform(df_train[num].values)

The scaler scales the numerical features. Compare the un-scaled version of
latitude with the scaled one:

In [45]:
# Raw (un-scaled) latitude values from the training frame.
df_train.latitude.values

array([40.7276 , 40.70847, 40.83149, ..., 40.79994, 40.69585, 40.64438])

In [46]:
# Column 0 of the scaled matrix is latitude, the first entry of `num`.
X_train_num[:, 0]

array([-0.02524398, -0.37616878,  1.88053632, ...,  1.3017764 ,
       -0.60767275, -1.5518494 ])

In [47]:
# One row per training example, one column per entry in `num`.
X_train_num.shape

(29337, 7)

Now let's process categorical features using `OneHotEncoder`.
We'll write the results to `X_train_cat`:

In [59]:
# Dense one-hot encoder; categories not seen during fit are encoded as all zeros.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`; on older
# versions pass `sparse=False` instead.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [60]:
# Fit the encoder on the training categories and encode them. A plain array
# (not a DataFrame) is passed in, as before.
X_train_cat = ohe.fit_transform(df_train[cat].to_numpy())

In [61]:
# List the generated one-hot columns; the x0_/x1_ prefixes are the positions
# of the input columns, i.e. the order of `cat`.
ohe.get_feature_names_out()

array(['x0_Bronx', 'x0_Brooklyn', 'x0_Manhattan', 'x0_Queens',
       'x0_Staten Island', 'x1_Entire home/apt', 'x1_Private room',
       'x1_Shared room'], dtype=object)

In [62]:
# One row per training example, one column per one-hot category.
X_train_cat.shape

(29337, 8)

In [63]:
# Show the raw categorical columns that were fed to the encoder.
df_train[cat]

Unnamed: 0,neighbourhood_group,room_type
0,Brooklyn,Entire home/apt
1,Manhattan,Private room
2,Bronx,Entire home/apt
3,Brooklyn,Entire home/apt
4,Manhattan,Private room
...,...,...
29332,Brooklyn,Private room
29333,Brooklyn,Private room
29334,Manhattan,Private room
29335,Brooklyn,Private room


In [64]:
# Show the raw numerical columns that were fed to the scaler.
df_train[num]

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,40.72760,-73.94495,3,29,0.70,13,50
1,40.70847,-74.00498,1,0,0.00,1,7
2,40.83149,-73.92766,40,0,0.00,1,0
3,40.66448,-73.99407,2,3,0.08,1,0
4,40.74118,-74.00012,1,48,1.80,2,67
...,...,...,...,...,...,...,...
29332,40.71748,-73.95685,6,5,0.13,1,0
29333,40.66397,-73.98538,1,7,0.17,2,0
29334,40.79994,-73.97001,1,1,0.64,1,88
29335,40.69585,-73.96344,60,0,0.00,1,0


Now we need to combine the two matrices into one - `X_train`:

In [65]:
# Place the scaled numeric columns first, followed by the one-hot columns.
X_train = np.hstack([X_train_num, X_train_cat])

And now let's train the model:

In [66]:
# Same logistic regression settings as the baseline, now trained on the
# scaled + one-hot-encoded matrix.
model = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    random_state=42,
)
model.fit(X_train, y_train)

We can check its accuracy:

In [67]:
# Apply the transformers fitted on the training split (transform only, no
# refit) so validation features use the same scaling and category mapping.
X_val_num = scaler.transform(df_val[num].values)
X_val_cat = ohe.transform(df_val[cat].values)

# Same column layout as X_train: numeric block first, then one-hot block.
X_val = np.hstack([X_val_num, X_val_cat])

In [68]:
# Column 1 of predict_proba is the probability of the second class in
# `model.classes_` (True for this boolean target).
y_pred = model.predict_proba(X_val)[:, 1]

# Threshold the probabilities at 0.5 and score against the validation labels.
accuracy_score(y_true=y_val, y_pred=(y_pred >= 0.5))

0.7978320891706718

It's a little bit better than the version without scaled features.