In [194]:
##Importing most of the libraries we have used in our assignments that we will use in this project

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import numpy as np 
import pandas as pd
from numpy import array
from numpy import argmax


# Read in the CSV file we will be using for this project 
df = pd.read_csv("./AB_NYC_2019.csv")
df[0::10]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
10,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6
20,7801,Sweet and Spacious Brooklyn Loft,21207,Chaya,Brooklyn,Williamsburg,40.71842,-73.95718,Entire home/apt,299,3,9,2011-12-28,0.07,1,0
30,9668,front room/double bed,32294,Ssameer Or Trip,Manhattan,Harlem,40.82245,-73.95104,Private room,50,3,242,2019-06-01,2.04,3,355
40,12192,ENJOY Downtown NYC!,46978,Edward,Manhattan,East Village,40.72290,-73.98199,Private room,68,2,245,2019-06-21,2.08,2,96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48850,36455584,Large studio at Union Square! for 3-5 ppl,50812891,Molo,Manhattan,East Village,40.73231,-73.98689,Entire home/apt,159,1,0,,,1,166
48860,36468386,纽约罗岛Roosevelt Island整租或合租 窗外美景 设施全 家具新 到曼哈顿方便 ...,228268650,Yan,Manhattan,Roosevelt Island,40.76688,-73.94688,Entire home/apt,145,1,0,,,1,30
48870,36474911,"Cozy, clean Williamsburg 1- bedroom apartment",1273444,Tanja,Brooklyn,Williamsburg,40.71197,-73.94946,Entire home/apt,99,4,0,,,1,22
48880,36481315,The Raccoon Artist Studio in Williamsburg New ...,208514239,Melki,Brooklyn,Williamsburg,40.71232,-73.94220,Entire home/apt,120,1,0,,,3,365


## Data analysis

In [199]:
# Price
df["price"].describe()

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [200]:
# neighbourhood groups
df['neighbourhood_group'].value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

In [202]:
# Avg Price in neighbourhood groups
avg_price = df.groupby("neighbourhood_group")["price"].mean()

print(avg_price)

neighbourhood_group
Bronx             87.496792
Brooklyn         124.383207
Manhattan        196.875814
Queens            99.517649
Staten Island    114.812332
Name: price, dtype: float64


## Encode Non-Numerical Values to 0 and 1. Drop unnecessary columns from DataFrame and replace NaN values with 0.

In [195]:
# Categorical Features: neighbourhood_group, neighbourhood, room_type
encode_NYC_df = pd.get_dummies(df, columns = ['neighbourhood_group','room_type'])

# Dropping unnecessary features
encode_NYC_df = encode_NYC_df.drop(columns=['id', 'name', 'host_name', 
                      'last_review', 'host_id', 'neighbourhood'])

# Replace NaN with 0
encode_NYC_df['reviews_per_month']=encode_NYC_df['reviews_per_month'].replace(np.nan, 0)
encode_NYC_df['price']=encode_NYC_df['price'].replace(np.nan, 0)

In [196]:
# Move price to end of list
cols = encode_NYC_df.columns.tolist()
cols.append(cols.pop(cols.index('price')))
encode_NYC_df = encode_NYC_df.reindex(columns = cols)

# Sort by price
encode_NYC_df.sort_values(by='price',ascending=False,inplace=True)

encode_NYC_df.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
9151,40.7681,-73.91651,100,2,0.04,1,0,0,0,0,1,0,0,1,0,10000
17692,40.7326,-73.95739,5,5,0.16,1,0,0,1,0,0,0,1,0,0,10000
29238,40.77213,-73.98665,30,0,0.0,1,83,0,0,1,0,0,1,0,0,10000
40433,40.7198,-73.98566,30,0,0.0,1,365,0,0,1,0,0,1,0,0,9999
12342,40.71355,-73.98507,99,6,0.14,1,83,0,0,1,0,0,0,1,0,9999


## Split the Dataset

In [197]:
# Feature_column and encoding
feature_cols = cols
X = encode_NYC_df[feature_cols]
y = encode_NYC_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Print the first 5 lines
print(X_test)
print('\n')
print(y_test)


# print the size of the traning set:
print(X_train.shape)
print(y_train.shape)

# print the size of the testing set:
print(X_test.shape)
print(y_test.shape)

       latitude  longitude  minimum_nights  number_of_reviews  \
45894  40.71550  -73.94656               2                  0   
20869  40.71753  -73.94946               2                  1   
38075  40.77829  -73.95093               5                  5   
11825  40.62402  -73.97071               1                  1   
41261  40.72595  -73.90299               1                 24   
...         ...        ...             ...                ...   
45273  40.70372  -73.90670               2                  1   
449    40.68288  -73.96024               3                 43   
5464   40.71305  -73.94334               6                  0   
22940  40.74530  -74.00002               3                 24   
16518  40.67323  -73.88920               3                 71   

       reviews_per_month  calculated_host_listings_count  availability_365  \
45894               0.00                               3               313   
20869               0.03                               1       

In [198]:
# "my_logreg" is instantiated as an "object" of LogisticRegression "class". 
my_logreg = LogisticRegression(max_iter=10000000000000)

# Training ONLY on the training set:
my_logreg.fit(X_train, y_train)

# Testing on the testing set:
y_predict_lr = my_logreg.predict(X_test)
print(y_predict_lr)

# We can now compare the "predicted labels" for the Testing Set with its "actual labels" to evaluate the accuracy 

score_lr = accuracy_score(y_test, y_predict_lr)

print(score_lr)

KeyboardInterrupt: 