In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,accuracy_score,f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('NYC.csv')
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
#Summary of the DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [4]:
#Checking for the NUll Values
data.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

### Dealing with Null Values

In [5]:
#Dropping column id , host_name not important and last_review is having too many missing value
data.drop(['id','host_name','last_review'],axis=1,inplace=True)
data.head(5)

Unnamed: 0,name,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Clean & quiet apt home by the park,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Skylit Midtown Castle,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,THE VILLAGE OF HARLEM....NEW YORK !,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1,365
3,Cozy Entire Floor of Brownstone,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Entire Apt: Spacious Studio/Loft by central park,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [6]:
data['reviews_per_month'].unique()

array([2.100e-01, 3.800e-01,       nan, 4.640e+00, 1.000e-01, 5.900e-01,
       4.000e-01, 3.470e+00, 9.900e-01, 1.330e+00, 4.300e-01, 1.500e+00,
       1.340e+00, 9.100e-01, 2.200e-01, 1.200e+00, 1.720e+00, 2.120e+00,
       4.440e+00, 7.000e-02, 1.090e+00, 3.700e-01, 6.100e-01, 7.300e-01,
       1.370e+00, 4.900e-01, 1.110e+00, 2.400e-01, 2.040e+00, 1.420e+00,
       1.650e+00, 2.370e+00, 6.600e-01, 1.410e+00, 1.960e+00, 1.810e+00,
       2.080e+00, 3.900e-01, 2.300e-01, 6.900e-01, 8.400e-01, 2.250e+00,
       5.200e-01, 1.160e+00, 1.010e+00, 6.300e-01, 7.000e-01, 2.820e+00,
       9.000e-01, 1.700e-01, 2.490e+00, 1.190e+00, 3.000e-01, 1.200e-01,
       5.700e-01, 1.600e-01, 4.720e+00, 1.400e+00, 1.260e+00, 1.640e+00,
       1.600e+00, 9.200e-01, 2.000e-01, 1.280e+00, 5.400e-01, 6.200e-01,
       1.500e-01, 5.300e-01, 1.730e+00, 5.000e-02, 1.540e+00, 2.800e-01,
       3.400e+00, 1.570e+00, 1.050e+00, 7.100e-01, 1.100e-01, 2.700e-01,
       1.230e+00, 8.700e-01, 2.090e+00, 6.000e-01, 

In [7]:
#replacing all NaN values in 'reviews_per_month' with 0
data['reviews_per_month'].fillna(0,inplace=True)

In [8]:
data['name'].unique()

array(['Clean & quiet apt home by the park', 'Skylit Midtown Castle',
       'THE VILLAGE OF HARLEM....NEW YORK !', ...,
       'Sunny Studio at Historical Neighborhood',
       '43rd St. Time Square-cozy single bed',
       "Trendy duplex in the very heart of Hell's Kitchen"], dtype=object)

In [9]:
#Removing the containing NULL value
data.dropna(axis=0,how='any',inplace=True)

In [10]:
data.isnull().sum()

name                              0
host_id                           0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48879 entries, 0 to 48894
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   name                            48879 non-null  object 
 1   host_id                         48879 non-null  int64  
 2   neighbourhood_group             48879 non-null  object 
 3   neighbourhood                   48879 non-null  object 
 4   latitude                        48879 non-null  float64
 5   longitude                       48879 non-null  float64
 6   room_type                       48879 non-null  object 
 7   price                           48879 non-null  int64  
 8   minimum_nights                  48879 non-null  int64  
 9   number_of_reviews               48879 non-null  int64  
 10  reviews_per_month               48879 non-null  float64
 11  calculated_host_listings_count  48879 non-null  int64  
 12  availability_365                

In [12]:
#Separating the Numeric and Categorical Data
data_num=data.select_dtypes(['int64','float64'])
data_cat=data.select_dtypes('object')

In [13]:
#Numeric Data
data_num.head()

Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,40.64749,-73.97237,149,1,9,0.21,6,365
1,2845,40.75362,-73.98377,225,1,45,0.38,2,355
2,4632,40.80902,-73.9419,150,3,0,0.0,1,365
3,4869,40.68514,-73.95976,89,1,270,4.64,1,194
4,7192,40.79851,-73.94399,80,10,9,0.1,1,0


In [14]:
#Categorical Data
data_cat.head()

Unnamed: 0,name,neighbourhood_group,neighbourhood,room_type
0,Clean & quiet apt home by the park,Brooklyn,Kensington,Private room
1,Skylit Midtown Castle,Manhattan,Midtown,Entire home/apt
2,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,Harlem,Private room
3,Cozy Entire Floor of Brownstone,Brooklyn,Clinton Hill,Entire home/apt
4,Entire Apt: Spacious Studio/Loft by central park,Manhattan,East Harlem,Entire home/apt


In [15]:
#LabelEncoding of categorical Data
from sklearn.preprocessing import LabelEncoder
for col in data_cat:
    le=LabelEncoder()
    data_cat[col]=le.fit_transform(data_cat[[col]])
data_cat.head()

Unnamed: 0,name,neighbourhood_group,neighbourhood,room_type
0,12328,1,108,1
1,37455,2,127,0
2,43543,2,94,1
3,14783,1,41,0
4,18693,2,61,0


In [16]:
#Combining back numeric and categorical data
df=pd.concat([data_num,data_cat],axis=1)
df.head()

Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,name,neighbourhood_group,neighbourhood,room_type
0,2787,40.64749,-73.97237,149,1,9,0.21,6,365,12328,1,108,1
1,2845,40.75362,-73.98377,225,1,45,0.38,2,355,37455,2,127,0
2,4632,40.80902,-73.9419,150,3,0,0.0,1,365,43543,2,94,1
3,4869,40.68514,-73.95976,89,1,270,4.64,1,194,14783,1,41,0
4,7192,40.79851,-73.94399,80,10,9,0.1,1,0,18693,2,61,0


In [20]:
x=df.drop('price',axis=1)
y=df['price']

In [21]:
y.head()

0    149
1    225
2    150
3     89
4     80
Name: price, dtype: int64

In [22]:
#Scaling of Data
from sklearn.preprocessing import MinMaxScaler
for col in x:
    mn=MinMaxScaler()
    x[col]=mn.fit_transform(x[[col]])
x.head()

Unnamed: 0,host_id,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,name,neighbourhood_group,neighbourhood,room_type
0,1e-06,0.357393,0.511921,0.0,0.014308,0.00359,0.015337,1.0,0.257348,0.25,0.490909,0.5
1,1e-06,0.614199,0.490469,0.0,0.071542,0.006496,0.003067,0.972603,0.781876,0.5,0.577273,0.0
2,8e-06,0.748252,0.569257,0.001601,0.0,0.0,0.0,1.0,0.908964,0.5,0.427273,0.5
3,9e-06,0.448496,0.535649,0.0,0.429253,0.079316,0.0,0.531507,0.308596,0.25,0.186364,0.0
4,1.7e-05,0.72282,0.565324,0.007206,0.014308,0.001709,0.0,0.0,0.390218,0.5,0.277273,0.0


In [23]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

log=LinearRegression()
log.fit(x_train,y_train)
print("Linear Regression: ""Train score: ",log.score(x_train,y_train),"Test Score: ",log.score(x_test,y_test))

print("\n")
dt= DecisionTreeClassifier()
dt.fit(x_train,y_train)
print("Decision Tree: ""Train Score",dt.score(x_train,y_train),"Test Score: ",dt.score(x_test,y_test))

print("\n")
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
print("Random Forest Classifier: ""Train Score: ",rfc.score(x_train,y_train),"Test Score: ", rfc.score(x_test,y_test))

print("\n")
rfc=RandomForestClassifier(n_estimators=50,criterion="entropy")
rfc.fit(x_train,y_train)
print("Random Forest Classifier Entropy: ""Train Score: ",rfc.score(x_train,y_train),"Test Score: ", rfc.score(x_test,y_test))

print("\n")
knn=KNeighborsClassifier(n_neighbors=8)
knn.fit(x_train,y_train)
print("KNN: ""Train Score: ",knn.score(x_train,y_train),"Test Score: ",knn.score(x_test,y_test))

Linear Regression: Train score:  0.08493411362295944 Test Score:  0.10689634483432009


Decision Tree: Train Score 1.0 Test Score:  0.061442989634478994




MemoryError: could not allocate 321388544 bytes

### Feature Selection

In [25]:
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest
x=df.drop('price',axis=1)
y=df['price']
annova = SelectKBest(score_func=f_regression,k=10)
fit = annova.fit(x,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Column Name','Score'] 
print(featureScores)

                       Column Name        Score
0                          host_id    11.475362
1                         latitude    56.366862
2                        longitude  1124.680099
3                   minimum_nights    89.743230
4                number_of_reviews   112.700117
5                reviews_per_month   125.336230
6   calculated_host_listings_count   161.971063
7                 availability_365   329.292825
8                             name     1.892736
9              neighbourhood_group    95.682701
10                   neighbourhood   189.216421
11                       room_type  3239.280330


In [26]:
print(featureScores.nlargest(10,'Score'))

                       Column Name        Score
11                       room_type  3239.280330
2                        longitude  1124.680099
7                 availability_365   329.292825
10                   neighbourhood   189.216421
6   calculated_host_listings_count   161.971063
5                reviews_per_month   125.336230
4                number_of_reviews   112.700117
9              neighbourhood_group    95.682701
3                   minimum_nights    89.743230
1                         latitude    56.366862


In [27]:
x_annova = annova.transform(x)
x_annova

array([[ 40.64749, -73.97237,   1.     , ...,   1.     , 108.     ,
          1.     ],
       [ 40.75362, -73.98377,   1.     , ...,   2.     , 127.     ,
          0.     ],
       [ 40.80902, -73.9419 ,   3.     , ...,   2.     ,  94.     ,
          1.     ],
       ...,
       [ 40.81475, -73.94867,  10.     , ...,   2.     ,  94.     ,
          0.     ],
       [ 40.75751, -73.99112,   1.     , ...,   2.     ,  95.     ,
          2.     ],
       [ 40.76404, -73.98933,   7.     , ...,   2.     ,  95.     ,
          1.     ]])

### Applyning all model on feature selection 

In [29]:
x_train,x_test,y_train,y_test = train_test_split(x_annova,y,test_size=0.3)

log=LinearRegression()
log.fit(x_train,y_train)
print("Linear Regression: ""Train score: ",log.score(x_train,y_train),"Test Score: ",log.score(x_test,y_test))

print("\n")
dt= DecisionTreeClassifier()
dt.fit(x_train,y_train)
print("Decision Tree: ""Train Score",dt.score(x_train,y_train),"Test Score: ",dt.score(x_test,y_test))

print("\n")
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
print("Random Forest Classifier: ""Train Score: ",rfc.score(x_train,y_train),"Test Score: ", rfc.score(x_test,y_test))

print("\n")
rfc=RandomForestClassifier(n_estimators=50,criterion="entropy")
rfc.fit(x_train,y_train)
print("Random Forest Classifier Entropy: ""Train Score: ",rfc.score(x_train,y_train),"Test Score: ", rfc.score(x_test,y_test))


print("\n")
knn=KNeighborsClassifier(n_neighbors=8)
knn.fit(x_train,y_train)
print("KNN: ""Train Score: ",knn.score(x_train,y_train),"Test Score: ",knn.score(x_test,y_test))


Linear Regression: Train score:  0.08435199568383034 Test Score:  0.10904800100652734




MemoryError: could not allocate 325058560 bytes