In [1]:
import os

# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV , train_test_split
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.metrics import roc_auc_score , r2_score

## Loading data

In [1]:
# Read AB_NYC_2019.csv from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv')

In [1]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [1]:
# Show the column dtypes and non-null counts.
data.info()

In [1]:
# Count the missing values in each column before any cleaning.
data.isnull().sum()

## Separating numeric and categorical features

In [1]:
# Columns treated as numeric in the exploratory plots below.
num_cols = [
    'id', 'host_id',
    'latitude', 'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews', 'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Columns treated as categorical / free text.
cat_cols = [
    'last_review',
    'room_type',
    'neighbourhood', 'neighbourhood_group',
    'host_name', 'name',
]

## Displaying data for numeric attributes

In [1]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr = data[num_cols].corr()
# plt.subplots returns (figure, axes) in that order; bind the names correctly
# and draw on the axes explicitly instead of relying on the current axes.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr, vmin=-1, cmap='coolwarm', annot=True, ax=ax)
plt.show()

In [1]:
# One row per numeric column: histogram on the left, boxplot on the right.
n = len(num_cols)
# Use exactly n rows (the previous 2*n-row grid left the bottom half of a
# 100-inch-tall figure empty) while keeping about 5 inches of height per row.
fig, axes = plt.subplots(n, 2, figsize=(10, 5 * n), dpi=100, squeeze=False)
for i, col in enumerate(num_cols):
    # Drop missing values first: columns that still contain NaN at this point
    # (e.g. reviews_per_month) would otherwise produce an empty boxplot.
    values = data[col].dropna()
    hist_ax, box_ax = axes[i]
    hist_ax.hist(values, bins=10)
    hist_ax.set_title(col)
    box_ax.boxplot(values.values)
plt.tight_layout()
plt.show()

## Displaying data for categorical features

In [1]:
# Number of listings per room type; the trailing semicolon hides the Axes repr.
sns.countplot(data=data, x="room_type");

In [1]:
# Mean price per neighbourhood group, split by room type.
plt.figure(figsize=(14, 6));
# Pass x / y / hue by keyword together with `data`: recent seaborn releases
# no longer accept x and y as positional arguments.
sns.barplot(data=data, x='neighbourhood_group', y='price', hue='room_type');

## Replacing NaN values (0 for `reviews_per_month`, placeholder strings for text columns)

In [1]:
# Per-column replacements for missing values, applied in a single pass:
# a numeric zero for the review rate and sentinel strings for text columns.
data.fillna(
    {
        'reviews_per_month': 0,
        'name': "NoName",
        'host_name': "NoName",
        'last_review': "NotReviewed",
    },
    inplace=True,
)

In [1]:
# Re-count missing values per column after the fill step above.
data.isnull().sum()

In [1]:
# Preview the first rows after filling missing values.
data.head()

## Processing of categorical features

In [1]:
# Categorical columns that get label-encoded for modelling.
cat_data = ['neighbourhood_group', 'room_type', 'neighbourhood']

In [1]:
# Replace each categorical column with integer codes. The single encoder is
# refitted on every column, so afterwards it only remembers the last one.
le = LabelEncoder()
for col in cat_data:
    encoded = le.fit_transform(data[col])
    data[col] = encoded.astype('int')

In [1]:
# Preview the first rows after label-encoding the categorical columns.
data.head()

## Selecting features for a dataset

In [1]:
# Columns kept for modelling: the target ('price') followed by the predictors.
feature_data = [
    'price',
    'neighbourhood_group', 'neighbourhood', 'room_type',
    'minimum_nights',
    'calculated_host_listings_count',
    'availability_365',
]

In [1]:
# Preview the first rows of the selected modelling columns.
data[feature_data].head()

In [1]:
# Keep only the modelling columns (the 'price' target plus the predictors).
next_data = data[feature_data]

In [1]:
# Target is the listing price; every other selected column is a predictor.
y = next_data['price']
X = next_data.drop(['price'], axis=1)
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Linear model baseline

In [1]:
from sklearn.linear_model import LinearRegression
# `price` is a continuous target, so a regressor is required here.
# LogisticRegression is a classifier: it would treat every distinct price as
# its own class and its score() would be accuracy, not R-squared.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [1]:
# Evaluate the fitted model on both splits via its score() method, then report.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('R-squared train score: {:.3f}'.format(train_score))
print('R-squared test score: {:.3f}'.format(test_score))


## DecisionTree and GridSearchCV


In [1]:
from sklearn.tree import DecisionTreeRegressor
# Baseline decision tree. min_samples_leaf is given as a float, i.e. a
# fraction of the training rows; random_state pins the tie-breaking between
# equally good splits so the reported scores are reproducible.
tree = DecisionTreeRegressor(min_samples_leaf=0.0001, random_state=42)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
print('R-squared score (training): {:.3f}'.format(tree.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(tree.score(X_test, y_test)))

In [1]:
# Hyper-parameter grid for the decision tree search.
# NOTE: scikit-learn reads float min_samples_leaf values as a fraction of the
# samples, while the trailing integer 1 is an absolute count of one sample.
params = {
    'max_depth': list(range(4, 12)),
    'min_samples_leaf': [0.0001, 0.001, 0.01, 0.1, 1],
}

In [1]:
# 5-fold cross-validated grid search over the decision tree hyper-parameters.
grid = GridSearchCV(tree, params, n_jobs=-1, cv=5)
grid.fit(X_train, y_train)
# Predict with the search object (its refitted best estimator). GridSearchCV
# works on clones, so `tree` itself is still the untuned model from above and
# predicting with it would just repeat y_pred_tree.
grid_pred = grid.predict(X_test)

In [1]:
# Score the tuned decision tree on both splits, then report.
grid_train_score = grid.score(X_train, y_train)
grid_test_score = grid.score(X_test, y_test)
print('R-squared score (training): {:.3f}'.format(grid_train_score))
print('R-squared score (test): {:.3f}'.format(grid_test_score))

## RandomForestRegressor and GridSearchCV


In [1]:
from sklearn.ensemble import RandomForestRegressor
# Random forest capped at depth 9. The forest is a randomised estimator, so
# pin random_state to get the same scores on every Restart & Run All.
regressor = RandomForestRegressor(max_depth=9, random_state=42)
regressor.fit(X_train, y_train)
reg_pred = regressor.predict(X_test)

In [1]:
# Score the random forest on both splits, then report.
forest_train_score = regressor.score(X_train, y_train)
forest_test_score = regressor.score(X_test, y_test)
print('R-squared score (training): {:.3f}'.format(forest_train_score))
print('R-squared score (test): {:.3f}'.format(forest_test_score))

In [1]:
# Hyper-parameter grid for the random forest search (same values as the
# decision tree grid).
# NOTE: scikit-learn reads float min_samples_leaf values as a fraction of the
# samples, while the trailing integer 1 is an absolute count of one sample.
params_tree = {
    'max_depth': list(range(4, 12)),
    'min_samples_leaf': [0.0001, 0.001, 0.01, 0.1, 1],
}

In [1]:
# 5-fold cross-validated grid search over the random forest hyper-parameters.
regressor_grid = GridSearchCV(regressor, params_tree, cv=5, n_jobs=-1)
regressor_grid.fit(X_train, y_train)
# Keep the test-set predictions in a named variable, as the other model cells
# do, instead of computing them only to echo the raw array as cell output.
regressor_grid_pred = regressor_grid.predict(X_test)

In [1]:
# Score the tuned random forest on both splits, then report.
tuned_forest_train_score = regressor_grid.score(X_train, y_train)
tuned_forest_test_score = regressor_grid.score(X_test, y_test)
print('R-squared score (training): {:.3f}'.format(tuned_forest_train_score))
print('R-squared score (test): {:.3f}'.format(tuned_forest_test_score))