In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file 

In [20]:
df = pd.read_csv('Airbnb_Open_Data.csv')



In [21]:
#summary of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102599 non-null  int64  
 1   NAME                            102349 non-null  object 
 2   host id                         102599 non-null  int64  
 3   host_identity_verified          102310 non-null  object 
 4   host name                       102193 non-null  object 
 5   neighbourhood group             102570 non-null  object 
 6   neighbourhood                   102583 non-null  object 
 7   lat                             102591 non-null  float64
 8   long                            102591 non-null  float64
 9   country                         102067 non-null  object 
 10  country code                    102468 non-null  object 
 11  instant_bookable                102494 non-null  object 
 12  cancellation_pol

In [22]:
#to check the datatype of each column
df.dtypes

id                                  int64
NAME                               object
host id                             int64
host_identity_verified             object
host name                          object
neighbourhood group                object
neighbourhood                      object
lat                               float64
long                              float64
country                            object
country code                       object
instant_bookable                   object
cancellation_policy                object
room type                          object
Construction year                 float64
price                              object
service fee                        object
minimum nights                    float64
number of reviews                 float64
last review                        object
reviews per month                 float64
review rate number                float64
calculated host listings count    float64
availability 365                  

In [23]:
#display the first few rows of data
df.head()

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,...,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [24]:
#number of rows and columns
df.shape

(102599, 26)

In [25]:
# Drop unnecessary columns
df.drop(['host id','NAME', 'host_identity_verified', 'host name', 'country', 'country code', 'license','last review'], axis=1, inplace=True)

In [26]:
#Remove duplicate rows based on all columns

df.drop_duplicates(inplace=True)
df.shape

(102058, 18)

In [27]:
#missing values
missing_values = df.isnull().sum()
missing_values

id                                    0
neighbourhood group                  29
neighbourhood                        16
lat                                   8
long                                  8
instant_bookable                    105
cancellation_policy                  76
room type                             0
Construction year                   214
price                               247
service fee                         273
minimum nights                      400
number of reviews                   183
reviews per month                 15818
review rate number                  319
calculated host listings count      319
availability 365                    448
house_rules                       51842
dtype: int64

In [28]:
df.dtypes


id                                  int64
neighbourhood group                object
neighbourhood                      object
lat                               float64
long                              float64
instant_bookable                   object
cancellation_policy                object
room type                          object
Construction year                 float64
price                              object
service fee                        object
minimum nights                    float64
number of reviews                 float64
reviews per month                 float64
review rate number                float64
calculated host listings count    float64
availability 365                  float64
house_rules                        object
dtype: object

In [29]:
num_columns = ['Construction year', 'minimum nights', 'number of reviews',
                     'reviews per month', 'review rate number', 'calculated host listings count',
                     'availability 365']
for i in num_columns:
    df[i].fillna(df[i].median(), inplace=True)

In [30]:
# For categorical columns, we can consider filling missing values with the mode
categorical_columns = ['neighbourhood group', 'neighbourhood','instant_bookable', 'cancellation_policy',
                       'room type', 'house_rules']
for col in categorical_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [31]:
df['price'].info()


<class 'pandas.core.series.Series'>
Int64Index: 102058 entries, 0 to 102057
Series name: price
Non-Null Count   Dtype 
--------------   ----- 
101811 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


In [32]:
# Data Transformation
#df['price'] = df['price'].replace('$', '',regex=True).astype(float)
df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)

In [33]:
df = df.dropna(subset=['price','service fee'])


In [34]:
# Checking the dataset after handling missing values
df.isnull().sum(), df.shape

(id                                0
 neighbourhood group               0
 neighbourhood                     0
 lat                               8
 long                              8
 instant_bookable                  0
 cancellation_policy               0
 room type                         0
 Construction year                 0
 price                             0
 service fee                       0
 minimum nights                    0
 number of reviews                 0
 reviews per month                 0
 review rate number                0
 calculated host listings count    0
 availability 365                  0
 house_rules                       0
 dtype: int64,
 (101572, 18))

In [35]:
# Grouping by 'neighbourhood' and calculating the mean 'lat' and 'long'
mean_coords = df.groupby('neighbourhood')[['lat', 'long']].mean()

# Applying the mean coordinates to missing values
df = df.set_index('neighbourhood')
df['lat'].fillna(mean_coords['lat'], inplace=True)
df['long'].fillna(mean_coords['long'], inplace=True)
df.reset_index(inplace=True)

In [36]:
df.isnull().sum(), df.shape


(neighbourhood                     0
 id                                0
 neighbourhood group               0
 lat                               0
 long                              0
 instant_bookable                  0
 cancellation_policy               0
 room type                         0
 Construction year                 0
 price                             0
 service fee                       0
 minimum nights                    0
 number of reviews                 0
 reviews per month                 0
 review rate number                0
 calculated host listings count    0
 availability 365                  0
 house_rules                       0
 dtype: int64,
 (101572, 18))

In [37]:
#List the count of various room types avaliable with Airbnb
room_type_counts = df ['room type'].value_counts()
print("Count of Various Room Types Available:")
print(room_type_counts)

Count of Various Room Types Available:
Entire home/apt    53182
Private room       46080
Shared room         2197
Hotel room           113
Name: room type, dtype: int64


In [40]:
import numpy as np

# Selected numeric columns where I wanted to handle outliers
numeric_columns = ['price', 'service fee']

# Looping through each numeric column to calculate and handle outliers
for column_name in numeric_columns:
    
    # Calculated the 1st and 3rd quartiles (Q1 and Q3) to find the interquartile range (IQR)
    Q1 = np.percentile(df[column_name], 25, interpolation='midpoint')  # 25th percentile
    Q3 = np.percentile(df[column_name], 75, interpolation='midpoint')  # 75th percentile
    
    # Calculated the Interquartile Range (IQR) and established upper and lower bounds for outliers
    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    up_lim = Q3 + 1.5 * IQR
    
    # Identified any data points outside the upper and lower bounds, classifying them as outliers
    outliers = df[(df[column_name] < low_lim) | (df[column_name] > up_lim)][column_name]
    
    # Replaced values below the lower limit and above the upper limit with their respective bounds
    df[column_name] = np.where(df[column_name] < low_lim, low_lim, df[column_name])
    df[column_name] = np.where(df[column_name] > up_lim, up_lim, df[column_name])

# Outliers were now addressed for the 'price' and 'service fee' columns


In [41]:
from sklearn.preprocessing import StandardScaler

# Initializing the StandardScaler
scaler = StandardScaler()

# Identifying the numeric columns to scale, ensuring outliers were already handled
numeric_columns = ['price', 'service fee']

# Fitting the scaler to the numeric columns and transforming the data
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# At this point, the 'price' and 'service fee' columns are now standardized (mean=0, variance=1)


In [42]:
# Selecting the numeric columns where outliers need to be handled
numeric_columns = ['price', 'service fee', 'minimum nights', 'number of reviews']

for column_name in numeric_columns:
    # Calculating the 1st quartile (Q1) and 3rd quartile (Q3) using the 'midpoint' interpolation method
    Q1 = np.percentile(df[column_name], 25, interpolation='midpoint')
    Q3 = np.percentile(df[column_name], 75, interpolation='midpoint')

    # Calculating the Interquartile Range (IQR)
    IQR = Q3 - Q1

    # Defining the lower and upper limit for outliers
    low_lim = Q1 - 1.5 * IQR
    up_lim = Q3 + 1.5 * IQR

    # Identifying outliers in the column
    outliers = df[(df[column_name] < low_lim) | (df[column_name] > up_lim)][column_name]

    # Replacing outliers with the respective lower or upper limit
    df[column_name] = np.where(df[column_name] < low_lim, low_lim, df[column_name])
    df[column_name] = np.where(df[column_name] > up_lim, up_lim, df[column_name])


In [43]:
# First, ensure any necessary libraries are imported
import pandas as pd

# Assuming 'df' is your dataframe
# Separate features (X) and target variable (y)
X = df.drop('price', axis=1)  # Dropping the 'price' column to keep it as the target variable
y = df['price']  # The 'price' column becomes the target variable

# Check the shape of X and y to confirm they have been split correctly
print("Shape of X (features):", X.shape)
print("Shape of y (target):", y.shape)


Shape of X (features): (101572, 17)
Shape of y (target): (101572,)


In [44]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20)

In [47]:
# Check for None values in X_train and y_train
print("X_train is None:", X_train is None)
print("y_train is None:", y_train is None)

# Check the shapes of X_train and y_train
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Ensure y_train is a numeric type
print("y_train data types:\n", y_train.value_counts())

# Convert y_train to a numeric type if necessary
y_train = y_train.astype(int)

# If X_train contains categorical features, encode them
if X_train.select_dtypes(include=['object']).shape[1] > 0:
    X_train = pd.get_dummies(X_train)

# Re-check the shapes after any modifications
print("Modified X_train shape:", X_train.shape)

# Now try fitting the logistic regression model again
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(C=1, penalty='l2', solver='liblinear', max_iter=200)
log_reg.fit(X_train, y_train)

# Coefficients and accuracy output as before
coefficients = log_reg.coef_
intercept = log_reg.intercept_
train_accuracy = log_reg.score(X_train, y_train)

print("Coefficients:", coefficients)
print("Intercept:", intercept)
print("Training Accuracy:", train_accuracy)


X_train is None: False
y_train is None: False
X_train shape: (81257, 17)
y_train shape: (81257,)
y_train data types:
 -1.264134    107
 0.626414    103
-0.434946    102
 1.298810    101
 1.232475    100
            ... 
 0.557064     43
-1.321423     42
 0.484698     42
-1.616916     42
-0.483190     41
Name: price, Length: 1151, dtype: int64
Modified X_train shape: (81257, 2204)
Coefficients: [[-3.63642650e-08 -6.53533238e-10  1.18666480e-09 ... -1.29614991e-14
   9.35830151e-17  4.40464601e-15]
 [ 8.14378606e-09  1.38875165e-14 -2.52133947e-14 ...  9.22551367e-20
  -9.85082476e-20 -2.18335091e-19]
 [-3.59038810e-08 -6.47138228e-10  1.17495661e-09 ...  3.20023401e-15
  -3.15468120e-15 -2.15505016e-15]]
Intercept: [-1.60461310e-11  3.40956451e-16 -1.58876809e-11]
Training Accuracy: 0.5765164847336229


In [63]:
from sklearn.tree import DecisionTreeClassifier

# Check for None values
print("X_train is None:", X_train is None)
print("y_train is None:", y_train is None)

# Check the shapes of X_train and y_train
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Ensure y_train is a numeric type
print("y_train data types:\n", y_train.value_counts())

# Convert y_train to a numeric type if necessary
y_train = y_train.astype(int)

# Encode categorical features in X_train, if any
if X_train.select_dtypes(include=['object']).shape[1] > 0:
    X_train = pd.get_dummies(X_train)

# Re-check the shape of X_train after modifications
print("Modified X_train shape:", X_train.shape)

# Fit the Decision Tree model
decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42)
decision_tree.fit(X_train, y_train)

# Output the training accuracy
train_accuracy = decision_tree.score(X_train, y_train)
print("Training Accuracy (Decision Tree):", train_accuracy)


X_train is None: False
y_train is None: False
X_train shape: (81257, 17)
y_train shape: (81257,)
y_train data types:
 -1.264134    109
-0.434946    106
 0.780190    101
 1.045530    101
 0.626414    100
            ... 
 0.484698     42
 1.380221     42
-1.616916     42
 0.448515     41
 0.897784     41
Name: price, Length: 1151, dtype: int64
Modified X_train shape: (81257, 2210)
Training Accuracy (Decision Tree): 0.9984493643624549


In [64]:
from sklearn.ensemble import RandomForestClassifier

# Check for None values
print("X_train is None:", X_train is None)
print("y_train is None:", y_train is None)

# Check the shapes of X_train and y_train
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Ensure y_train is a numeric type
print("y_train data types:\n", y_train.value_counts())

# Convert y_train to a numeric type if necessary
y_train = y_train.astype(int)

# Encode categorical features in X_train, if any
if X_train.select_dtypes(include=['object']).shape[1] > 0:
    X_train = pd.get_dummies(X_train)

# Re-check the shape of X_train after modifications
print("Modified X_train shape:", X_train.shape)

# Fit the Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Output the training accuracy
train_accuracy = random_forest.score(X_train, y_train)
print("Training Accuracy (Random Forest):", train_accuracy)


X_train is None: False
y_train is None: False
X_train shape: (81257, 2210)
y_train shape: (81257,)
y_train data types:
  0    46858
 1    17303
-1    17096
Name: price, dtype: int64
Modified X_train shape: (81257, 2210)
Training Accuracy (Random Forest): 1.0


In [72]:
# Check the type of y_test
print("y_test data type:", y_test.dtype)

# Convert y_test to integer if it's not already
if not pd.api.types.is_integer_dtype(y_test):
    y_test = y_test.astype(int)

# Check the type of y_pred_tree
print("y_pred_tree data type:", y_pred_tree.dtype)

# Convert y_pred_tree to integer if it's not already
if not pd.api.types.is_integer_dtype(y_pred_tree):
    y_pred_tree = y_pred_tree.astype(int)

# Now proceed with evaluation metrics
accuracy_tree = accuracy_score(y_test, y_pred_tree)
precision_tree = precision_score(y_test, y_pred_tree, average='weighted')
recall_tree = recall_score(y_test, y_pred_tree, average='weighted')
f1_tree = f1_score(y_test, y_pred_tree, average='weighted')

print("\nDecision Tree Metrics:")
print(f"Accuracy: {accuracy_tree}")
print(f"Precision: {precision_tree}")
print(f"Recall: {recall_tree}")
print(f"F1-Score: {f1_tree}")

y_test data type: int32
y_pred_tree data type: int32

Decision Tree Metrics:
Accuracy: 0.9984248092542456
Precision: 0.9984290849437752
Recall: 0.9984248092542456
F1-Score: 0.9984243457666647


In [74]:
# Check for categorical features in X_test
categorical_cols = X_test.select_dtypes(include=['object']).columns
print("Categorical columns in X_test:", categorical_cols)

# If there are categorical columns, apply one-hot encoding
if len(categorical_cols) > 0:
    X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
    
# Make sure X_test has the same columns as X_train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Now, make predictions with the Random Forest model
y_pred_rf = random_forest.predict(X_test)

# Proceed with the evaluation metrics
print("y_test data type:", y_test.dtype)

# Convert y_test to integer if it's not already
if not pd.api.types.is_integer_dtype(y_test):
    y_test = y_test.astype(int)

# Convert y_pred_rf to integer if it's not already
if not pd.api.types.is_integer_dtype(y_pred_rf):
    y_pred_rf = y_pred_rf.astype(int)

# Now proceed with evaluation metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

# Display Random Forest evaluation results
print("\nRandom Forest Metrics:")
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1-Score: {f1_rf}")


Categorical columns in X_test: Index(['neighbourhood', 'neighbourhood group', 'cancellation_policy',
       'room type', 'house_rules'],
      dtype='object')
y_test data type: int32

Random Forest Metrics:
Accuracy: 0.99857248338666
Precision: 0.9985759958575396
Recall: 0.99857248338666
F1-Score: 0.9985721101003486


In [75]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Check for categorical features in X_test
categorical_cols = X_test.select_dtypes(include=['object']).columns
print("Categorical columns in X_test:", categorical_cols)

# If there are categorical columns, apply one-hot encoding
if len(categorical_cols) > 0:
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
    X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
    
    # Align columns in X_test with X_train
    X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
else:
    X_train_encoded = X_train
    X_test_encoded = X_test

# Fit the Naïve Bayes model
naive_bayes = GaussianNB()
naive_bayes.fit(X_train_encoded, y_train)

# Make predictions
y_pred_nb = naive_bayes.predict(X_test_encoded)

# Check the type of y_test
print("y_test data type:", y_test.dtype)

# Convert y_test to integer if it's not already
if not pd.api.types.is_integer_dtype(y_test):
    y_test = y_test.astype(int)

# Convert y_pred_nb to integer if it's not already
if not pd.api.types.is_integer_dtype(y_pred_nb):
    y_pred_nb = y_pred_nb.astype(int)

# Now proceed with evaluation metrics
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')

# Display Naïve Bayes evaluation results
print("\nNaïve Bayes Metrics:")
print(f"Accuracy: {accuracy_nb}")
print(f"Precision: {precision_nb}")
print(f"Recall: {recall_nb}")
print(f"F1-Score: {f1_nb}")


Categorical columns in X_test: Index([], dtype='object')
y_test data type: int32

Naïve Bayes Metrics:
Accuracy: 0.578734924932316
Precision: 0.3349341133364135
Recall: 0.578734924932316
F1-Score: 0.42430696635253423
