In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
merged_df = pd.read_csv(r"F:\Hero Vired - Data Science and Business Analytics Course\Capstone Project\Problem Statements-20240601\Airbnb\AirBnB Project\final_df.xls")

In [3]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318225 entries, 0 to 318224
Data columns (total 38 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   listing_id                   318225 non-null  int64  
 1   latitude                     318225 non-null  float64
 2   longitude                    318225 non-null  float64
 3   property_type                318225 non-null  object 
 4   room_type                    318225 non-null  object 
 5   accomodates                  318225 non-null  int64  
 6   bedrooms                     318225 non-null  float64
 7   beds                         318225 non-null  float64
 8   host_id                      318225 non-null  int64  
 9   bathroom_number              318225 non-null  float64
 10  bathroom_type                318225 non-null  object 
 11  Wifi                         318225 non-null  int64  
 12  Essentials                   318225 non-null  int64  
 13 

### Random Forest Regressor-Final Model

In [4]:
# Drop the original 'date' and 'host_since' columns
#merged_df = merged_df.drop(columns=['date', 'host_since'])

# Handle missing values (example: fill with median, or drop)
merged_df.fillna(merged_df.median(numeric_only=True), inplace=True)

# Specify the columns to keep
columns_to_keep = [
    'longitude', 'latitude', 'property_type', 'room_type', 'accomodates',                  
    'bedrooms', 'beds', 'bathroom_number', 'top_10_amenities_count', 
    'minimum_nights', 'maximum_nights', 'price'
]

# Select only the specified columns
merged_df = merged_df[columns_to_keep]

# One-hot encode categorical columns without dropping the first category
categorical_columns = ['property_type', 'room_type']
merged_df = pd.get_dummies(merged_df, columns=categorical_columns, drop_first=False)

# Ensure all required columns are present
required_columns = [
    'longitude', 'latitude', 'accomodates', 'bedrooms', 'beds', 'bathroom_number', 
    'top_10_amenities_count', 'minimum_nights', 'maximum_nights', 'property_type_Casa particular', 
    'property_type_Castle', 'property_type_Entire condominium (condo)', 'property_type_Entire cottage', 
    'property_type_Entire guest suite', 'property_type_Entire guesthouse', 'property_type_Entire loft', 
    'property_type_Entire rental unit', 'property_type_Entire residential home', 
    'property_type_Entire serviced apartment', 'property_type_Entire townhouse', 
    'property_type_Entire vacation home', 'property_type_Entire villa', 'property_type_Houseboat', 
    'property_type_Private room', 'property_type_Private room in bed and breakfast', 
    'property_type_Private room in boat', 'property_type_Private room in casa particular', 
    'property_type_Private room in condominium (condo)', 'property_type_Private room in guest suite', 
    'property_type_Private room in guesthouse', 'property_type_Private room in loft', 
    'property_type_Private room in religious building', 'property_type_Private room in rental unit', 
    'property_type_Private room in residential home', 'property_type_Private room in serviced apartment', 
    'property_type_Private room in townhouse', 'property_type_Private room in villa', 'property_type_Room in aparthotel', 
    'property_type_Room in boutique hotel', 'property_type_Room in hotel', 'property_type_Shared room in bed and breakfast', 
    'property_type_Shared room in casa particular', 'property_type_Shared room in loft', 
    'property_type_Shared room in residential home', 'property_type_Tent', 'property_type_Tiny house', 
    'property_type_Yurt', 'room_type_Entire home/apt', 'room_type_Hotel room', 'room_type_Private room', 
    'room_type_Shared room'
]

for col in required_columns:
    if col not in merged_df.columns:
        merged_df[col] = 0

# Reorder columns to match model's expectations
merged_df = merged_df[required_columns + ['price']]

# Select features and target variable
target = 'price'
features = merged_df.columns[merged_df.columns != target]

X = merged_df[features]
y = merged_df[target]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Output shapes of the splits to verify
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)


X_train shape: (222757, 51)
X_val shape: (47734, 51)
X_test shape: (47734, 51)
y_train shape: (222757,)
y_val shape: (47734,)
y_test shape: (47734,)


In [5]:
# Train Random Forest Regressor model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on validation set
y_val_pred = rf_model.predict(X_val)

# Evaluate model
rf_mae = mean_absolute_error(y_val, y_val_pred)
rf_mse = mean_squared_error(y_val, y_val_pred)
rf_r2 = r2_score(y_val, y_val_pred)

print("Random Forest Regressor Performance:")
print("MAE:", rf_mae)
print("MSE:", rf_mse)
print("R²:", rf_r2)

Random Forest Regressor Performance:
MAE: 10.95899213360605
MSE: 1999.7854788781265
R²: 0.9483493316882682


In [6]:
# Predict on test set
y_test_pred = rf_model.predict(X_test)

# Evaluate model on test set
test_rf_mae = mean_absolute_error(y_test, y_test_pred)
test_rf_mse = mean_squared_error(y_test, y_test_pred)
test_rf_r2 = r2_score(y_test, y_test_pred)

print("Test Set Performance for Random Forest Regressor:")
print("MAE:", test_rf_mae)
print("MSE:", test_rf_mse)
print("R²:", test_rf_r2)

Test Set Performance for Random Forest Regressor:
MAE: 10.989363787822416
MSE: 2006.9356582091523
R²: 0.9274977099756809


In [7]:
# Save the trained model to a file
model_path = r"F:\Hero Vired - Data Science and Business Analytics Course\Capstone Project\Problem Statements-20240601\Airbnb\AirBnB Project\ML-MODEL-DEPLOYMENT-USING-FLASK\random_forest_model.pkl"
scaler_path = r"F:\Hero Vired - Data Science and Business Analytics Course\Capstone Project\Problem Statements-20240601\Airbnb\AirBnB Project\ML-MODEL-DEPLOYMENT-USING-FLASK\scaler.pkl"
joblib.dump(rf_model, model_path)
joblib.dump(scaler, scaler_path)


['F:\\Hero Vired - Data Science and Business Analytics Course\\Capstone Project\\Problem Statements-20240601\\Airbnb\\AirBnB Project\\ML-MODEL-DEPLOYMENT-USING-FLASK\\scaler.pkl']

In [8]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318225 entries, 0 to 318224
Data columns (total 52 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   longitude                                          318225 non-null  float64
 1   latitude                                           318225 non-null  float64
 2   accomodates                                        318225 non-null  int64  
 3   bedrooms                                           318225 non-null  float64
 4   beds                                               318225 non-null  float64
 5   bathroom_number                                    318225 non-null  float64
 6   top_10_amenities_count                             318225 non-null  int64  
 7   minimum_nights                                     318225 non-null  int64  
 8   maximum_nights                                     318225 non-null  int64 

In [9]:
pd.set_option('display.max_columns', None)
merged_df

Unnamed: 0,longitude,latitude,accomodates,bedrooms,beds,bathroom_number,top_10_amenities_count,minimum_nights,maximum_nights,property_type_Casa particular,property_type_Castle,property_type_Entire condominium (condo),property_type_Entire cottage,property_type_Entire guest suite,property_type_Entire guesthouse,property_type_Entire loft,property_type_Entire rental unit,property_type_Entire residential home,property_type_Entire serviced apartment,property_type_Entire townhouse,property_type_Entire vacation home,property_type_Entire villa,property_type_Houseboat,property_type_Private room,property_type_Private room in bed and breakfast,property_type_Private room in boat,property_type_Private room in casa particular,property_type_Private room in condominium (condo),property_type_Private room in guest suite,property_type_Private room in guesthouse,property_type_Private room in loft,property_type_Private room in religious building,property_type_Private room in rental unit,property_type_Private room in residential home,property_type_Private room in serviced apartment,property_type_Private room in townhouse,property_type_Private room in villa,property_type_Room in aparthotel,property_type_Room in boutique hotel,property_type_Room in hotel,property_type_Shared room in bed and breakfast,property_type_Shared room in casa particular,property_type_Shared room in loft,property_type_Shared room in residential home,property_type_Tent,property_type_Tiny house,property_type_Yurt,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,price
0,4.398631,51.218575,2,1.0,1.0,1.0,9,1,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,165.0
1,4.398631,51.218575,2,1.0,1.0,1.0,9,1,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,150.0
2,4.398631,51.218575,2,1.0,1.0,1.0,9,1,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,165.0
3,4.398631,51.218575,2,1.0,1.0,1.0,9,1,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,165.0
4,4.398631,51.218575,2,1.0,1.0,1.0,9,1,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,165.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318220,4.399620,51.214580,3,1.0,1.0,1.5,8,3,365,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,150.0
318221,4.399620,51.214580,3,1.0,1.0,1.5,8,3,365,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,150.0
318222,4.399620,51.214580,3,1.0,1.0,1.5,8,3,365,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,150.0
318223,4.399620,51.214580,3,1.0,1.0,1.5,8,3,365,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,150.0


In [22]:
# Define a function to take input from the user
def get_user_input():
    user_data = {
        'latitude': float(input("Enter latitude: ")),
        'longitude': float(input("Enter longitude: ")),
        'property_type': input("Enter property type: "),
        'room_type': input("Enter room type: "),
        'accommodates': int(input("Enter number of accommodates: ")),
        'bedrooms': float(input("Enter number of bedrooms: ")),
        'beds': float(input("Enter number of beds: ")),
        'bathroom_number': float(input("Enter number of bathrooms: ")),
        'top_10_amenities_count': int(input("Enter count of top 10 amenities: ")),
        'minimum_nights': int(input("Enter minimum nights: ")),
        'maximum_nights': int(input("Enter maximum nights: ")),
            }
    return user_data

# Function to preprocess the user input
def preprocess_user_input(user_data, scaler, feature_names):
    # Convert user data to DataFrame
    user_df = pd.DataFrame([user_data])
    
    # Handle categorical variables
    categorical_columns = ['property_type', 'room_type']
    user_df_encoded = pd.get_dummies(user_df, columns=categorical_columns, drop_first=True)
    
    # Align columns of user_df_encoded with the training data columns
    for col in feature_names:
        if col not in user_df_encoded.columns:
            user_df_encoded[col] = 0

    user_df_encoded = user_df_encoded[feature_names]
    
    # Scale the user input data
    user_df_scaled = scaler.transform(user_df_encoded)
    
    return user_df_scaled

# Predict price based on user input
def predict_price(user_data, model, scaler, feature_names):
    user_df_scaled = preprocess_user_input(user_data, scaler, feature_names)
    
    # Predict the price
    price_prediction = model.predict(user_df_scaled)
    return price_prediction[0]

# Get user input
user_input = get_user_input()

# Predict the price
predicted_price = predict_price(user_input, rf_model, scaler, features)

print(f"Predicted price for the given inputs: ${predicted_price:.2f}")

Enter latitude: 4.398631
Enter longitude: 51.218575
Enter property type: Room in boutique hotel
Enter room type: Hotel room
Enter number of accommodates: 2
Enter number of bedrooms: 1
Enter number of beds: 1
Enter number of bathrooms: 1
Enter count of top 10 amenities: 9
Enter minimum nights: 1
Enter maximum nights: 60
Predicted price for the given inputs: $97.45


In [None]:
#!python --version


In [None]:
#!pip uninstall -y numpy

In [None]:
#!pip uninstall -y scikit-learn

In [None]:
#!pip install numpy==1.26.4

In [None]:
#!pip install scikit-learn==1.3.2

In [None]:
#!pip install numpy==1.26.4 --user
