#Applying Linear Regression to estimate the cost 

In [51]:
%pip install --upgrade scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [1]:
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd 
import numpy as np



Loading csv and applying One hot encoding

In [2]:
df=pd.read_csv("../data/cleaned_destinations.csv")
df= pd.get_dummies(df,columns=['category','city','best_season'])


In [3]:
vectorizer= CountVectorizer(token_pattern=r"(?u)\b\w+\b")
activity_matrix = vectorizer.fit_transform(df['popular_activities'])
activity_df = pd.DataFrame(activity_matrix.toarray(), columns=vectorizer.get_feature_names_out())

Combine popular_activites in main dataframe

In [4]:
df = pd.concat([df.drop('popular_activities', axis=1), activity_df], axis=1)

In [5]:
X= df.drop(['estimated_cost','place_name','description'],axis=1)
y = np.log(df['estimated_cost']) 

X.shape


(202, 95)

In [6]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size= 0.2 ,random_state=42)

model=LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
y_pred_original_scale = np.exp(y_pred) 
print("Model accuracy: ",model.score(X_test,y_test))


Model accuracy:  0.35576396680747957


Using Bagging to check the models accuracy

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import joblib 


df = pd.read_csv("../data/cleaned_destinations.csv")

# Drop non-numeric columns including popular_activities
categorical_cols = ['category', 'city', 'best_season']
X = df.drop(['place_name', 'description', 'estimated_cost', 'popular_activities'], axis=1)

# One-hot encode categorical columns
X = pd.get_dummies(X, columns=categorical_cols)

X = X.astype(int)
y = df['estimated_cost']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Bagging regressor with decision tree base
base_model = DecisionTreeRegressor(max_depth=5, random_state=42)
bagging_model = BaggingRegressor(
    
    n_estimators=50,
    random_state=42
)

bagging_model.fit(X_train, y_train)

y_pred = bagging_model.predict(X_test)

print("Predicted values:", y_pred)

joblib.dump(bagging_model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(X.columns.tolist(), "features.pkl")


Predicted values: [2133.24       3746.06       3168.19333333 2214.6        1971.52
 1300.04666667 3660.21333333 2682.84857143 1902.34       3782.8
 2344.128      2765.936      1971.52       2841.8        4387.8
 3566.42857143 3345.50047619 3100.956      2432.08       2878.27428571
 3743.78       2743.34       1735.14333333 3346.98       1992.43
 2482.59428571 3855.42       3611.23714286 1300.         3635.44
 2399.18       4121.65428571 3384.27       3582.03714286 2264.48
 5140.         2742.64       1685.46733333 4387.8        3924.9
 3724.81333333]


['features.pkl']

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)   # Manually calculate RMSE
r2 = r2_score(y_test, y_pred)

print(f"Model Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R^2 Score: {r2:.4f}")


Model Performance:
MAE: 552.43
RMSE: 799.55
R^2 Score: 0.4968


In [9]:
# import pandas as pd
# from sklearn.preprocessing import StandardScaler
# import joblib

# # Load your model, scaler, and data
# df = pd.read_csv("../data/cleaned_destinations.csv")
# bagging_model = joblib.load("../models/model.pkl")      
# scaler = joblib.load("../models/scaler.pkl")             
# model_feature_columns = joblib.load("../models/features.pkl")  # List of columns used during training

# # Reference lists for inputs
# categories = sorted(df['category'].unique())
# cities = sorted(df['city'].unique())
# seasons = sorted(df['best_season'].unique())

# # Function to safely get user selection
# def select_from_list(prompt, options):
#     print(f"\n{prompt}")
#     for i, val in enumerate(options, 1):
#         print(f"{i}. {val}")
#     while True:
#         try:
#             choice = int(input("Enter number: "))
#             if 1 <= choice <= len(options):
#                 return options[choice - 1]
#             else:
#                 print("Invalid number. Try again.")
#         except ValueError:
#             print("Please enter a valid number.")

# # Get input
# def get_user_input():
#     try:
#         duration_days = int(input("Enter duration of trip in days: "))
#     except ValueError:
#         duration_days = 3  # Default fallback
#     category = select_from_list("Select Category:", categories)
#     city = select_from_list("Select City:", cities)
#     season = select_from_list("Select Best Season:", seasons)
#     return duration_days, category, city, season

# # Prepare features
# def prepare_features(duration, category, city, season):
#     features = {col: 0 for col in model_feature_columns}
#     features['duration_days'] = duration
#     for col in [f"category_{category}", f"city_{city}", f"best_season_{season}"]:
#         if col in features:
#             features[col] = 1
#     df_input = pd.DataFrame([features])
#     X_scaled = scaler.transform(df_input)
#     return X_scaled

# # Run prediction
# if __name__ == "__main__":
#     duration, category, city, season = get_user_input()
#     X_user = prepare_features(duration, category, city, season)
#     predicted_cost = bagging_model.predict(X_user)[0]
#     print(f"\n✅ Estimated Trip Cost: NPR {predicted_cost:.2f}")
