In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from data import DATA_DIR
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


In [2]:
# Load the booking dataset from the project's data directory.
# The first CSV column is an unnamed, previously-saved row index; reading it
# as the index (index_col=0) keeps a spurious "Unnamed: 0" column out of the
# frame, so the training columns match the records built later for inference.
data = pd.read_csv(os.path.join(DATA_DIR, 'dummy.csv'), index_col=0)

# Show a bounded preview instead of dumping the entire frame.
data.head(10)


Unnamed: 0,Location,Device,Device_OS,Trip_description,Location_Trip,Budget,Booked
0,New York,Laptop,Android,Weekend getaway to Los Angeles,USA,3800,Booked
1,Tunisia,Phone,iOS,Explore the wonders of Berlin,Germany,2500,Not Booked
2,Tokyo,Tablet,Android,Weekend getaway to Beijing,China,4200,Booked
3,London,Laptop,Windows,Explore the wonders of Rio de Janeiro,Brazil,1800,Booked
4,Mumbai,Phone,Android,Weekend getaway to Dubai,United Arab Emirates,4500,Not Booked
5,Paris,Tablet,iOS,Explore the wonders of Cape Town,South Africa,3000,Booked
6,Singapore,Laptop,Android,Weekend getaway to Melbourne,Australia,2000,Booked
7,Berlin,Phone,iOS,Explore the wonders of Istanbul,Turkey,4800,Not Booked
8,Madrid,Tablet,Windows,Weekend getaway to Amsterdam,Netherlands,3200,Booked
9,Jakarta,Laptop,Android,Explore the wonders of Prague,Czech Republic,1500,Booked


In [3]:
# Target is the "Booked" label column; the features are every other column.
y = data["Booked"]
X = data.drop(columns=["Booked"])

# One transformer per kind of input column.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

# (name, transformer, columns) triples routed by the ColumnTransformer.
# Only the columns listed here are passed to a transformer; the other columns
# of X (e.g. "Location", "Location_Trip") are not referenced by any of them.
column_transformers = [
    ("cat", categorical_preprocessor, ["Device", "Device_OS"]),
    # The text column is selected by a bare name rather than a list.
    ("text", tfidf_vectorizer, "Trip_description"),
    ("num", numerical_preprocessor, ["Budget"]),
]

preprocessor = ColumnTransformer(transformers=column_transformers)


In [4]:
# Classifier: logistic regression fitted with the liblinear solver.
# random_state is pinned so repeated runs of the notebook give the same fit;
# liblinear is one of the solvers that uses this seed.
model = LogisticRegression(solver='liblinear', random_state=42)

# End-to-end estimator: preprocess the raw columns, then classify.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])


In [5]:
# Hold out 20% of the rows for evaluation with a fixed seed.
# stratify=y keeps the Booked / Not Booked proportions the same in both
# parts, so a very small test set cannot end up containing a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Fit the preprocessing steps and the classifier on the training rows only.
pipeline.fit(X=X_train, y=y_train)

In [7]:
# Predict labels for the held-out rows, then score them against the true
# labels as the fraction predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.5


In [8]:
# A single unseen record, described with the same feature columns as X.
new_booking = {
    "Location": "Rome",
    "Device": "Tablet",
    "Device_OS": "Android",
    "Trip_description": "Explore the wonders of Barcelona, Spain",
    "Location_Trip": "Barcelona, Spain",
    "Budget": 2800,
}

# The pipeline selects columns by name, so wrap the record in a one-row frame.
new_data = pd.DataFrame([new_booking])

# predict returns one label per input row; take the label of the only row.
predicted_labels = pipeline.predict(new_data)
booking_prediction = predicted_labels[0]
print("Predicted Booking:", booking_prediction)

Predicted Booking: Booked
