In [3]:
import pandas as pd
import urllib.request

#define the correct URL for the raw CSV file
csv_url = "https://raw.githubusercontent.com/Stravl/Stravl-Data/main/Stravl_Travel_Preference_Data.csv"
csv_file_path = "Stravl_Travel_Preference_Data.csv"

#download the CSV file
print("Downloading dataset...")
urllib.request.urlretrieve(csv_url, csv_file_path)
print(f"Download complete. File saved as: {csv_file_path}")

df = pd.read_csv(csv_file_path)

print("Preview of the dataset:")
print(df.head())

Downloading dataset...
Download complete. File saved as: Stravl_Travel_Preference_Data.csv
Preview of the dataset:
                                 id      form_a form_b form_c  \
0  22871c2cb8ee11ec9a19fefb5d5b7ead  ['0', '1']  ['1']  ['2']   
1  89b7022ab8f011ec9a19fefb5d5b7ead  ['0', '1']  ['3']  ['2']   
2  c64f6c36b8f011ec9a19fefb5d5b7ead       ['1']  ['1']  ['2']   
3  1eb6079ab8f111ec9a19fefb5d5b7ead       ['1']  ['1']  ['2']   
4  285e9852b8f111ec9a19fefb5d5b7ead       ['1']  ['0']  ['1']   

                           form_f                form_g form_h form_i form_j  \
0            ['0', '1', '2', '7']       ['0', '2', '4']  ['1']  ['1']  ['1']   
1  ['0', '3', '4', '5', '6', '7']  ['0', '1', '2', '4']  ['1']  ['1']  ['1']   
2       ['0', '1', '3', '4', '7']  ['0', '2', '4', '7']  ['1']  ['1']  ['1']   
3                           ['7']                 ['4']  ['1']  ['2']  ['1']   
4                           ['2']       ['2', '3', '4']  ['0']  ['1']  ['0']   

  form_k  ...

In [4]:
import requests

#destination file
url = "https://raw.githubusercontent.com/Stravl/Stravl-Data/main/destination_ids.txt"

#fetch the file content
response = requests.get(url)
response.raise_for_status()  

#load the destination mappings
destination_mapping = {i: line.strip() for i, line in enumerate(response.text.splitlines())}

# # Load destination mappings
# destination_file = r"C:\Users\sithu\OneDrive\Desktop\Stravl-Data-main\destination_ids.txt"
# with open(destination_file, "r", encoding="utf-8") as f:
#     destination_mapping = {i: line.strip() for i, line in enumerate(f.readlines())}

In [7]:
import ast

#decode swipe data
def decode_swipes(swipe_data):
    if pd.isna(swipe_data) or swipe_data == "[]":
        return []
    indices = ast.literal_eval(swipe_data)
    return [destination_mapping.get(i, f"Unknown({i})") for i in indices]

df['yes_swipes'] = df['yes_swipes'].apply(decode_swipes)
df['no_swipes'] = df['no_swipes'].apply(decode_swipes)
df['maybe_swipes'] = df['maybe_swipes'].apply(decode_swipes)

print("Decoding completed.")

Decoding completed.


In [9]:
print(df.columns)

Index(['id', 'form_a', 'form_b', 'form_c', 'form_f', 'form_g', 'form_h',
       'form_i', 'form_j', 'form_k', 'form_r', 'form_rr', 'yes_swipes',
       'no_swipes', 'maybe_swipes', 'Model', 'Retrieval', 'DynaMatch',
       'Rating_0', 'Rating_1', 'Rating_2', 'Rating_3', 'Rating_4', 'Rating_5',
       'Rating_6', 'Rating_7', 'Rating_8', 'Rating_9', 'Rec_0', 'Rec_1',
       'Rec_2', 'Rec_3', 'Rec_4', 'Rec_5', 'Rec_6', 'Rec_7', 'Rec_8', 'Rec_9'],
      dtype='object')


In [11]:
import ast
import pandas as pd

#form mappings
form_mappings = {
    "form_a": {0: "0-19", 1: "20-39", 2: "40-59", 3: "60+"},
    "form_b": {0: "$0-$49", 1: "$50-$99", 2: "$100-$249", 3: "$300+"},
    "form_c": {0: "Winter", 1: "Spring", 2: "Summer", 3: "Fall"},
    "form_f": {0: "Beach", 1: "Adventure", 2: "Nature", 3: "Culture", 4: "Nightlife", 5: "History", 6: "Shopping", 7: "Cuisine"},
    "form_g": {0: "Urban", 1: "Rural", 2: "Sea", 3: "Mountain", 4: "Lake", 5: "Desert", 6: "Plains", 7: "Jungle"},
    "form_h": {0: "Chill & Relaxed", 1: "Balanced", 2: "Active"},
    "form_i": {0: "Very Safety Conscious", 1: "Balanced", 2: "Ready for Anything"},
    "form_j": {0: "Off the Beaten Path", 1: "Classic Spot", 2: "Mainstream & Trendy"},
    "form_r": {0: "Anywhere", 1: "Specific Regions"},
}

form_rr_mapping = {
    "e": "Europe", "n": "N. America", "c": "Caribbean", "a": "Asia",
    "s": "S. America", "m": "Mid. East", "f": "Africa", "o": "Oceania"
}

#function to decode general form responses
def decode_form_response(value, mapping):
    if pd.isna(value):
        return None  # Handle missing values
    
    #if the value is already a list, decode each item
    if isinstance(value, list):
        return [mapping.get(int(i), f"Unknown({i})") for i in value]
    
    #if value is a string that looks like a list 
    if isinstance(value, str) and value.startswith("["):
        try:
            indices = ast.literal_eval(value)
            return [mapping.get(int(i), f"Unknown({i})") for i in indices]
        except (ValueError, SyntaxError): 
            return f"Invalid list format: {value}"
    
    try:
        return mapping.get(int(value), f"Unknown({value})")
    except (ValueError, TypeError): 
        return f"Invalid value: {value}"

#function to decode form_rr separately
def decode_form_rr(value, mapping):
    if pd.isna(value):
        return None  # Handle missing values
    
    #if it's a string representing multiple values (e.g., "[n,s]")
    if isinstance(value, str) and value.startswith("["):
        try:
            indices = ast.literal_eval(value)
            return [mapping.get(i, f"Unknown({i})") for i in indices]
        except (ValueError, SyntaxError):  # Handle parsing errors
            return f"Invalid list format: {value}"
    
    return mapping.get(value, f"Unknown({value})")

#apply decoding to form columns (excluding form_rr)
for column, mapping in form_mappings.items():
    if column in df.columns:
        df[column + "_decoded"] = df[column].apply(lambda x: decode_form_response(x, mapping))

#apply decoding to form_rr separately
if "form_rr" in df.columns:
    df["form_rr_decoded"] = df["form_rr"].apply(lambda x: decode_form_rr(x, form_rr_mapping))

# Print the updated DataFrame
print(df)

                                     id                form_a  \
0      22871c2cb8ee11ec9a19fefb5d5b7ead            ['0', '1']   
1      89b7022ab8f011ec9a19fefb5d5b7ead            ['0', '1']   
2      c64f6c36b8f011ec9a19fefb5d5b7ead                 ['1']   
3      1eb6079ab8f111ec9a19fefb5d5b7ead                 ['1']   
4      285e9852b8f111ec9a19fefb5d5b7ead                 ['1']   
...                                 ...                   ...   
80296  61b5e56ed28211edadc00e96bc2eb5a0  ['0', '1', '2', '3']   
80297  259dade6d28211edb561563501293886  ['0', '1', '2', '3']   
80298  53a67fecd28211ed8ef3560e516905ce  ['0', '1', '2', '3']   
80299  5e9fea64d28211ed8ef3560e516905ce  ['0', '1', '2', '3']   
80300  7315bb72d28211ed8ef3560e516905ce  ['0', '1', '2', '3']   

                     form_b form_c                          form_f  \
0                     ['1']  ['2']            ['0', '1', '2', '7']   
1                     ['3']  ['2']  ['0', '3', '4', '5', '6', '7']   
2        

In [13]:
df.to_csv("decoded_data.csv", index=False)
df = pd.read_csv("decoded_data.csv")

In [15]:
#handling missing values
df.fillna("Unknown", inplace=True) 

df_encoded = pd.get_dummies(df, drop_first=True)

  df.fillna("Unknown", inplace=True)


MemoryError: Unable to allocate 6.01 GiB for an array with shape (80300, 80301) and data type bool

In [17]:
import pandas as pd

#load your dataset
df = pd.read_csv("decoded_data.csv")

#categorical columns to apply One-Hot Encoding
categorical_columns = ["form_a", "form_b", "form_c", "form_f_decoded", "form_g_decoded", 
                        "form_h", "form_i", "form_j", "form_r"]

#apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

df_encoded.to_csv("encoded_data.csv", index=False)

print(df_encoded.head())

                                 id                          form_f  \
0  22871c2cb8ee11ec9a19fefb5d5b7ead            ['0', '1', '2', '7']   
1  89b7022ab8f011ec9a19fefb5d5b7ead  ['0', '3', '4', '5', '6', '7']   
2  c64f6c36b8f011ec9a19fefb5d5b7ead       ['0', '1', '3', '4', '7']   
3  1eb6079ab8f111ec9a19fefb5d5b7ead                           ['7']   
4  285e9852b8f111ec9a19fefb5d5b7ead                           ['2']   

                 form_g form_k form_rr  \
0       ['0', '2', '4']  ['2']     NaN   
1  ['0', '1', '2', '4']  ['1']     NaN   
2  ['0', '2', '4', '7']  ['0']     NaN   
3                 ['4']  ['1']     NaN   
4       ['2', '3', '4']  ['0']     NaN   

                                          yes_swipes  \
0  ['Dubai, United Arab Emirates', 'Geneva, Switz...   
1           ['Luxor, Egypt', 'Utrecht, Netherlands']   
2  ['Mexico City, Mexico', 'Faro, Portugal', 'McC...   
3  ['Versailles, France', 'Copenhagen, Denmark', ...   
4  ['Big Sky, Montana, United States', '

In [19]:
df = pd.read_csv("encoded_data.csv")

#sample 10,000 rows from the dataset
df_sampled = df.sample(n=10000, random_state=1026)  

df_sampled.to_csv("sampled_data1.csv", index=False)

print(df_sampled.head())

  df = pd.read_csv("encoded_data.csv")


                                     id                     form_f  \
77479  cd3dc0b6ca7811ed92b90a6e5a3aa765  ['0', '3', '4', '6', '7']   
20044  0fccaf90b93211ec8d48727d020e48c7            ['1', '2', '5']   
47213  58aeeee4b99111ec82ab5a02b93b35bb                 ['5', '7']   
21404  0660d77eb93711ec8d48727d020e48c7       ['0', '3', '4', '5']   
76072  db8ca680c02f11edbb7622d2271a10f0            ['1', '2', '3']   

                               form_g form_k form_rr  \
77479                 ['0', '2', '4']  ['2']   ['a']   
20044                 ['1', '3', '4']  ['1']     NaN   
47213                      ['1', '4']  ['1']     NaN   
21404                      ['0', '2']  ['1']     NaN   
76072  ['0', '1', '2', '3', '4', '7']  ['1']     NaN   

                             yes_swipes  \
77479  ['Tokyo, Japan', 'Kyoto, Japan']   
20044                                []   
47213          ['Newcastle, Australia']   
21404                                []   
76072                      

In [21]:
print(df.columns)

Index(['id', 'form_f', 'form_g', 'form_k', 'form_rr', 'yes_swipes',
       'no_swipes', 'maybe_swipes', 'Model', 'Retrieval',
       ...
       'form_g_decoded_['Urban', 'Sea']', 'form_g_decoded_['Urban']',
       'form_h_['1']', 'form_h_['2']', 'form_i_['1']', 'form_i_['2']',
       'form_j_['0']', 'form_j_['1']', 'form_j_['2']', 'form_r_['1']'],
      dtype='object', length=576)


In [23]:
from sklearn.model_selection import train_test_split

#split into 80% training and 20% testing
train_data, test_data = train_test_split(df_sampled, test_size=0.2, random_state=1026)

#split the training data into 80% training and 20% validation, which is 10% of the original data
train_data, val_data = train_test_split(train_data, test_size=0.125, random_state=1026)

print("Training data shape:", train_data.shape)
print("Validation data shape:", val_data.shape)
print("Test data shape:", test_data.shape)

Training data shape: (7000, 576)
Validation data shape: (1000, 576)
Test data shape: (2000, 576)


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("sampled_data1.csv")

#encode target labels (rec_0 to rec_4)
label_encoders = {}
for col in ["Rec_0", "Rec_1", "Rec_2", "Rec_3", "Rec_4"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

#select feature columns (excluding rec_0 to rec_4)
feature_columns = [col for col in df.columns if col not in ["Rec_0", "Rec_1", "Rec_2", "Rec_3", "Rec_4"]]
X = df[feature_columns]
y = df[["Rec_0", "Rec_1", "Rec_2", "Rec_3", "Rec_4"]]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize RandomForest with MultiOutputClassifier
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_model)

# Train the model
print("Training the model...")
model.fit(X_train, y_train)
print("Training complete!")

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = np.mean([accuracy_score(y_test[col], y_pred[:, i]) for i, col in enumerate(y.columns)])
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

# Generate classification reports for each recommendation column
for i, col in enumerate(y.columns):
    print(f"Classification Report for {col}:")
    print(classification_report(y_test[col], y_pred[:, i]))


Training the model...


ValueError: could not convert string to float: '7551cfd6b9e511eca62e926813093b27'

In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv("sampled_data1.csv")

# Fill missing values with "Unknown" (already handled earlier)
df.fillna("Unknown", inplace=True)

# Assuming 'Rec_0' is the target variable we want to predict
# Extract features and target column
features = df.drop(columns=['Rec_0', 'id'])  # Drop 'Rec_0' and 'id' columns
target = df['Rec_0']  # Target is 'Rec_0'

# Apply one-hot encoding to the features if not done already
features_encoded = pd.get_dummies(features, drop_first=True)

# Encoding the target variable (Rec_0) as it might have string values
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Split the data into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target_encoded, test_size=0.2, random_state=1026)

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=1026)

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Get the unique classes from the label encoder
unique_classes = label_encoder.classes_

# Ensure that y_test is a NumPy array
class_labels = np.unique(y_test)

# Generate the classification report with the correct labels
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_labels))

  df.fillna("Unknown", inplace=True)


NameError: name 'accuracy' is not defined

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Create a heatmap to visualize the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)

# Set labels for the axes
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')

# Show the plot
plt.show()