In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [3]:
# Load the tour-package dataset (package name + free-text description) from Excel.
# The path is relative, so the workbook must sit in the kernel's working directory.
data = pd.read_excel('./just__test.xlsx')

In [4]:
# Preview the first rows to confirm the expected columns were loaded.
data.head()


Unnamed: 0,package name,description
0,Kalinchowk Tour,Kalinchowk is recognized as the snowy paradise...
1,Pokhara Tour,If you are an adventure seekers person and enj...
2,Pokhara Ultimate Sky Adventure,Experience the thrill of soaring through the o...
3,Pokhara Aerial Thrills and Chills,Take your adventure to new heights with this p...
4,Pokhara Canyon Explorer,Brace yourself for an adrenaline-filled advent...


In [5]:
# Per-column summary (count / unique / top / freq for these text columns).
# NOTE(review): the recorded output shows 148 unique names for 150 rows, so
# 'package name' is not a unique key -- confirm whether duplicates are intended.
data.describe()

Unnamed: 0,package name,description
count,150,150
unique,148,150
top,Rara Lake Trek,Kalinchowk is recognized as the snowy paradise...
freq,2,1


In [6]:
# Preprocess the text data
def preprocess_text(text, stop_words=None, stemmer=None):
    """Normalize a free-text description for TF-IDF vectorization.

    Steps: lowercase, strip punctuation/special characters, split on
    whitespace, drop stop words, stem each remaining token, and re-join the
    tokens with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw description. ``None`` and NaN (how pandas represents an empty
        cell) yield an empty string instead of raising; any other non-string
        value is converted with ``str()``.
    stop_words : container of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which
        requires the NLTK ``stopwords`` corpus to be available locally.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to a new ``PorterStemmer``. Pass a shared instance (and a
        prebuilt ``stop_words`` set) to avoid rebuilding them for every row.

    Returns
    -------
    str
        Space-separated stemmed tokens; ``''`` when nothing remains.
    """
    # Missing cells arrive as None or float NaN; NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    if stemmer is None:
        stemmer = PorterStemmer()

    # Lowercase, then drop everything that is neither a word character nor
    # whitespace (note: \w keeps digits and underscores).
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    # str.split() with no argument already collapses runs of whitespace, so
    # no separate whitespace-normalization pass is needed.
    tokens = [
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in stop_words
    ]

    return ' '.join(tokens)

In [7]:
# Apply preprocessing to the description column.
# Keep a one-time copy of the raw text and always derive the preprocessed
# column from it, so re-running this cell does not preprocess text that has
# already been lowercased, filtered and stemmed.
if 'description_raw' not in data.columns:
    data['description_raw'] = data['description']
data['description'] = data['description_raw'].apply(preprocess_text)

In [8]:


# Initialize the TF-IDF vectorizer.
# No arguments are passed, so all settings are the scikit-learn defaults.
tfidf_vectorizer = TfidfVectorizer()

In [9]:
# Compute the TF-IDF matrix: fit the vectorizer on the description column and
# transform it in the same call. By this point the column holds the
# preprocessed text, and each row of the result corresponds to one row of `data`.
tfidf_matrix = tfidf_vectorizer.fit_transform(data['description'])

In [10]:
# Pairwise cosine similarity between all package descriptions.
# With a single argument the matrix is compared against itself, giving a
# square matrix whose entry [i, j] relates row i to row j of `data`.
cosine_similarities = cosine_similarity(tfidf_matrix)

In [11]:
# `cosine_similarities` is computed by the preceding cell, so it is not
# recomputed here. Validate its shape instead: one row and one column per package.
assert cosine_similarities.shape == (len(data), len(data)), \
    "similarity matrix must be square with one row/column per package"

In [12]:
# Persist the precomputed cosine-similarity matrix to disk with pickle.
# Only the matrix itself is stored; its row/column order follows the rows of `data`.
with open('just__test_model.pkl', 'wb') as model_file:
    pickle.dump(cosine_similarities, model_file)