In [4]:
# !pip install pandas

In [1]:
import pandas as pd
import os

In [2]:
df = pd.read_csv(os.path.join(os.getcwd(), 'dataset', 'original_product_catalog.csv'))

In [3]:
# checking the number of rows and columns of our given data
df.shape

(1186, 5)

In [4]:
df.head(50)

Unnamed: 0,id,name,description,tags,platform
0,7320,SmartReceipts Eco friendly Digital Receipt Pri...,<p>Smart Receipt D Digital Printer a transfor...,"Smart Receipt,Digital Printer,web app,sales,re...","CSS,HTML,JavaScript,JQuery,PHP,Python"
1,7319,Ecommerce Website and Dashboard With React JS ...,"<p><span style=""color:rgb(230,126,35);""><stron...","Ecommerce Website,Dashboard,React JS,Node JS,F...","CSS,JavaScript,MongoDB,NodeJS,ReactJS,TypeScript"
2,7318,Edtech Course Selling Website Next js,<h1>Course Selling web app Next js 13 Full Sta...,"course selling website,Nextjs 14,full-stack so...","HTML,JavaScript,ReactJS"
3,7298,Read2Me Mobile application supporting blind pe...,<p><strong>Short Description:</strong> Smart r...,,
4,,,,,
5,serif;color:#000000;background-color:transpar...,serif;color:#000000;background-color:transpar...,,,
6,,,,,
7,the system automatically suggests suitable bo...,,,,
8,,,,,
9,serif;color:#000000;background-color:transpar...,,,,


In [5]:
# Checking number of Null Values in each column
df.isnull().sum()

id             33
name           37
description    40
tags           42
platform       42
dtype: int64

In [6]:
# Drop rows containing null values
df.dropna(inplace = True)

In [7]:
df.head(50)

Unnamed: 0,id,name,description,tags,platform
0,7320,SmartReceipts Eco friendly Digital Receipt Pri...,<p>Smart Receipt D Digital Printer a transfor...,"Smart Receipt,Digital Printer,web app,sales,re...","CSS,HTML,JavaScript,JQuery,PHP,Python"
1,7319,Ecommerce Website and Dashboard With React JS ...,"<p><span style=""color:rgb(230,126,35);""><stron...","Ecommerce Website,Dashboard,React JS,Node JS,F...","CSS,JavaScript,MongoDB,NodeJS,ReactJS,TypeScript"
2,7318,Edtech Course Selling Website Next js,<h1>Course Selling web app Next js 13 Full Sta...,"course selling website,Nextjs 14,full-stack so...","HTML,JavaScript,ReactJS"
45,7297,Fully customizable SwiftUI Video Player for iO...,\n\n<h1>MPlayer - Customizable Video Player fo...,"mplayer,SwiftUI,video player,customizable,cont...",Swift
46,7294,Status Saver Save and Share,"<div class=""flex-shrink-0 flex flex-col relati...","status saver,direct messaging,smart filters,lo...","kotlin,o:XML"
47,7292,10 Games For Kids Unity Source Code,<p>This bundle contains 10 complete Unity game...,"Unity games bundle,Android,iOS,PC,reskin games...","C#,Unity"
48,7291,AI Job Candidates CRM,<p>Our enhanced Flask CV Generation App is a c...,"flask app,cv generation,user-friendly,web appl...","CSS,HTML,JavaScript,Python,SQLite"
49,7290,Fruit Tile Match Unity Puzzle Game For Kids,"<div>\n<div class=""public-DraftStyleDefault-bl...","animal pair tiles,tile-matching puzzle,strateg...","C#,Unity"
50,7289,Quiz Galaxy,\n\n<h1>Quiz Game Application Source Code Prod...,"quiz game application,nextjs,react,javascript,...","CSS,HTML,JavaScript,ReactJS"
51,7288,TikPro TikTok Video Downloader Without Waterma...,"<div class=""content-card"">\n<h1>TikPro - TikTo...","tikpro video downloader without watermark,Down...","CSS,HTML,Java,JavaScript,PHP,WordPress"


In [8]:
# As we can see there are too many html tags, so we need to remove them

In [9]:
from bs4 import BeautifulSoup

# Function to remove HTML tags and newlines using BeautifulSoup
def remove_html_and_newlines(text):
    # Remove HTML tags
    clean_text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove newlines and excess spaces
    return clean_text.replace("\n", "").strip()

# Apply the function to the specific column
df['description'] = df['description'].apply(remove_html_and_newlines)

In [10]:
df.head(50)

Unnamed: 0,id,name,description,tags,platform
0,7320,SmartReceipts Eco friendly Digital Receipt Pri...,Smart Receipt D Digital Printer a transformat...,"Smart Receipt,Digital Printer,web app,sales,re...","CSS,HTML,JavaScript,JQuery,PHP,Python"
1,7319,Ecommerce Website and Dashboard With React JS ...,Ecommerce Website and Dashboard With React JS ...,"Ecommerce Website,Dashboard,React JS,Node JS,F...","CSS,JavaScript,MongoDB,NodeJS,ReactJS,TypeScript"
2,7318,Edtech Course Selling Website Next js,Course Selling web app Next js 13 Full Stack ...,"course selling website,Nextjs 14,full-stack so...","HTML,JavaScript,ReactJS"
45,7297,Fully customizable SwiftUI Video Player for iO...,MPlayer - Customizable Video Player for SwiftU...,"mplayer,SwiftUI,video player,customizable,cont...",Swift
46,7294,Status Saver Save and Share,Status Saver is your go-to solution for saving...,"status saver,direct messaging,smart filters,lo...","kotlin,o:XML"
47,7292,10 Games For Kids Unity Source Code,This bundle contains 10 complete Unity games. ...,"Unity games bundle,Android,iOS,PC,reskin games...","C#,Unity"
48,7291,AI Job Candidates CRM,Our enhanced Flask CV Generation App is a comp...,"flask app,cv generation,user-friendly,web appl...","CSS,HTML,JavaScript,Python,SQLite"
49,7290,Fruit Tile Match Unity Puzzle Game For Kids,This is a complete game source code. T...,"animal pair tiles,tile-matching puzzle,strateg...","C#,Unity"
50,7289,Quiz Galaxy,Quiz Game Application Source Code Product Are...,"quiz game application,nextjs,react,javascript,...","CSS,HTML,JavaScript,ReactJS"
51,7288,TikPro TikTok Video Downloader Without Waterma...,TikPro - TikTok Video Downloader Without Water...,"tikpro video downloader without watermark,Down...","CSS,HTML,Java,JavaScript,PHP,WordPress"


In [11]:
# final shape of our dataset in terms of rows x columns
df.shape

(1144, 5)

In [12]:
df.to_csv(os.path.join(os.getcwd(), 'dataset', 'final_processed_data.csv'), index = False)

In [13]:
# Combining all columns of final data into one column to create embedding Data for LLM purpose.
# For cosine, the vectordb has been created sepretly in `cosine_recommendation_system.py` file itself

In [29]:
import pandas as pd
df2 = pd.read_csv(os.path.join(os.getcwd(), 'dataset', 'final_processed_data.csv'))

In [30]:
df2.columns

Index(['id', 'name', 'description', 'tags', 'platform'], dtype='object')

In [31]:
df2['combined_text'] =  (
                       df['name'] + ' ' + 
                       df['description'] + ' ' + 
                       df['tags'] + ' ' + 
                       df['platform'])

In [32]:
df2.head()

Unnamed: 0,id,name,description,tags,platform,combined_text
0,7320,SmartReceipts Eco friendly Digital Receipt Pri...,Smart Receipt D Digital Printer a transformat...,"Smart Receipt,Digital Printer,web app,sales,re...","CSS,HTML,JavaScript,JQuery,PHP,Python",SmartReceipts Eco friendly Digital Receipt Pri...
1,7319,Ecommerce Website and Dashboard With React JS ...,Ecommerce Website and Dashboard With React JS ...,"Ecommerce Website,Dashboard,React JS,Node JS,F...","CSS,JavaScript,MongoDB,NodeJS,ReactJS,TypeScript",Ecommerce Website and Dashboard With React JS ...
2,7318,Edtech Course Selling Website Next js,Course Selling web app Next js 13 Full Stack ...,"course selling website,Nextjs 14,full-stack so...","HTML,JavaScript,ReactJS",Edtech Course Selling Website Next js Course S...
3,7297,Fully customizable SwiftUI Video Player for iO...,MPlayer - Customizable Video Player for SwiftU...,"mplayer,SwiftUI,video player,customizable,cont...",Swift,
4,7294,Status Saver Save and Share,Status Saver is your go-to solution for saving...,"status saver,direct messaging,smart filters,lo...","kotlin,o:XML",


In [34]:
df2.to_csv(os.path.join(os.getcwd(), 'dataset', 'llm_final_processed_data.csv'), index = False)