In [1]:
#Basic Libraries
import numpy as np
import pandas as pd

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Text Handling Libraries
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# clustering
from sklearn.cluster import KMeans

# NLP helpers (stop-word list, regex tokenizer)
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
# Function for removing NonAscii characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

# Function for converting into lower case
def make_lower_case(text):
    """Return a lower-cased copy of ``text`` (delegates to ``text.lower()``)."""
    lowered = text.lower()
    return lowered

# Function for removing stop words
# Stop-word sets keyed by language name, filled lazily on first use.
_STOP_WORDS_BY_LANGUAGE = {}


def _cached_stop_words(language="english"):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stop_words(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is discarded, and the remaining tokens are re-joined with
    single spaces. Tokens are compared exactly as written (no case folding),
    so callers that want case-insensitive removal should lower-case first.

    The stop-word list is loaded once and reused: this function is meant to
    be used with ``Series.apply``, and rebuilding the set for every row is
    needlessly slow.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` without stop-word tokens, whitespace-normalised.
    """
    stops = _cached_stop_words()
    return " ".join(w for w in text.split() if w not in stops)

# Function for removing punctuation
# Matches maximal runs of word characters (letters, digits, underscore).
_WORD_RUN = re.compile(r'\w+')


def remove_punctuation(text):
    """Replace every run of non-word characters in ``text`` with one space.

    The text is broken into ``\\w+`` tokens and the tokens are joined with
    single spaces, so punctuation and repeated whitespace disappear and
    leading/trailing separators are dropped.

    Uses a precompiled standard-library regex instead of building an NLTK
    ``RegexpTokenizer`` on every call; for the pattern ``\\w+`` the token
    list is the same as ``re.findall``.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        Space-separated word tokens.
    """
    return " ".join(_WORD_RUN.findall(text))

# Function for removing the html tags
def remove_html(text):
    """Delete every ``<...>`` span from ``text``.

    A span starts at ``<`` and ends at the nearest following ``>`` on the
    same line (the pattern is non-greedy and ``.`` does not match newlines).
    Text without such spans is returned unchanged.
    """
    return re.sub('<.*?>', '', text)

# recommendation
def get_tfid_recommendation(title, sim, n_candidates=49, top_n=10):
    """Recommend the cheapest products among those most similar to ``title``.

    Relies on the notebook-level globals ``indices`` (product id -> row
    position), ``products`` (row position -> product id) and ``df`` (the
    product table with ``id``, ``name`` and ``PricePerTablet`` columns).

    Parameters
    ----------
    title : str
        Product id to look up in ``indices``.
    sim : 2-D array-like
        Pairwise similarity matrix; ``sim[i]`` holds the scores of row ``i``
        against every row.
    n_candidates : int, default 49
        How many of the most similar products to keep before ranking by
        price. The default matches the previous hard-coded ``[1:50]`` slice.
    top_n : int, default 10
        Number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with columns ``name`` and ``PricePerTablet``,
        sorted by ascending price (rows without a price sort last).

    Raises
    ------
    KeyError
        If ``title`` is not a known product id.
    """
    if title not in indices.index:
        raise KeyError(f"Unknown product id: {title!r}")

    idx = indices[title]
    # A duplicated id makes the lookup return a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the query row by index, not by assuming it sorts first: another
    # product with an identical description ties at the top score and the
    # stable sort could place it ahead of the query.
    sim_scores = [(i, score) for i, score in enumerate(sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    product_indices = [i for i, _ in sim_scores[:n_candidates]]
    candidate_ids = products.iloc[product_indices]

    recommendation = df[df['id'].isin(candidate_ids.values)]
    return recommendation[['name','PricePerTablet']].sort_values('PricePerTablet').head(top_n)

In [3]:
# Load the product catalogue and discard rows that are exact duplicates
# (all columns equal), then print the column summary.
df = pd.read_csv('1mg.csv').drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16867 entries, 0 to 16867
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    16867 non-null  object 
 1   name                  16867 non-null  object 
 2   desc                  16862 non-null  object 
 3   activeIngredient      16867 non-null  object 
 7   manufacturer          16867 non-null  object 
 8   url                   16863 non-null  object 
 9   PricePerTablet        26 non-null     float64
dtypes: float64(1), object(9)
memory usage: 1.4+ MB


In [4]:
# Renumber rows 0..n-1; the previous index is kept as an 'index' column
# because reset_index() is called without drop=True.
df_new = df.reset_index()

# Two lookup tables used by get_tfid_recommendation:
#   products: row position -> product id
#   indices : product id   -> row position
products = df_new['id']
indices = pd.Series(df_new.index, index=products)

df_new.loc[:, ['id', 'desc']]

Unnamed: 0,id,desc
0,2 Dep 30mg Tablet,2 Dep 30mg Tablet works by increasing the leve...
1,Glibocer M 500mg/0.3mg Tablet,Glibocer M 500mg/0.3mg Tablet belongs to a cat...
2,Vogliplay M 500mg/0.3mg Tablet,Vogliplay M 500mg/0.3mg Tablet belongs to a ca...
3,Prandial M 0.3 Tablet,Prandial M 0.3 Tablet belongs to a category of...
4,Vogloyd M 500mg/0.3mg Tablet,Vogloyd M 500mg/0.3mg Tablet belongs to a cate...
...,...,...
16862,Zyrova C 5 Capsule,Zyrova C 5 Capsule should be taken with or wit...
16863,Zyrova F 5 Tablet,Zyrova F 5 Tablet can be taken with a meal or ...
16864,Zyrtec OD 10mg Tablet,Zyrtec OD 10mg Tablet can be taken with or wit...
16865,Zytolix P Syrup,"Give Zytolix P Syrup to your child orally, eit..."


In [5]:
# Build one text "soup" per product from the descriptive columns.
# Missing values are replaced by '' first: plain `+` concatenation returns
# NaN as soon as any part is missing (e.g. the rows without `desc`), which
# would later turn the whole row into the literal text 'nan'.
soup_columns = ['desc', 'activeIngredient', 'alcoholWarning', 'breastfeedingWarning', 'pregnancyWarning']
soup_parts = df_new[soup_columns].fillna('').astype(str)
df_new['soup'] = soup_parts[soup_columns[0]].str.cat(
    [soup_parts[col] for col in soup_columns[1:]], sep=' '
)

In [6]:
# Apply the cleaning helpers to the soup text and store it as cleaned_desc.
df_new['soup'] = df_new['soup'].astype(str)

# Order matters: HTML tags must be stripped while '<' and '>' are still
# present. Running remove_punctuation first deletes the angle brackets, so
# remove_html would match nothing and tag names would stay in the text.
cleaning_steps = (remove_html, _removeNonAscii, make_lower_case, remove_punctuation)

cleaned = df_new['soup']
for step in cleaning_steps:
    cleaned = cleaned.apply(step)
df_new['cleaned_desc'] = cleaned

df_new.loc[0, 'cleaned_desc']

'2 dep 30mg tablet works by increasing the level of chemical messengers serotonin and noradrenaline in the brain that have a calming effect on the brain and relax the nerves thus treating your illness it may be taken with or without food it is advised to take this medicine at a fixed time each day to maintain a consistent level in the blood if you miss any doses take it as soon as you remember do not skip any doses and finish the full course of treatment even if you feel better this medication mustn t be stopped suddenly as it may worsen your symptoms some common side effects of this medicine include nausea headache and dry mouth it even causes dizziness and sleepiness so do not drive or do anything that requires mental focus until you know how this medicine affects you however these side effects are temporary and usually resolve on their own in some time please consult your doctor if these do not subside or bother you before taking 2 dep 30mg tablet inform your doctor if you have any 

In [7]:
# Raw description of the first product, for comparison with cleaned_desc.
df_new.loc[0, 'desc']

"2 Dep 30mg Tablet works by increasing the level of chemical messengers (serotonin and noradrenaline) in the brain that have a calming effect on the brain and relax the nerves, thus treating your illness. It may be taken with or without food. It is advised to take this medicine at a fixed time each day to maintain a consistent level in the blood. If you miss any doses, take it as soon as you remember. Do not skip any doses and finish the full course of treatment even if you feel better. This medication mustn't be stopped suddenly as it may worsen your symptoms.Some common side effects of this medicine include nausea, headache, and dry mouth. It even causes dizziness and sleepiness, so do not drive or do anything that requires mental focus until you know how this medicine affects you. However, these side effects are temporary and usually resolve on their own in some time. Please consult your doctor if these do not subside or bother you.Before taking 2 Dep 30mg Tablet, inform your doctor

In [8]:
# TF-IDF over unigrams and bigrams with scikit-learn's English stop words.
# min_df=1 keeps every term that occurs in at least one document, which is
# what the former integer min_df=0 amounted to; recent scikit-learn releases
# reject an integer min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df_new['cleaned_desc'])

In [9]:
# Pairwise dot products between all TF-IDF rows. With Y omitted,
# linear_kernel compares the matrix against itself.
# NOTE(review): the result is a dense n x n array - check memory for the full catalogue.
cosine_sim = linear_kernel(tfidf_matrix)

In [10]:
# Cheapest alternatives among the products most similar to the query.
searchstr = 'Advog M 0.3 Plus Tablet'
res = get_tfid_recommendation(title=searchstr, sim=cosine_sim)
res.loc[:, ['name', 'PricePerTablet']]

Unnamed: 0,name,PricePerTablet
8871,Medfor V 500mg/0.2mg Tablet,4.1
12077,Prandial M 0.2 Tablet,5.6
1118,Apribose M 0.2 Tablet SR,6.3
1119,Apribose M 0.3 Tablet SR,7.2
1,Glibocer M 500mg/0.3mg Tablet,7.9
373,Advog M 0.2 Tablet SR,8.8
10513,Obimet V 0.2 Tablet PR,8.8
3,Prandial M 0.3 Tablet,9.3
2,Vogliplay M 500mg/0.3mg Tablet,9.6
4,Vogloyd M 500mg/0.3mg Tablet,9.87


In [11]:
# Cheapest alternatives among the products most similar to the query.
searchstr = 'Glibocer M 500mg/0.3mg Tablet'
res = get_tfid_recommendation(title=searchstr, sim=cosine_sim)
res.loc[:, ['name', 'PricePerTablet']]

Unnamed: 0,name,PricePerTablet
8871,Medfor V 500mg/0.2mg Tablet,4.1
12077,Prandial M 0.2 Tablet,5.6
1118,Apribose M 0.2 Tablet SR,6.3
1119,Apribose M 0.3 Tablet SR,7.2
373,Advog M 0.2 Tablet SR,8.8
10513,Obimet V 0.2 Tablet PR,8.8
3,Prandial M 0.3 Tablet,9.3
2,Vogliplay M 500mg/0.3mg Tablet,9.6
4,Vogloyd M 500mg/0.3mg Tablet,9.87
5,Welvog MF 500mg/0.3mg Tablet,10.1


In [12]:
# Cheapest alternatives among the products most similar to the query.
searchstr = 'Prasusafe 5mg Tablet'
res = get_tfid_recommendation(title=searchstr, sim=cosine_sim)
res.loc[:, ['name', 'PricePerTablet']]

Unnamed: 0,name,PricePerTablet
832,Amloact 2.5mg Tablet,1.3
865,Amlotrust 2.5mg Tablet,1.3
247,Acord 5mg Tablet,2.2
838,Amlocor 5mg Tablet,2.3
866,Amlotrust 5mg Tablet,2.5
5212,Ecogrel 5mg Tablet,5.4
1858,Prasuvix 5mg Tablet,6.0
3291,Prasulet 5mg Tablet,6.2
720,Prasumax 5mg Tablet,7.5
1369,Prethromb 5mg Tablet,7.6
