In [1]:
#importing libraries

import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.decomposition import TruncatedSVD
import faiss
import numpy as np


In [2]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,id,name,price(₹),Is_discontinued,manufacturer_name,type,pack_size_label,short_composition1,short_composition2
0,1,Augmentin 625 Duo Tablet,223.42,False,Glaxo SmithKline Pharmaceuticals Ltd,allopathy,strip of 10 tablets,Amoxycillin (500mg),Clavulanic Acid (125mg)
1,2,Azithral 500 Tablet,132.36,False,Alembic Pharmaceuticals Ltd,allopathy,strip of 5 tablets,Azithromycin (500mg),
2,3,Ascoril LS Syrup,118.0,False,Glenmark Pharmaceuticals Ltd,allopathy,bottle of 100 ml Syrup,Ambroxol (30mg/5ml),Levosalbutamol (1mg/5ml)
3,4,Allegra 120mg Tablet,218.81,False,Sanofi India Ltd,allopathy,strip of 10 tablets,Fexofenadine (120mg),
4,5,Avil 25 Tablet,10.96,False,Sanofi India Ltd,allopathy,strip of 15 tablets,Pheniramine (25mg),


In [3]:
df.shape

(253973, 9)

##### Process

We will take an input and use its short composition 1 and short compostion 2 , and find medicines with similar or same composition and then we will check which medicines have lesser price and recommend them that. 

In [4]:
df = df.drop("type",axis='columns')
df = df.drop("pack_size_label",axis="columns")
df = df.drop("Is_discontinued",axis="columns")

In [5]:
df.head()

Unnamed: 0,id,name,price(₹),manufacturer_name,short_composition1,short_composition2
0,1,Augmentin 625 Duo Tablet,223.42,Glaxo SmithKline Pharmaceuticals Ltd,Amoxycillin (500mg),Clavulanic Acid (125mg)
1,2,Azithral 500 Tablet,132.36,Alembic Pharmaceuticals Ltd,Azithromycin (500mg),
2,3,Ascoril LS Syrup,118.0,Glenmark Pharmaceuticals Ltd,Ambroxol (30mg/5ml),Levosalbutamol (1mg/5ml)
3,4,Allegra 120mg Tablet,218.81,Sanofi India Ltd,Fexofenadine (120mg),
4,5,Avil 25 Tablet,10.96,Sanofi India Ltd,Pheniramine (25mg),


In [6]:
df.rename(columns={"price(₹)":"prices"},inplace=True)

In [7]:
df['name'] = df['name'].str.replace(r'\s* Tablet$', '', regex=True)

In [8]:
df.head()

Unnamed: 0,id,name,prices,manufacturer_name,short_composition1,short_composition2
0,1,Augmentin 625 Duo,223.42,Glaxo SmithKline Pharmaceuticals Ltd,Amoxycillin (500mg),Clavulanic Acid (125mg)
1,2,Azithral 500,132.36,Alembic Pharmaceuticals Ltd,Azithromycin (500mg),
2,3,Ascoril LS Syrup,118.0,Glenmark Pharmaceuticals Ltd,Ambroxol (30mg/5ml),Levosalbutamol (1mg/5ml)
3,4,Allegra 120mg,218.81,Sanofi India Ltd,Fexofenadine (120mg),
4,5,Avil 25,10.96,Sanofi India Ltd,Pheniramine (25mg),


In [9]:
prices_map = {}

for index,row in tqdm(df.iterrows(),total=len(df)):
    prices_map[row["name"]] = row[["prices","manufacturer_name"]]

100%|██████████| 253973/253973 [01:10<00:00, 3607.86it/s]


In [10]:
df = df.drop("prices",axis="columns")
df = df.drop("manufacturer_name",axis="columns") 
df = df.drop("id",axis="columns")

In [11]:
## filling Nan with empty strings

df['short_composition1'].fillna("",inplace=True)
df['short_composition2'].fillna("",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['short_composition1'].fillna("",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['short_composition2'].fillna("",inplace=True)


In [12]:
df['combined_composition'] = df['short_composition1']+ " " + df['short_composition2']

In [13]:
# removing short composition in the dataframe
df = df.drop("short_composition1",axis="columns")
df = df.drop("short_composition2",axis="columns")

Unnamed: 0,name,combined_composition
0,Augmentin 625 Duo,Amoxycillin (500mg) Clavulanic Acid (125mg)
1,Azithral 500,Azithromycin (500mg)
2,Ascoril LS Syrup,Ambroxol (30mg/5ml) Levosalbutamol (1mg/5ml)
3,Allegra 120mg,Fexofenadine (120mg)
4,Avil 25,Pheniramine (25mg)


In [15]:
df = df.iloc[:250000]
df.shape

(250000, 2)

In [16]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['combined_composition'])

In [17]:
svd = TruncatedSVD(n_components=3)
tfidf_matrix = svd.fit_transform(tfidf_matrix)

In [18]:
d = tfidf_matrix.shape[1]
index = faiss.IndexFlatL2(d)
index.add(tfidf_matrix) 

In [19]:
def get_recommendations(name, k=15):
    if name not in df['name'].values:
        return f"Medicine '{name}' not found in the dataset."
    
    idx = df.index[df['name'] == name].tolist()[0]
    query_vector = tfidf_matrix[idx].reshape(1, -1).astype(np.float32)
    distances, indices = index.search(query_vector, k + 1)  
    similar_indices = indices[0][1:]
    
    recommendations = []
    for i in similar_indices:
        med_name = df['name'].iloc[i]
        price_info = prices_map.get(med_name, {'prices': 'N/A', 'manufacturer_name': 'N/A'})
        recommendations.append({
            'name': med_name,
            'prices': price_info['prices'],
            'manufacturer_name': price_info['manufacturer_name']
        })
    cheapest_alternative = ""
    cheapest_price = prices_map[name].get("prices")
    for alternative in recommendations:
        if alternative['prices'] == 'N/A':
            continue
        cheapest_price = min(cheapest_price,alternative['prices'])
        if cheapest_price == alternative['prices']:
             cheapest_alternative = alternative['name']
    return (f"cheapest_alternative: {cheapest_alternative} with price: {cheapest_price} \n\n" + str(recommendations) )


In [21]:

print(get_recommendations("Dolo 650", k=5))

cheapest_alternative: A Mol 650mg with price: 9.0 

[{'name': 'Algina 650', 'prices': 11.25, 'manufacturer_name': 'Geno Pharmaceuticals Ltd'}, {'name': 'Arden 650mg', 'prices': 30.91, 'manufacturer_name': 'Adonis Laboratories Pvt Ltd'}, {'name': 'A Mol 650mg', 'prices': 9.0, 'manufacturer_name': 'Aan Pharma Pvt Ltd'}, {'name': 'Admol 650 Tablet DT', 'prices': 22.85, 'manufacturer_name': 'Kepler Health Care'}, {'name': 'Atpera 650', 'prices': 17.0, 'manufacturer_name': 'Atlantis Formulations Pvt Ltd'}]
