In [1]:
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import joblib

In [2]:
df = pd.read_csv(r"C:\Users\samyh\Desktop\Medicine_Recommendation_system\medicine.csv")

In [3]:
df.head()

Unnamed: 0,index,Drug_Name,Reason,Description
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,Acne,Mild to moderate acne (spots)
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,Acne,A RET 0.025% is a prescription medicine that i...
2,3,ACGEL CL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
3,4,ACGEL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...
4,5,Acleen 1% Lotion 25ml,Acne,treat the most severe form of acne (nodular ac...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9720 entries, 0 to 9719
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        9720 non-null   int64 
 1   Drug_Name    9720 non-null   object
 2   Reason       9720 non-null   object
 3   Description  9720 non-null   object
dtypes: int64(1), object(3)
memory usage: 303.9+ KB


In [5]:
df.isnull().sum()

index          0
Drug_Name      0
Reason         0
Description    0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.Reason = df.Reason.apply(lambda x: x.split())

In [8]:
df.Description = df.Description.apply(lambda x: x.split())

In [9]:
df.head()

Unnamed: 0,index,Drug_Name,Reason,Description
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,[Acne],"[Mild, to, moderate, acne, (spots)]"
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,[Acne],"[A, RET, 0.025%, is, a, prescription, medicine..."
2,3,ACGEL CL NANO Gel 15gm,[Acne],"[It, is, used, to, treat, acne, vulgaris, in, ..."
3,4,ACGEL NANO Gel 15gm,[Acne],"[It, is, used, to, treat, acne, vulgaris, in, ..."
4,5,Acleen 1% Lotion 25ml,[Acne],"[treat, the, most, severe, form, of, acne, (no..."


In [10]:
df.Reason = df.Reason.apply(lambda x: [i.replace(" ", "") for i in x])

In [11]:
df.Description = df.Description.apply(lambda x: [i.replace(" ", "") for i in x])

In [12]:
df["Tags"] = df.Description + df.Reason 

In [13]:
df = df[["index", "Drug_Name", "Tags"]]

In [14]:
df.head()

Unnamed: 0,index,Drug_Name,Tags
0,1,A CN Gel(Topical) 20gmA CN Soap 75gm,"[Mild, to, moderate, acne, (spots), Acne]"
1,2,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,"[A, RET, 0.025%, is, a, prescription, medicine..."
2,3,ACGEL CL NANO Gel 15gm,"[It, is, used, to, treat, acne, vulgaris, in, ..."
3,4,ACGEL NANO Gel 15gm,"[It, is, used, to, treat, acne, vulgaris, in, ..."
4,5,Acleen 1% Lotion 25ml,"[treat, the, most, severe, form, of, acne, (no..."


In [15]:
df.Tags = df.Tags.apply(lambda x: " ".join(x))

In [16]:
df.Tags = df.Tags.apply(lambda x: x.lower())

In [17]:
import nltk


ps = PorterStemmer()

sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)

for w in words:
    print(w, " : ", ps.stem(w))
    

Programmers  :  programm
program  :  program
with  :  with
programming  :  program
languages  :  languag


In [18]:
words

['Programmers', 'program', 'with', 'programming', 'languages']

In [19]:
ps = PorterStemmer()

In [20]:
vectorizer = CountVectorizer(stop_words='english',max_features=5000)

In [21]:
def stemming(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [22]:
df.Tags = df.Tags.apply(stemming)

In [23]:
vectors = vectorizer.fit_transform(df.Tags).toarray()

In [24]:
vectorizer.get_feature_names_out()

array(['025', '12', '16', '18', 'abdomin', 'abl', 'ach', 'acid', 'acn',
       'acne', 'acquir', 'action', 'activ', 'acut', 'acute', 'adequ',
       'adhd', 'adjunct', 'adolesc', 'adult', 'adults', 'affect', 'ag',
       'age', 'aids', 'allerg', 'allergen', 'allergi', 'allow', 'alon',
       'alzheim', 'alzheimer', 'alzheimerâ', 'amoebiasi', 'anaemia',
       'anal', 'angina', 'angl', 'ani', 'ankylos', 'anorexia', 'anoth',
       'anti', 'antioxid', 'antipsychot', 'antiretrovir', 'anxieti',
       'anxiou', 'anxious', 'apnoea', 'appear', 'appetit', 'appetite',
       'appli', 'appropri', 'area', 'arrhythmia', 'arrhythmiasi',
       'arteri', 'arthralgia', 'arthriti', 'associ', 'atherothrombot',
       'athleteâ', 'atop', 'atrial', 'attack', 'awak', 'b1', 'b2', 'b3',
       'b5', 'b6', 'babi', 'backache', 'bacteri', 'bacteria', 'balanc',
       'balanitis', 'bandag', 'becom', 'behaviour', 'beliefs', 'benefit',
       'beta', 'biliari', 'biotin', 'bite', 'blackhead', 'blackheads',
      

In [25]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(9720, 806))

In [26]:
similarity = cosine_similarity(vectors)

In [27]:
similarity

array([[1.        , 0.25197632, 0.43643578, ..., 0.        , 0.        ,
        0.        ],
       [0.25197632, 1.        , 0.25660012, ..., 0.19245009, 0.1490712 ,
        0.0860663 ],
       [0.43643578, 0.25660012, 1.        , ..., 0.11111111, 0.0860663 ,
        0.0993808 ],
       ...,
       [0.        , 0.19245009, 0.11111111, ..., 1.        , 0.77459667,
        0.2981424 ],
       [0.        , 0.1490712 , 0.0860663 , ..., 0.77459667, 1.        ,
        0.34641016],
       [0.        , 0.0860663 , 0.0993808 , ..., 0.2981424 , 0.34641016,
        1.        ]], shape=(9720, 9720))

In [28]:
( df.Drug_Name == "medicine").any()

np.False_

In [29]:
"""def recommendation(medicine):
    medicine_index = df[df["Drug_Name"] == medicine].index[0]
    distances = similarity[medicine_index]
    medicines_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    for i in medicines_list:
        print(df.iloc[i[0]].Drug_Name)"""

'def recommendation(medicine):\n    medicine_index = df[df["Drug_Name"] == medicine].index[0]\n    distances = similarity[medicine_index]\n    medicines_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]\n\n    for i in medicines_list:\n        print(df.iloc[i[0]].Drug_Name)'

In [30]:
def recommendation(user_input):

    # Convert to lowercase for case-insensitive matching
    user_input = user_input.lower()

    # Find all drug names that contain the input word
    matched_drugs = df[df['Drug_Name'].str.contains(user_input, case=False)]

    if matched_drugs.empty:
        print("❌ No matching drug found for:", user_input)
        return

    # Take the first matched drug
    medicine_index = matched_drugs.index[0]
    selected_medicine = df.loc[medicine_index, 'Drug_Name']
    print(f"✅ Found match: {selected_medicine}\nTop 5 similar drugs:")

    # Compute similarity
    distances = similarity[medicine_index]
    medicines_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    for i in medicines_list:
        print("- " + df.iloc[i[0]].Drug_Name)

In [35]:
recommendation("Paracetamol")

✅ Found match: Paracetamol 125mg Syrup 60mlParacetamol 500mg Tablet 10'S
Top 5 similar drugs:
- Oxypamol D Tablet 10'S
- Pacimol MF Tablet 10'S
- Painil Plus 100/500mg Tablet 10'S
- Pamagin Plus Gel 30gm
- Paracetamol 125mg Syrup 60mlParacetamol 500mg Tablet 10'S


In [32]:
# Save files
joblib.dump(df, "df_medicine.joblib")
joblib.dump(similarity, "similarity_matrix.joblib")
print("✅ Model and Data saved successfully.")

✅ Model and Data saved successfully.
