In [1]:
import pandas as pd

In [2]:
# Load the data
medicines = pd.read_excel('Medicine_description.xlsx')

In [3]:
# Inspect the first few rows
medicines.head()

Unnamed: 0,Name,Reason,Description,Price,Manufacturer Name,Type
0,A CN Gel(Topical) 20gmA CN Soap 75gm,Acne,Mild to moderate acne (spots),223.42,Glaxo SmithKline Pharmaceuticals Ltd,allopathy
1,A Ret 0.05% Gel 20gmA Ret 0.1% Gel 20gmA Ret 0...,Acne,A RET 0.025% is a prescription medicine that i...,132.36,Alembic Pharmaceuticals Ltd,allopathy
2,ACGEL CL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...,118.0,Glenmark Pharmaceuticals Ltd,allopathy
3,ACGEL NANO Gel 15gm,Acne,It is used to treat acne vulgaris in people 12...,218.81,Sanofi India Ltd,allopathy
4,Acleen 1% Lotion 25ml,Acne,treat the most severe form of acne (nodular ac...,10.96,Sanofi India Ltd,allopathy


In [4]:
# Check the shape of the dataframe
print(medicines.shape)

(22479, 6)


In [5]:
# Check for missing values
print(medicines.isnull().sum())

Name                 0
Reason               0
Description          0
Price                0
Manufacturer Name    0
Type                 0
dtype: int64


In [6]:
# Drop rows with missing values
medicines.dropna(inplace=True)

In [7]:
# Check for duplicate rows
print(medicines.duplicated().sum())

0


In [8]:
medicines['Description']

0                            Mild to moderate acne (spots)
1        A RET 0.025% is a prescription medicine that i...
2        It is used to treat acne vulgaris in people 12...
3        It is used to treat acne vulgaris in people 12...
4        treat the most severe form of acne (nodular ac...
                               ...                        
22474                              used for treating warts
22475                        used to soften the skin cells
22476                                       used for scars
22477                                      used for wounds
22478    used to treat and remove raised warts (usually...
Name: Description, Length: 22479, dtype: object

In [9]:
# Tokenize the 'Description' and 'Reason' columns
medicines['Description'] = medicines['Description'].apply(lambda x: x.split())
medicines['Reason'] = medicines['Reason'].apply(lambda x: x.split())

In [10]:
# Remove spaces in the tokens
medicines['Description'] = medicines['Description'].apply(lambda x: [i.replace(" ", "") for i in x])
medicines['Reason'] = medicines['Reason'].apply(lambda x: [i.replace(" ", "") for i in x])

In [11]:
# Create a new column 'tags' by combining 'Description' and 'Reason'
medicines['tags'] = medicines['Description'] + medicines['Reason']

In [12]:
# Create a new dataframe with selected columns
new_df = medicines[['Name', 'Reason', 'tags']]

In [13]:
# Combine the tags into a single string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [14]:
# Convert the tags to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [15]:
# Stem the words in the tags
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


In [16]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [17]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [18]:
# Vectorize the tags
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(new_df['tags']).toarray()

In [19]:
# Calculate the cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [20]:
# Function to recommend medicines
def recommend(medicine):
    medicine_index = new_df[new_df['Name'] == medicine].index[0]
    distances = similarity[medicine_index]
    medicines_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    for i in medicines_list:
        print(new_df.iloc[i[0]].Name)

In [21]:
# Example recommendation
recommend("Acnerex Soap 75gm")

Acne UV Gel 60gm
Acnerex Soap 75gm
Acneril 1% Gel 10gmAcneril Tablet 10Acneril 0.10% Cream 20gm
Acnezyl Gel(Topical) 10gm
Acnicin Gel 15gmAcnicin 1/1% Solution 25ml


In [22]:
recommend("Acleen 1% Lotion 25ml")

Aclene 0.10% Gel 15gm
Acnay Gel 10gm
Acnelak Clz Cream 15gm
Acnelak Z Lotion 15gm
Acnemoist Cream 60gm


In [23]:
recommend("Paracetamol 125mg Syrup 60mlParacetamol 500mg Tablet 10'S")

Geriflam 100/20mg Capsule 10'S
Glenpar 500mg Tablet 10'S
Goldpar MR 4/50mg Tablet 10'S
Gufidol 100mg Injection 1'S
Healdase P Tablet 10'S
