In [32]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")

In [25]:
# --- Configuration: seeds, dataset size and reference tables -----------------
SEED = 42
fake = Faker()
# random.seed() only seeds the stdlib `random` module. Faker draws from its own
# shared generator, so it has to be seeded separately or the UUID / date /
# sentence columns differ on every run.
random.seed(SEED)
Faker.seed(SEED)
num_rows = 1000
# Per-drug reference attributes, keyed by brand name. Every row generated for a
# given drug copies these attributes.
# NOTE(review): the attribute values look like synthetic placeholders (e.g. the
# therapeutic classes) - confirm before treating them as clinically accurate.
medical_drugs_dict = {
    "Crocin": {"Manufacturer": "GlaxoSmithKline Pharmaceuticals Limited", "Generic Drug Name": "Paracetamol","Route of Administration": "Oral", "Storage Conditions": "normal", "Prescription Required": "no", "Therapeutic Class":'Pain relievers' ,  "Dosage Form" :"Tablet" , "Adverse Reactions": 'Allergic Reactions'},
    "Combiflam": {"Manufacturer": "Sanofi India Limited", "Generic Drug Name": "Ibuprofen + Paracetamol", "Route of Administration": "Oral", "Storage Conditions": "normal", "Prescription Required": "no", "Therapeutic Class":'Pain relievers' ,  "Dosage Form" :"Tablet" , "Adverse Reactions": 'Allergic Reactions'},
    "Dolo-650": {"Manufacturer": "Micro Labs Limited", "Generic Drug Name": "Paracetamol", "Route of Administration": "Oral", "Storage Conditions": "normal", "Prescription Required": "yes", "Therapeutic Class":'Pain relievers' ,  "Dosage Form" :"Tablet" , "Adverse Reactions": 'Allergic Reactions'},
    "Cetrizine": {"Manufacturer": "Sun Pharmaceutical Industries Ltd.", "Generic Drug Name": "Cetirizine", "Route of Administration": "Topical", "Storage Conditions": "cold", "Prescription Required": "yes", "Therapeutic Class":'Antibiotics' ,  "Dosage Form" :"Capsule" , "Adverse Reactions": 'Drug Interactions'},
    "Pantocid": {"Manufacturer": "Sun Pharmaceutical Industries Ltd.", "Generic Drug Name": "Pantoprazole", "Route of Administration": "Topical", "Storage Conditions": "cold", "Prescription Required": "no", "Therapeutic Class":'Antibiotics' ,  "Dosage Form" :"Capsule" , "Adverse Reactions": 'Drug Interactions'},
    "Ciplox": {"Manufacturer": "Cipla Ltd.", "Generic Drug Name": "Ciprofloxacin", "Route of Administration": "Injectable", "Storage Conditions": "cold", "Prescription Required": "yes", "Therapeutic Class":'Antidepressants' ,  "Dosage Form" :"Capsule" , "Adverse Reactions": 'Organ Toxicity'},
    "Atorva": {"Manufacturer": "Sun Pharmaceutical Industries Ltd.", "Generic Drug Name": "Atorvastatin", "Route of Administration": "Injectable", "Storage Conditions": "dry", "Prescription Required": "no", "Therapeutic Class":'Antidepressants' ,  "Dosage Form" :"Tablet" , "Adverse Reactions": 'Organ Toxicity'},
    "Azee": {"Manufacturer": "Cipla Ltd.", "Generic Drug Name": "Azithromycin", "Route of Administration": "Injectable", "Storage Conditions": "cold", "Prescription Required": "yes", "Therapeutic Class":'Pain relievers' ,  "Dosage Form" :"Liquid" , "Adverse Reactions": 'Drowsiness'},
    "Omez": {"Manufacturer": "Dr. Reddy's Laboratories Ltd.", "Generic Drug Name": "Omeprazole", "Route of Administration": "Topical", "Storage Conditions": "dry", "Prescription Required": "no", "Therapeutic Class":'Blood pressure medications' ,  "Dosage Form" :"Liquid" , "Adverse Reactions": 'Drowsiness'},
    "Amoxyclav": {"Manufacturer": "GlaxoSmithKline Pharmaceuticals Limited", "Generic Drug Name": "Amoxicillin + Clavulanic Acid", "Route of Administration": "Oral", "Storage Conditions": "cold", "Prescription Required": "yes", "Therapeutic Class":'Blood pressure medications' ,  "Dosage Form" :"Liquid" , "Adverse Reactions": 'Drowsiness'}
}
# Sampling pools used when generating rows.
list_of_drugs = list(medical_drugs_dict.keys())
list_cities = ['Mumbai', 'Delhi', 'Bangalore', 'Kolkata', 'Chennai','Hyderabad','Pune','Ahmedabad','Surat','Jaipur']

In [35]:
# Build the synthetic pharmaceutical dataset column by column.
def _sample_column(draw):
    """Call ``draw`` once per row and return the ``num_rows`` results as a list."""
    return [draw() for _ in range(num_rows)]


# Columns are generated in dictionary order, so the sequence of draws from the
# seeded `random` module (and from Faker) is fixed by the order of the keys below.
data = {
    "Drug Name": _sample_column(lambda: random.choice(list_of_drugs)),
    "Drug ID": _sample_column(lambda: fake.uuid4()),
    "Strength": _sample_column(lambda: random.randint(1, 1000)),
    "Pack Size": _sample_column(lambda: random.randint(1, 50)),
    "Price": _sample_column(lambda: round(random.uniform(1.0, 500.0), 2)),
    # Expiry dates fall between today and five years from now.
    "Expiry Date": _sample_column(
        lambda: fake.date_between(start_date='today', end_date='+5y').strftime('%Y-%m-%d')
    ),
    "Batch Number": _sample_column(lambda: fake.uuid4()),
    # Manufacture dates fall between five years ago and today.
    "Manufacture Date": _sample_column(
        lambda: fake.date_between(start_date='-5y', end_date='today').strftime('%Y-%m-%d')
    ),
    "Country of Origin": _sample_column(
        lambda: random.choice(['United States', 'Germany' , 'Switzerland', 'India'])
    ),
    "Drug Interactions": _sample_column(lambda: fake.sentence()),
    "Patient Age Group": _sample_column(lambda: random.choice(["Pediatric", "Adult", "Elderly"])),
    "Patient Gender": _sample_column(lambda: random.choice(["Male", "Female"])),
    "Patient Weight": _sample_column(lambda: random.randint(50, 100)),
    "Geographic Region": _sample_column(lambda: random.choice(list_cities)),
    "Sales Volume": _sample_column(lambda: random.randint(1, 100)),
}

# Materialise the generated columns as a DataFrame.
df = pd.DataFrame(data)

In [36]:
# Attach the per-drug reference attributes from `medical_drugs_dict` to every row.
# Each Series.map call already processes the whole column, so each attribute is
# mapped exactly once (the previous per-row loop repeated the same work for
# every row without using the loop variable).
#
# Keys are the DataFrame column names, values are the attribute names looked up
# in `medical_drugs_dict`.
# NOTE(review): 'Route of Adminstration' is misspelled, but it is the column name
# written to the CSV; it is kept unchanged so downstream consumers do not break.
attribute_columns = {
    'Manufacturer': 'Manufacturer',
    'Generic Drug Name': 'Generic Drug Name',
    'Route of Adminstration': 'Route of Administration',
    'Storage Conditions': 'Storage Conditions',
    'Prescription Required': 'Prescription Required',
    'Therapeutic Class': 'Therapeutic Class',
    'Dosage Form': 'Dosage Form',
    'Adverse Reactions': 'Adverse Reactions',
}

for column_name, attribute in attribute_columns.items():
    # `attribute=attribute` binds the current value at definition time; the
    # direct dict lookup raises KeyError for a drug name missing from the table.
    df[column_name] = df['Drug Name'].map(
        lambda key, attribute=attribute: medical_drugs_dict[key][attribute]
    )

In [37]:
# Preview the first five rows, including the attribute columns mapped from the drug table.
df.head()

Unnamed: 0,Drug Name,Drug ID,Strength,Pack Size,Price,Expiry Date,Batch Number,Manufacture Date,Country of Origin,Drug Interactions,...,Geographic Region,Sales Volume,Manufacturer,Generic Drug Name,Route of Adminstration,Storage Conditions,Prescription Required,Therapeutic Class,Dosage Form,Adverse Reactions
0,Azee,53117687-8e75-4a5c-9ee6-214dc99d7501,780,44,25.9,2027-10-31,26077521-5fff-4787-90a0-742fae339f13,2022-05-29,Switzerland,Which appear experience of value material.,...,Jaipur,70,Cipla Ltd.,Azithromycin,Injectable,cold,yes,Pain relievers,Liquid,Drowsiness
1,Dolo-650,cd034bd8-eb76-4591-918c-c6d7f3ba4f1e,440,3,69.72,2026-01-04,483e6b86-d4a1-42fa-a682-f0b6f9af3c9d,2021-09-12,India,International ok treatment open treatment eye ...,...,Chennai,7,Micro Labs Limited,Paracetamol,Oral,normal,yes,Pain relievers,Tablet,Allergic Reactions
2,Azee,72c5f808-c24c-43bc-a37a-e20119e58659,254,40,439.48,2025-04-16,026167e1-c266-4304-8fd7-15bcbabcc3f7,2019-03-16,Germany,Wrong well nice may raise road.,...,Kolkata,78,Cipla Ltd.,Azithromycin,Injectable,cold,yes,Pain relievers,Liquid,Drowsiness
3,Pantocid,54707665-e2da-496e-bda7-f2a59785ecbd,633,42,392.85,2024-05-20,de39c145-7832-4955-8387-36166b866b4a,2019-11-16,Germany,Look add yet story.,...,Ahmedabad,26,Sun Pharmaceutical Industries Ltd.,Pantoprazole,Topical,cold,no,Antibiotics,Capsule,Drug Interactions
4,Dolo-650,595268d8-33ef-4a18-817b-b9deb05e0d9b,157,4,251.49,2026-06-05,aed34a8e-0c09-42f5-b7b0-186df12a6a6f,2023-09-03,United States,Similar open trip amount always also.,...,Mumbai,31,Micro Labs Limited,Paracetamol,Oral,normal,yes,Pain relievers,Tablet,Allergic Reactions


In [38]:
## Inserting outliers
# Number of outliers to introduce
num_outliers = 5

# random.seed() does not affect NumPy, so the outliers get their own seeded
# generator. With a fixed seed the same rows receive the same values on every
# run, including when this cell is re-executed.
outlier_rng = np.random.default_rng(42)

# Pick distinct row labels to turn into outliers
outlier_indices = outlier_rng.choice(df.index, num_outliers, replace=False)

# Introduce outliers: overwrite both columns on the chosen rows in one
# vectorised assignment each. The upper bound of `integers` is exclusive.
df.loc[outlier_indices, 'Strength'] = outlier_rng.integers(1500, 2000, size=num_outliers)  # Example outlier values for 'Strength'
df.loc[outlier_indices, 'Price'] = outlier_rng.integers(700, 1500, size=num_outliers)      # Example outlier values for 'Price'


In [39]:
# Write the finished dataset to a CSV file, leaving the row index out of the file.
csv_path = 'pharmaceutical_data.csv'
df.to_csv(csv_path, index=False)