In [1]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
import os

In [2]:
df = pd.read_csv("/workspaces/codespaces-jupyter/data/drugs_side_effects_drugs_com.csv")
df.sample(5)

Unnamed: 0,drug_name,medical_condition,side_effects,generic_name,drug_classes,brand_names,activity,rx_otc,pregnancy_category,csa,alcohol,related_drugs,medical_condition_description,rating,no_of_reviews,drug_link,medical_condition_url
1375,neomycin,Diarrhea,hives ; difficult breathing; swelling of your ...,neomycin,Aminoglycosides,Neo-Fradin,4%,Rx,D,N,,erythromycin: https://www.drugs.com/erythromyc...,Diarrhea Other names: Frequent bowel movements...,1.0,1.0,https://www.drugs.com/mtm/neomycin.html,https://www.drugs.com/condition/diarrhea.html
2779,penicillamine,Rheumatoid Arthritis,"hives , rash; swollen glands; difficult breath...",penicillamine,Antirheumatics,"Cuprimine, Depen, D-Penamine",0%,Rx,D,N,,,Rheumatoid Arthritis Other names: Arthritis; A...,,,https://www.drugs.com/mtm/penicillamine.html,https://www.drugs.com/condition/rheumatoid-art...
883,Biotussin DAC,Colds & Flu,hives ; difficult breathing; swelling of your ...,"codeine, guaifenesin, and pseudoephedrine",Upper respiratory combinations,"Ambifed CD, Cheratussin DAC, Guaifen DAC, Lort...",1%,Rx/OTC,C,M,X,Benadryl: https://www.drugs.com/benadryl.html ...,Cold Symptoms Other names: Cold; Common Cold; ...,,,https://www.drugs.com/mtm/biotussin-dac.html,https://www.drugs.com/condition/cold-symptoms....
153,Brevoxyl Acne Wash Kit,Acne,Brevoxyl Acne Wash Kit can cause a rare but se...,benzoyl peroxide topical,Topical acne agents,"Acne-Clear, Benzac AC, BenzePrO, Benziq, Clear...",0%,Rx,C,N,,,Acne Other names: Acne Vulgaris; Blackheads; B...,,,https://www.drugs.com/mtm/brevoxyl-acne-wash-k...,https://www.drugs.com/condition/acne.html
1574,calcium carbonate,GERD (Heartburn),hives ; difficulty breathing; swelling of your...,calcium carbonate,"Antacids, Minerals and electrolytes","Calci-Chew, Nephro Calci, Dicarbosil, Equilet,...",2%,OTC,N,N,,,GERD Other names: Acid reflux; Esophageal Refl...,4.0,6.0,https://www.drugs.com/mtm/calcium-carbonate.html,https://www.drugs.com/condition/gastroesophage...


In [3]:
# Identifying duplicates
duplicates = df[df.duplicated()]

# Displaying the duplicate rows
print("Duplicate Rows: ", duplicates)

Duplicate Rows:  Empty DataFrame
Columns: [drug_name, medical_condition, side_effects, generic_name, drug_classes, brand_names, activity, rx_otc, pregnancy_category, csa, alcohol, related_drugs, medical_condition_description, rating, no_of_reviews, drug_link, medical_condition_url]
Index: []


In [4]:
# Data Cleaning¶
# Drop unwanted columns(not relevant for our model)
df = df.drop(['no_of_reviews','drug_link','medical_condition_url','activity','rating'], axis = 1)
df.shape

(2931, 12)

In [5]:
# Changing NaN to unknown
df['side_effects'] = df['side_effects'].fillna(value='Unknown')
# df['side_effects'].isnull().sum()
df['generic_name'] = df['generic_name'].fillna(value='Unknown')
# df['generic_name'].isnull().sum()
df['drug_classes'] = df['drug_classes'].fillna(value='Unknown')
# df['drug_classes'].isnull().sum()
df['brand_names'] = df['brand_names'].fillna(value='Unknown')
# df['brand_names'].isnull().sum()
df['pregnancy_category'] = df['pregnancy_category'].fillna(value='Unknown')
# df['pregnancy_category'].isnull().sum()
df['alcohol'] = df['alcohol'].fillna(value='Unknown')

# df['alcohol'].isnull().sum()
# df['medical_condition_description'] = df['medical_condition_description'].fillna(
#     value='Unknown')
# df['medical_condition_description'].isnull().sum()

df['rx_otc'] = df['rx_otc'].fillna(value='Unknown')

df['related_drugs'] = df['related_drugs'].fillna(value='Unknown')

df.isnull().sum()

drug_name                        0
medical_condition                0
side_effects                     0
generic_name                     0
drug_classes                     0
brand_names                      0
rx_otc                           0
pregnancy_category               0
csa                              0
alcohol                          0
related_drugs                    0
medical_condition_description    0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

categorical_features = ['drug_classes', 'rx_otc',
                        'pregnancy_category', 'csa', 'alcohol', 'brand_names', 'side_effects']
target_column = 'medical_condition'

# Select features and target
X = df[categorical_features]
y = df[target_column]

# Define the transformer for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create the pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.7955706984667802
Classification Report:
                       precision    recall  f1-score   support

                ADHD       1.00      0.77      0.87        13
            AIDS/HIV       0.90      0.95      0.93        20
                Acne       0.87      0.98      0.92        48
           Allergies       1.00      0.14      0.25         7
         Alzheimer's       0.88      0.88      0.88         8
              Angina       0.94      0.84      0.89        19
             Anxiety       0.60      0.25      0.35        12
              Asthma       0.33      0.30      0.32        10
    Bipolar Disorder       0.33      0.40      0.36         5
          Bronchitis       0.60      0.75      0.67         8
                COPD       0.75      0.67      0.71         9
              Cancer       1.00      1.00      1.00         3
         Cholesterol       1.00      0.50      0.67        10
         Colds & Flu       0.81      1.00      0.89        38
        Constipa

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_grid = {
    # Number of trees in the forest
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    # Minimum number of samples required to split an internal node
    'classifier__min_samples_split': [2, 5, 10],
    # Minimum number of samples required at each leaf node
    'classifier__min_samples_leaf': [1, 2, 4]
}
skf = StratifiedKFold(n_splits=3)
grid_search = GridSearchCV(model, param_grid, cv=skf, verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Accuracy: 0.8006814310051107
Classification Report:
                       precision    recall  f1-score   support

                ADHD       1.00      0.85      0.92        13
            AIDS/HIV       0.90      0.95      0.93        20
                Acne       0.90      0.98      0.94        48
           Allergies       1.00      0.14      0.25         7
         Alzheimer's       0.88      0.88      0.88         8
              Angina       0.93      0.74      0.82        19
             Anxiety       0.50      0.25      0.33        12
              Asthma       0.33      0.30      0.32        10
    Bipolar Disorder       0.43      0.60      0.50         5
          Bronchitis       0.60      0.75      0.67         8
                COPD       0.67      0.44      0.53         9
              Cancer       0.75      1.00      0.

In [8]:
# # Frequency encoding
# df.info()
# print("Beginning frequency encoding\n")
# # List of columns to perform frequency encoding on
# columns_to_encode = ['drug_name', 'generic_name', 'brand_names', 'related_drugs']

# # Perform frequency encoding for each column
# for column in columns_to_encode:
#     frequency_map = df[column].value_counts(normalize=True)
#     df[column + '_frequency_encoded'] = df[column].map(frequency_map)
# print("\n")
# df.info()

In [9]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report

# df['related_drugs'] = df['related_drugs'].fillna(value='Unknown') #  Changing the NaN values to unknown
# # Assuming 'X' contains the features and 'y' contains the target variable
# # X = df.columns
# X = df.drop(columns=['drug_name', 'generic_name', 'brand_names', 'related_drugs', 'medical_condition', 'medical_condition_description', 'side_effects'], axis=1)

# y = df['drug_name']

# # Step 2: Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Step 3: Model training
# rf_classifier = RandomForestClassifier(random_state=42)
# rf_classifier.fit(X_train, y_train)

# # Step 4: Model evaluation
# y_pred = rf_classifier.predict(X_test)
# print(classification_report(y_test, y_pred))

# # Step 6: Feature importance
# feature_importances = rf_classifier.feature_importances_
# print("Feature Importance:")
# for feature, importance in zip(X.columns, feature_importances):
#     print(f"{feature}: {importance}")
