First_Aid_Knock

In [1]:
import pandas as pd
# Load the medicine details table; the path is relative to the working directory.
csv_path = 'data/medicine_details.csv'
df = pd.read_csv(csv_path)

In [2]:
# Quick structural overview of the freshly loaded frame.
print(df.head())     # first few rows
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()            # dtypes and non-null counts
print(df.shape)      # (number of rows, number of columns)
print(df.columns)    # column names

              Medicine Name  \
0   Avastin 400mg Injection   
1  Augmentin 625 Duo Tablet   
2       Azithral 500 Tablet   
3          Ascoril LS Syrup   
4         Aciloc 150 Tablet   

                                         Composition  \
0                                Bevacizumab (400mg)   
1    Amoxycillin  (500mg) +  Clavulanic Acid (125mg)   
2                               Azithromycin (500mg)   
3  Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...   
4                                 Ranitidine (150mg)   

                                                Uses  \
0   Cancer of colon and rectum Non-small cell lun...   
1                  Treatment of Bacterial infections   
2                  Treatment of Bacterial infections   
3                      Treatment of Cough with mucus   
4  Treatment of Gastroesophageal reflux disease (...   

                                        Side_effects  \
0  Rectal bleeding Taste change Headache Noseblee...   
1  Vomiting Nausea Diarrhea

In [3]:
# Normalise the headers step by step: trim surrounding whitespace, lowercase,
# turn spaces into underscores, then drop every character outside [a-z0-9_].
# NOTE(review): stripping symbols after the space replacement can leave a
# trailing underscore (presumably the source of names like `excellent_review_`).
cleaned_headers = df.columns.str.strip().str.lower()
cleaned_headers = cleaned_headers.str.replace(' ', '_')
cleaned_headers = cleaned_headers.str.replace('[^a-z0-9_]', '', regex=True)
df.columns = cleaned_headers
print(df.columns) # Verify cleaned names

Index(['medicine_name', 'composition', 'uses', 'side_effects', 'image_url',
       'manufacturer', 'excellent_review_', 'average_review_', 'poor_review_'],
      dtype='object')


In [4]:
# Number of missing entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

medicine_name        0
composition          0
uses                 0
side_effects         0
image_url            0
manufacturer         0
excellent_review_    0
average_review_      0
poor_review_         0
dtype: int64


In [5]:
# Replace missing text with an empty string. Names that are not present in the
# frame are skipped, so the list can be adjusted to match the CSV.
expected_text_columns = ('uses', 'side_effects', 'composition', 'medicine_name', 'category')
for col in expected_text_columns:
    if col not in df.columns:
        continue
    df[col] = df[col].fillna('') # Fill with empty string

In [6]:
# Count rows that are exact repeats of an earlier row.
duplicates_before = df.duplicated().sum()
print(f"Number of duplicate rows before: {duplicates_before}")

Number of duplicate rows before: 84


In [7]:
# Remove exact duplicate rows from the frame in place, then confirm that none
# remain and report the resulting dimensions.
df.drop_duplicates(inplace=True)
duplicates_after = df.duplicated().sum()
shape_after = df.shape
print(f"Number of duplicate rows after: {duplicates_after}")
print(f"New shape after dropping duplicates: {shape_after}")

Number of duplicate rows after: 0
New shape after dropping duplicates: (11741, 9)


In [8]:
# Lowercase and trim every expected text column that exists in the frame.
# NOTE(review): this runs after duplicates were dropped, so rows differing only
# in case or surrounding whitespace may become identical here.
for col in ('uses', 'side_effects', 'composition', 'medicine_name', 'category'):
    if col not in df.columns:
        continue
    as_text = df[col].astype(str)
    df[col] = as_text.str.lower().str.strip()

In [9]:
# Show the ten most frequent categories, but only when the column exists.
has_category = 'category' in df.columns
if has_category:
    print("\nTop 10 Categories:")
    top_categories = df['category'].value_counts().iloc[:10]
    print(top_categories)

# For 'uses' column, you might want to analyze individual words
# This requires text preprocessing first (tokenization, stop word removal)

In [10]:
# Add a character-count feature for the 'uses' text and summarise it.
if 'uses' in df.columns:
    # Vectorised string length. Unlike apply(len), .str.len() yields NaN for a
    # missing entry instead of raising TypeError.
    df['uses_length'] = df['uses'].str.len()
    print("\nUses Text Length Description:")
    print(df['uses_length'].describe())


Uses Text Length Description:
count    11741.000000
mean        44.989268
std         35.974751
min          4.000000
25%         24.000000
50%         35.000000
75%         53.000000
max        483.000000
Name: uses_length, dtype: float64


In [12]:
# Example for 'uses' (requires NLTK installation: pip install nltk)
import nltk
from nltk.corpus import stopwords
from collections import Counter

# Make sure the NLTK data this cell relies on is installed, so a fresh
# environment does not fail with LookupError. Each resource is fetched only
# when nltk.data.find() cannot locate it locally.
# NOTE(review): 'punkt_tab' is included because newer NLTK releases appear to
# load word_tokenize models from it instead of 'punkt' - confirm for the
# installed version.
for resource_path in ('corpora/stopwords', 'tokenizers/punkt', 'tokenizers/punkt_tab'):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource_path.split('/')[-1], quiet=True)

# English stopwords as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def analyze_text_column(df_col, top_n=20, ignore_words=None, tokenizer=None):
    """Return the most frequent alphabetic words in a column of text.

    Each entry is lowercased and tokenized; tokens that are not purely
    alphabetic or that appear in the ignore set are discarded.

    Args:
        df_col: Series (or other sequence) of text values. Missing values are
            skipped so they are not stringified and counted as words.
        top_n: Number of (word, count) pairs to return. Defaults to 20.
        ignore_words: Iterable of lowercase words to exclude. When None, the
            module-level ``stop_words`` set is used.
        tokenizer: Callable mapping a string to a sequence of tokens. When
            None, ``nltk.word_tokenize`` is used.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least common.
    """
    excluded = stop_words if ignore_words is None else set(ignore_words)
    tokenize = nltk.word_tokenize if tokenizer is None else tokenizer

    # Count row by row instead of materialising one list of every token.
    word_counts = Counter()
    for text in pd.Series(df_col).dropna().astype(str):
        word_counts.update(
            token
            for token in tokenize(text.lower())
            if token.isalpha() and token not in excluded
        )
    return word_counts.most_common(top_n)

# Print the most common words for each text column that exists in the frame.
report_sections = (
    ('uses', "\nMost common words in 'uses' column (excluding stopwords):"),
    ('side_effects', "\nMost common words in 'side_effects' column (excluding stopwords):"),
)
for column_name, heading in report_sections:
    if column_name not in df.columns:
        continue
    print(heading)
    print(analyze_text_column(df[column_name]))


Most common words in 'uses' column (excluding stopwords):
[('treatment', 8286), ('infections', 1769), ('high', 1579), ('pain', 1410), ('blood', 1336), ('hypertension', 1320), ('pressure', 1236), ('disease', 1159), ('heart', 1129), ('diabetes', 1019), ('prevention', 1018), ('bacterial', 944), ('type', 923), ('mellitus', 920), ('skin', 911), ('reflux', 826), ('fungal', 754), ('attack', 664), ('due', 466), ('allergic', 451)]

Most common words in 'side_effects' column (excluding stopwords):
[('pain', 6346), ('nausea', 6170), ('headache', 5336), ('diarrhea', 4520), ('blood', 4145), ('dizziness', 4035), ('vomiting', 3473), ('increased', 2853), ('stomach', 2353), ('abdominal', 2216), ('skin', 2214), ('level', 2028), ('itching', 1994), ('site', 1792), ('rash', 1780), ('sleepiness', 1776), ('constipation', 1683), ('irritation', 1627), ('infection', 1578), ('redness', 1563)]
