In [None]:
# Import necessary libraries
import re

import joblib
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


In [None]:

# Download required NLTK resources (cached locally after the first run).
# - 'punkt_tab' is the tokenizer data that recent NLTK releases look up from
#   word_tokenize; the legacy 'punkt' package is kept for older installs.
# - 'omw-1.4' is fetched defensively: some NLTK releases appear to need it
#   alongside 'wordnet' for the lemmatizer (NOTE(review): confirm for your version).
# By default nltk.download() reports a failure through its return value instead
# of raising, so a package ID unknown to an older index is skipped with a message.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [None]:

# Step 1: Define the text preprocessing function

# Holds the NLTK resources shared by every preprocess_text() call so that they
# are built once instead of once per row.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if 'resources' not in _PREPROCESS_CACHE:
        # Build both before storing so a failed lookup never leaves a half-filled cache.
        resources = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _PREPROCESS_CACHE['resources'] = resources
    return _PREPROCESS_CACHE['resources']


def preprocess_text(text):
    """
    Preprocess the input text: convert to lowercase, remove special characters,
    tokenize, remove stopwords, and lemmatize the tokens.

    Parameters
    ----------
    text : str or any
        Raw text. ``None`` and float NaN (how pandas represents a blank CSV
        field) are treated as empty text; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces, or ``''``
        when nothing is left after cleaning.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercase, then drop everything except ASCII letters and whitespace.
    # Characters are removed, not replaced by a space, so "high-risk" -> "highrisk".
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Nothing left to tokenize: return early without touching the NLTK resources.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _preprocess_resources()
    tokens = word_tokenize(text)

    # Remove stopwords first, then lemmatize what is left.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

In [None]:
# Step 2: Read the healthcare dataset from a CSV file into a DataFrame
# Point DATA_PATH at the actual location of your file
DATA_PATH = 'healthcare_data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Step 3: Look at the dataset's column index and a sample of its rows
for snapshot in (data.columns, data.head()):  # columns first, then the first few rows
    print(snapshot)

In [None]:

# Step 4: Apply text preprocessing on the 'question' column (ensure it exists)
if 'question' not in data.columns:
    # Stop here with the real cause: every later step reads 'processed_question',
    # so continuing would only fail further down with a less helpful error.
    raise KeyError(
        f"'question' column not found in the dataset. Available columns: {list(data.columns)}"
    )

# Blank CSV fields are loaded by pandas as NaN (a float). Normalise them to empty
# strings so preprocess_text always receives a str and one blank row cannot
# abort the whole run.
data['processed_question'] = (
    data['question'].fillna('').astype(str).apply(preprocess_text)
)
print("Preprocessing complete!")

In [None]:
# Step 5: Confirm that 'processed_question' was added and preview its contents
print(data.columns)
processed_preview = data['processed_question'].head()  # first few rows of processed text
print(processed_preview)


In [None]:
# Step 6: Prepare features and target for the model
# A row without an answer has no usable class label (NaN is not a valid target
# for the classifier), so filter such rows out of X and y with the same mask to
# keep the two Series aligned. When no answers are missing this selects every row.
has_answer = data['answer'].notna()
if not has_answer.all():
    print(f"Dropping {int((~has_answer).sum())} rows with a missing answer.")
X = data.loc[has_answer, 'processed_question']  # Features (processed questions)
y = data.loc[has_answer, 'answer']  # Target (answers)


In [None]:
# Step 7: Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
splits = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = splits  # returned in this order: X parts, then y parts

In [None]:
# Step 8: Turn the text into TF-IDF feature matrices
vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits on the training text only
    vectorizer.transform(X_test),       # evaluated second: reuses the fitted vectorizer
)


In [None]:

# Step 9: Fit a Logistic Regression classifier on the TF-IDF training features
model = LogisticRegression().fit(X_train_tfidf, y_train)  # fit() returns the estimator itself
model  # last expression in the cell, so the fitted estimator is still displayed

In [None]:
# Step 10: Generate predicted labels for the held-out test features
y_pred = model.predict(X=X_test_tfidf)

In [None]:

# Step 11: Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
report = classification_report(y_test, y_pred)  # per-class text report
print("Classification Report:\n", report)

In [None]:
# Step 12: Save the trained model and vectorizer for later use
# (joblib is imported with the other libraries in the notebook's import cell.)
# Both artifacts are written because predicting on new text needs the fitted
# vectorizer to build features before the model can be applied.
joblib.dump(model, 'healthcare_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


In [None]:


# Optional: Helper that predicts an answer for a single new question
def predict_answer(new_question):
    """
    Predict an answer for one new question.

    The question is cleaned with ``preprocess_text``, turned into TF-IDF
    features by the notebook-level ``vectorizer``, and classified by the
    notebook-level ``model``. The first (and only) prediction is returned.
    """
    cleaned = preprocess_text(new_question)

    # transform() expects a collection of documents, so wrap the single string in a list.
    features = vectorizer.transform([cleaned])

    predictions = model.predict(features)
    return predictions[0]


In [None]:

# Example: Test the prediction function with a new question
# The question is passed through predict_answer(), which uses the model and
# vectorizer created in the earlier steps, so those cells must have been run first.
new_question = "What are the symptoms of a cold?"
predicted_answer = predict_answer(new_question)  # first prediction returned by the model
print(f"Predicted Answer: {predicted_answer}")
