In [None]:
import joblib

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

# Load data
data = pd.read_csv("tt.csv")
data = data.dropna()

# Features and target
X = data[['Problem Summary', 'Problem Category', 'Psychological Category']]
y_icd10 = data['ICD-10']
y_treatment = data['Treatment Provided']

# Train-test split
X_train, X_test, y_icd10_train, y_icd10_test, y_treatment_train, y_treatment_test = train_test_split(
    X, y_icd10, y_treatment, test_size=0.2, random_state=42)

# One-hot encoding for categorical features
encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train[['Problem Category', 'Psychological Category']])

# TF-IDF vectorization for Problem Summary
tfidf_vectorizer = TfidfVectorizer()
X_train_summary = tfidf_vectorizer.fit_transform(X_train['Problem Summary'])

# Combine encoded features with TF-IDF vectors
X_train_combined = hstack((X_train_encoded, X_train_summary))

# Train ICD-10 classifier
rf_icd10 = RandomForestClassifier()
rf_icd10.fit(X_train_combined, y_icd10_train)

# Filter out unknown categories from the test set
X_test_filtered = X_test[X_test['Problem Category'].isin(X_train['Problem Category'].unique()) &
                         X_test['Psychological Category'].isin(X_train['Psychological Category'].unique())]

# Transform the filtered test set
X_test_encoded = encoder.transform(X_test_filtered[['Problem Category', 'Psychological Category']])
X_test_summary = tfidf_vectorizer.transform(X_test_filtered['Problem Summary'])
X_test_combined = hstack((X_test_encoded, X_test_summary))

# Make ICD-10 predictions
icd10_predictions = rf_icd10.predict(X_test_combined)


# Calculate ICD-10 accuracy
icd10_accuracy = accuracy_score(y_icd10_test.loc[X_test_filtered.index], icd10_predictions)

print("ICD-10 Accuracy:", icd10_accuracy)


ICD-10 Accuracy: 0.8888888888888888


In [None]:
X_test_encoded

<18x20 sparse matrix of type '<class 'numpy.float64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

class MedicalPredictor:

    def __init__(self, X_train, y_icd10_train, y_treatment_train):
        self.encoder = OneHotEncoder()
        self.tfidf_vectorizer = TfidfVectorizer()

        # One-hot encoding for categorical features
        X_train_encoded = self.encoder.fit_transform(X_train[['Problem Category', 'Psychological Category']])

        # TF-IDF vectorization for Problem Summary
        X_train_summary = self.tfidf_vectorizer.fit_transform(X_train['Problem Summary'])

        # Combine encoded features with TF-IDF vectors
        self.X_train_combined = hstack((X_train_encoded, X_train_summary))

        # Train ICD-10 classifier
        self.rf_icd10 = RandomForestClassifier()
        self.rf_icd10.fit(self.X_train_combined, y_icd10_train)

        # Train Treatment classifier
        self.rf_treatment = RandomForestClassifier()
        self.rf_treatment.fit(self.X_train_combined, y_treatment_train)

    def predict_input(self, input_tuple):
        input_data = pd.DataFrame([input_tuple], columns=['Problem Summary', 'Problem Category', 'Psychological Category'])

        # Transform input data
        input_encoded = self.encoder.transform(input_data[['Problem Category', 'Psychological Category']])
        input_summary = self.tfidf_vectorizer.transform(input_data['Problem Summary'])
        input_combined = hstack((input_encoded, input_summary))

        # Make predictions
        icd10_prediction = self.rf_icd10.predict(input_combined)[0]
        treatment_prediction = self.rf_treatment.predict(input_combined)[0]

        return icd10_prediction, treatment_prediction
    def calculate_jaccard_score(self, true_icd10_labels, true_treatment_labels):
        # Predict on the training data
        icd10_predictions = self.rf_icd10.predict(self.X_train_combined)
        treatment_predictions = self.rf_treatment.predict(self.X_train_combined)

        # Calculate Jaccard scores
        icd10_jaccard = jaccard_score(true_icd10_labels, icd10_predictions, average='macro')
        treatment_jaccard = jaccard_score(true_treatment_labels, treatment_predictions, average='macro')

        return icd10_jaccard, treatment_jaccard

# Example usage
data = pd.read_csv("tt.csv").dropna()
X = data[['Problem Summary', 'Problem Category', 'Psychological Category']]
y_icd10 = data['ICD-10']
y_treatment = data['Treatment Provided']
X_train, y_icd10_train, y_treatment_train = X, y_icd10, y_treatment  # Using entire dataset for demonstration

predictor = MedicalPredictor(X_train, y_icd10_train, y_treatment_train)

input_tuple = ('Post-Traumatic Stress', 'Psychosis', 'Schizophrenia Spectrum')
icd10_pred, treatment_pred = predictor.predict_input(input_tuple)

print("Predicted ICD-10:", icd10_pred)
print("Predicted Treatment Provided:", treatment_pred)

# Calculate and print Jaccard scores
icd10_jaccard, treatment_jaccard = predictor.calculate_jaccard_score(y_icd10_train, y_treatment_train)
print("Jaccard score for ICD-10 predictions:", icd10_jaccard)
print("Jaccard score for Treatment predictions:", treatment_jaccard)

# Example usage
data = pd.read_csv("tt.csv").dropna()
X = data[['Problem Summary', 'Problem Category', 'Psychological Category']]
y_icd10 = data['ICD-10']
y_treatment = data['Treatment Provided']
X_train, y_icd10_train, y_treatment_train = X, y_icd10, y_treatment  # Using entire dataset for demonstration

predictor = MedicalPredictor(X_train, y_icd10_train, y_treatment_train)

input_tuple = ('Post-Traumatic Stress', 'Psychosis', 'Schizophrenia Spectrum')
icd10_pred, treatment_pred = predictor.predict_input(input_tuple)

print("Predicted ICD-10:", icd10_pred)
print("Predicted Treatment Provided:", treatment_pred)


Predicted ICD-10: F43.10
Predicted Treatment Provided: Exposure Therapy
Jaccard score for ICD-10 predictions: 0.9296875
Jaccard score for Treatment predictions: 0.602432083553365
Predicted ICD-10: F43.10
Predicted Treatment Provided: Exposure Therapy


In [None]:
joblib.dump(predictor, 'regression_model.joblib')

['regression_model.joblib']

In [None]:
# newwwwwwwwww

In [12]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
import joblib
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

class MedicalPredictor:

    def __init__(self, X_train, y_icd10_train, y_treatment_train, train_flag=True, model_file="medical_predictor_model.pkl"):
        self.encoder = OneHotEncoder()
        self.tfidf_vectorizer = TfidfVectorizer()
        self.model_file = model_file

        if train_flag:
            # Combine training and test data for fitting OneHotEncoder
            X_combined = pd.concat([X_train, X_test], ignore_index=True)

            # One-hot encoding for categorical features
            X_combined_encoded = self.encoder.fit_transform(X_combined[['Problem Category', 'Psychological Category']])

            # TF-IDF vectorization for Problem Summary
            X_train_summary = self.tfidf_vectorizer.fit_transform(X_train['Problem Summary'])

            # Combine encoded features with TF-IDF vectors
            X_train_combined = hstack((X_combined_encoded[:len(X_train)], X_train_summary))

            # Train ICD-10 classifier
            self.rf_icd10 = RandomForestClassifier()
            self.rf_icd10.fit(X_train_combined, y_icd10_train)

            # Train Treatment classifier
            self.rf_treatment = RandomForestClassifier()
            self.rf_treatment.fit(X_train_combined, y_treatment_train)

            # Save the model
            self.save_model()

        else:
            # Load the model
            self.load_model()

    def save_model(self):
        joblib.dump((self.encoder, self.tfidf_vectorizer, self.rf_icd10, self.rf_treatment), self.model_file)

    def load_model(self):
        self.encoder, self.tfidf_vectorizer, self.rf_icd10, self.rf_treatment = joblib.load(self.model_file)

    def predict_input(self, input_tuple):
        input_data = pd.DataFrame([input_tuple], columns=['Problem Summary', 'Problem Category', 'Psychological Category'])

        # Transform input data
        input_encoded = self.encoder.transform(input_data[['Problem Category', 'Psychological Category']])
        input_summary = self.tfidf_vectorizer.transform(input_data['Problem Summary'])
        input_combined = hstack((input_encoded, input_summary))

        # Make predictions
        icd10_prediction = self.rf_icd10.predict(input_combined)[0]
        treatment_prediction = self.rf_treatment.predict(input_combined)[0]

        return icd10_prediction, treatment_prediction

    def calculate_jaccard_score(self, X_test, y_icd10_test, y_treatment_test):
        # Transform test data
        X_test_encoded = self.encoder.transform(X_test[['Problem Category', 'Psychological Category']])
        X_test_summary = self.tfidf_vectorizer.transform(X_test['Problem Summary'])
        X_test_combined = hstack((X_test_encoded, X_test_summary))

        # Predict on the test data
        icd10_predictions = self.rf_icd10.predict(X_test_combined)
        treatment_predictions = self.rf_treatment.predict(X_test_combined)

        # Calculate Jaccard scores
        icd10_jaccard = jaccard_score(y_icd10_test, icd10_predictions, average='macro')
        treatment_jaccard = jaccard_score(y_treatment_test, treatment_predictions, average='macro')

        return icd10_jaccard, treatment_jaccard

    def retrain_model(self, X_new, y_icd10_new, y_treatment_new):
        # One-hot encoding for categorical features
        X_new_encoded = self.encoder.transform(X_new[['Problem Category', 'Psychological Category']])

        # TF-IDF vectorization for Problem Summary
        X_new_summary = self.tfidf_vectorizer.transform(X_new['Problem Summary'])

        # Combine encoded features with TF-IDF vectors
        X_new_combined = hstack((X_new_encoded, X_new_summary))

        # Retrain ICD-10 classifier
        self.rf_icd10.fit(X_new_combined, y_icd10_new)

        # Retrain Treatment classifier
        self.rf_treatment.fit(X_new_combined, y_treatment_new)

        # Save the retrained model
        self.save_model()

# Example usage
data = pd.read_csv("tt.csv").dropna()

X = data[['Problem Summary', 'Problem Category', 'Psychological Category']]
y_icd10 = data['ICD-10']
y_treatment = data['Treatment Provided']

# Split data into training and testing sets
X_train, X_test, y_icd10_train, y_icd10_test, y_treatment_train, y_treatment_test = train_test_split(
    X, y_icd10, y_treatment, test_size=0.2, random_state=42)

# Train the model
predictor = MedicalPredictor(X_train, y_icd10_train, y_treatment_train, train_flag=True)

# Evaluate the model
icd10_jaccard, treatment_jaccard = predictor.calculate_jaccard_score(X_test, y_icd10_test, y_treatment_test)
print("ICD-10 Jaccard Score on Test Data:", icd10_jaccard)
print("Treatment Jaccard Score on Test Data:", treatment_jaccard)


ICD-10 Jaccard Score on Test Data: 0.6296296296296295
Treatment Jaccard Score on Test Data: 0.14305555555555557


In [None]:
X = data[['Problem Summary', 'Problem Category', 'Psychological Category']]
y_icd10 = data['ICD-10']
y_treatment = data['Treatment Provided']
X_train, y_icd10_train, y_treatment_train = X, y_icd10, y_treatment  # Using entire dataset for demonstration

# Train the model if it doesn't exist, otherwise load it from file
predictor = MedicalPredictor(X_train, y_icd10_train, y_treatment_train, train_flag=True)

In [None]:
input_tuple = ('Post-Traumatic Stress', 'Psychosis', 'Schizophrenia Spectrum')
icd10_pred, treatment_pred = predictor.predict_input(input_tuple)

In [None]:
print("Predicted ICD10 : " + icd10_pred)
print("Treatment : " +treatment_pred)

Predicted ICD10 : F43.10
Treatment : Antipsychotic Medication


In [None]:
true_icd10_labels = y_icd10_train
true_treatment_labels = y_treatment_train

# Calculate Jaccard scores
icd10_jaccard, treatment_jaccard = predictor.calculate_jaccard_score(true_icd10_labels, true_treatment_labels)

# Print the Jaccard scores
print("Jaccard score for ICD-10 codes:", icd10_jaccard)
print("Jaccard score for treatments:", treatment_jaccard)

Jaccard score for ICD-10 codes: 0.9296875
Jaccard score for treatments: 0.5991902834008097


In [None]:
datanew = pd.read_csv("tt.csv").dropna()
X = datanew[['Problem Summary', 'Problem Category', 'Psychological Category']]
y_icd10 = data['ICD-10']
y_treatment = data['Treatment Provided']
X_train, y_icd10_train, y_treatment_train = X, y_icd10, y_treatment
X_new, y_icd10_new, y_treatment_new = X_train, y_icd10_train, y_treatment_train
predictor.retrain_model(X_new, y_icd10_new, y_treatment_new)

In [11]:
import pandas as pd
import re
import nltk
import pickle
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define function for preprocessing text
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words('english'))  # Get stopwords
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    preprocessed_text = ' '.join(tokens)  # Join tokens back into a single string
    return preprocessed_text


# Define function for training or loading model
def train_or_load_model(training_data):
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    X_train = tfidf_vectorizer.fit_transform(training_data['Problem Description'])
    y_train = training_data[['Problem Summary', 'Problem Category', 'Psychological Category', 'ICD-10']]

    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    multi_output_classifier = MultiOutputClassifier(classifier)

    pipeline = Pipeline([
        ('tfidf', tfidf_vectorizer),
        ('clf', multi_output_classifier)
    ])

    pipeline.fit(training_data['Problem Description'], y_train)
    joblib.dump(pipeline, 'model.pkl')
    return pipeline


# Define function to check for new files and update model
def check_for_new_files_and_update_model():
    # latest_file = max([os.path.join(folder_path, f) for f in os.listdir(folder_path)], key=os.path.getctime)
    # print("Latest file:", latest_file)
    training_data = pd.read_csv('tt.csv')
    training_data=training_data.dropna()

    # if os.path.exists('/Users/shubhamsaxena/PycharmProjects/noideaProject/ModelState/model.pkl'):
    # with open('model.pkl', 'rb') as model_file:
    #     pipeline = pickle.load(model_file)
    # else:
    #     pipeline = train_or_load_model(training_data)

    pipeline = train_or_load_model(training_data)

    # with open('model.pkl', 'wb') as model_file:
    #     pickle.dump(pipeline, model_file)

    return pipeline


# Define function to make predictions on incoming dataset
def make_predictions_on_incoming_dataset(incoming_dataset, pipeline):
    incoming_data = pd.DataFrame([incoming_dataset])
    incoming_data['Problem Description'] = incoming_data['problem_description'].apply(preprocess_text)
    incoming_data.drop(columns=['problem_description'], inplace=True)

    predictions = pipeline.predict(incoming_data['Problem Description'])
    return predictions


# Example of incoming dataset
incoming_dataset = {"age":26, "gender":"Female", "problem_description":"Sarah a 26-year-old woman experiences intense fear and anxiety in social situations making it difficult for her to interact with others. She worries excessively about being judged or humiliated which leads her to avoid social gatherings and events. Sarah's social anxiety has impacted her ability to form meaningful connections and pursue opportunities in her personal and professional life. Despite her desire to overcome her fears Sarah feels trapped by her anxiety and unable to break free from its grip."}

# Specify folder path where new files will be added
# folder_path = '/Users/shubhamsaxena/Desktop/NewDat'

# Check for new files and update model
pipeline = check_for_new_files_and_update_model()

# Make predictions on incoming dataset
predictions = make_predictions_on_incoming_dataset(incoming_dataset, pipeline)
print("Predictions on Incoming Dataset:")
print(predictions)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predictions on Incoming Dataset:
[['Obsessive-Compulsive Disorder' 'Anxiety Disorders' 'Anxiety' 'F45.21']]
