In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import math
import re
import gensim
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, f1_score, precision_score, recall_score, roc_auc_score

# NLTK resources used by this notebook:
#   stopwords -> stop-word filtering, wordnet -> WordNetLemmatizer,
#   punkt     -> word_tokenize (used in the Word2Vec section).
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt';
# add it to the tuple if word_tokenize raises LookupError.
for nltk_package in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_package)

# None means "do not truncate column text". The old sentinel -1 was deprecated
# in pandas 1.0 and is rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mkumari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mkumari\AppData\Roaming\nltk_data...


# Data Pre-processing

In [None]:
data = pd.read_csv('top_50_notes.csv', encoding='utf-8')
stop_words = set(stopwords.words('english'))
# Built once: constructing a lemmatizer for every row is loop-invariant work.
lemmatizer = WordNetLemmatizer()

# Punctuation that is replaced by a space. The hyphen is escaped on purpose:
# unescaped, "#-:" is a character *range* that also covers the digits 0-9,
# which turned digits into spaces and made the digit-removal step a no-op.
PUNCTUATION_RE = re.compile(r"[,‘@#\-:'?.$%_!()&;+”/…*•|“]")
# A token that is exactly one ASCII letter.
SINGLE_LETTER_RE = re.compile(r"[a-zA-Z]")


def clean_note(note):
    """Normalise one NOTE value into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase;
      2. drop text enclosed in square brackets, then any stray brackets;
      3. replace punctuation and double quotes with a space;
      4. delete digits;
      5. split on whitespace and drop stop words and single-letter tokens;
      6. lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    note : str
        Raw note text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; may be the empty string.
    """
    text = note.lower()
    text = re.sub(r"\[.*?\]", "", text)   # text enclosed in [...]
    text = re.sub(r"[\[\]]+", "", text)   # unmatched brackets left behind
    text = PUNCTUATION_RE.sub(" ", text)
    text = text.replace('"', " ")
    text = re.sub(r"\d", "", text)

    # str.split() with no argument splits on any whitespace run, so tabs,
    # newlines and repeated spaces need no separate substitution.
    # Filtering single letters per token (instead of the former
    # r'\s+[a-zA-Z]\s+' substitutions) also catches letters at the start/end
    # of the note and consecutive single letters, and never glues the
    # neighbouring words together.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words and not SINGLE_LETTER_RE.fullmatch(token)
    ]
    return " ".join(tokens)


# Missing notes are treated as empty text rather than crashing on .lower().
data['NOTE'] = data['NOTE'].fillna('').map(clean_note)

# Save the pre-processed notes for the embedding step
data.to_csv('data_preporssing_05042023.csv', index=False)

# Word2Vec Word Embedding

In [None]:
# Load the pre-processed notes written by the cleaning step
df = pd.read_csv('data_preporssing_05042023.csv', encoding='utf-8')

# Tokenize the NOTE column.
# A note that was cleaned down to an empty string is stored as an empty CSV
# field and read back as NaN (a float), which has no .lower(); map those to ''
# so they tokenize to an empty list instead of raising AttributeError.
df['NOTE'] = df['NOTE'].fillna('').apply(lambda x: word_tokenize(x.lower()))

# Train a skip-gram (sg=1) Word2Vec model with 128-dimensional vectors;
# min_count=10 drops words seen fewer than 10 times in the corpus.
model = Word2Vec(df['NOTE'], sg=1, min_count=10, vector_size=128, window=5, workers=4)

# Save the model
model.save('word2Vec_embeddings_05042023.bin')