<a href="https://colab.research.google.com/github/stefanmzeidler/CS710-Homework2/blob/main/Zeidler_Programming_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs, downloads, and imports

In [1]:
!pip install gensim
!pip install nltk



In [43]:
import nltk
# Fetch the NLTK data packages needed at runtime. 'stopwords' backs the
# stopwords.words('english') call in remove_stop_words; 'punkt_tab' is presumably
# the tokenizer data used by sent_tokenize/word_tokenize — TODO confirm it is
# still required, since neither tokenizer is called in the cells shown here.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd


# Util Functions

In [3]:
#Provided by Professor He
import nltk
import json
import os
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
import sys

def read_txt_files_from_directory(directory_path):
    """Collect the text of every ``.txt`` file directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        dict mapping file name -> file content decoded as UTF-8. A file that
        cannot be read is reported on stdout and left out of the result. If
        the directory itself cannot be processed, the error is printed and an
        empty dict is returned.
    """
    texts_by_name = {}
    try:
        for filename in os.listdir(directory_path):
            # Only plain-text files are of interest.
            if not filename.endswith('.txt'):
                continue
            full_path = os.path.join(directory_path, filename)
            try:
                with open(full_path, 'r', encoding='utf-8') as handle:
                    texts_by_name[filename] = handle.read()
            except Exception as e:
                # Best effort: skip the unreadable file, keep going.
                print(f"An error occurred while reading {filename}: {e}")
    except Exception as e:
        print(f"An error occurred while accessing the directory: {e}")
        return {}
    return texts_by_name

def load_from_json(filename):
    """Parse a JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened or
        parsed (the error is printed to stdout).
    """
    try:
        with open(filename, 'r') as json_file:
            return json.load(json_file)
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
    return None

def remove_punctuation(text):
    """Tokenize ``text`` with the pattern ``\\w+`` and return the token list.

    Everything the pattern does not match (punctuation, whitespace) is dropped.
    """
    return RegexpTokenizer(r'\w+').tokenize(text)

def remove_stop_words(tokens):
    """Return the tokens that are not in NLTK's English stop-word list.

    Order and duplicates of the surviving tokens are preserved. The comparison
    is an exact string match against the stop-word entries.
    """
    english_stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stop_words]

def stemming(tokens):
    """Apply the Porter stemmer to every token and return the stems as a list."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

def pre_process(text):
    """Turn raw text into a list of normalized tokens.

    Pipeline: lowercase -> strip punctuation/tokenize -> drop English stop
    words -> Porter-stem.
    """
    words = remove_punctuation(text.lower())
    content_words = remove_stop_words(words)
    return stemming(content_words)

# TF-IDF Vectorizer

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def tfidf_similarity(data):
  """Add a ``top5_tfidf`` column with each row's most similar other rows.

  The ``patient`` text of every row is vectorized with TF-IDF, pairwise cosine
  similarity is computed, and for each row the ``patient_uid`` values of the
  (up to) five most similar *other* rows are stored, most similar first.

  Args:
    data: DataFrame with a ``patient`` text column and a ``patient_uid``
      column. It is modified in place.

  Returns:
    None. The first rows of the updated frame are printed.
  """
  top_k = 5
  tfidf = TfidfVectorizer()
  vector_matrix = tfidf.fit_transform(data['patient'])
  sim_matrix = cosine_similarity(vector_matrix)
  # Exclude every row from its own neighbour list explicitly. Dropping "the
  # first hit" instead is wrong whenever another row has identical text (or a
  # rounding tie), because the self-match is then not guaranteed to sort first.
  np.fill_diagonal(sim_matrix, -np.inf)
  n_rows = sim_matrix.shape[0]
  # A frame with fewer than top_k + 1 rows cannot supply top_k neighbours.
  k = min(top_k, n_rows - 1)
  if k > 0:
    # Giving every rank 0..k-1 as kth leaves the first k columns fully sorted.
    neighbours = np.argpartition(-sim_matrix, np.arange(k), axis=1)[:, :k]
  else:
    neighbours = np.empty((n_rows, 0), dtype=int)
  # Rows of sim_matrix are positional, so translate row positions (not index
  # labels) into patient_uid values.
  uids = data['patient_uid'].to_numpy()
  data['top5_tfidf'] = [uids[row].tolist() for row in neighbours]
  print(data.head())


def id_list(data,index_list):
  """Translate row positions into ``patient_uid`` values.

  Args:
    data: DataFrame with a ``patient_uid`` column.
    index_list: Iterable of 0-based row positions (for example the column
      indices of a similarity matrix built from ``data``).

  Returns:
    list of the ``patient_uid`` values at those positions, in the same order.
  """
  # Positional lookup on purpose: label-based access (.at) only coincides with
  # row positions while the frame still has its default 0..n-1 index.
  uid_column = data['patient_uid']
  return [uid_column.iloc[position] for position in index_list]




   patient_id patient_uid      PMID                         file_path  \
0           0   7665777-1  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
1           1   7665777-2  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
2           2   7665777-3  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
3           3   7665777-4  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
4           4   7665777-5  33492400  comm/PMC007xxxxxx/PMC7665777.xml   

                                               title  \
0  Early Physical Therapist Interventions for Pat...   
1  Early Physical Therapist Interventions for Pat...   
2  Early Physical Therapist Interventions for Pat...   
3  Early Physical Therapist Interventions for Pat...   
4  Early Physical Therapist Interventions for Pat...   

                                             patient               age gender  \
0  This 60-year-old male was hospitalized due to ...  [[60.0, 'year']]      M   
1  A 39-year-old man was hospitalized due to an i...  [[39.0, 

# Doc2Vec Vectorizer

In [58]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile

def load_dataset(fname,nrows=1000):
  """Read the first ``nrows`` rows of a CSV and tokenize its ``patient`` column.

  Args:
    fname: CSV file name, passed on to ``safe_read_csv``.
    nrows: Number of rows to read (default 1000).

  Returns:
    The loaded DataFrame with an extra ``tokens`` column holding the
    ``pre_process`` output for each ``patient`` text.
  """
  frame = safe_read_csv(fname,nrows)
  frame['tokens'] = frame['patient'].apply(pre_process)
  print("Data loaded")
  return frame

def safe_read_csv(fname,nrows):
  """Read up to ``nrows`` rows of ``fname`` located under ``PROJ_DIRECTORY``.

  Returns:
    The DataFrame, or ``None`` when anything goes wrong (the error is printed
    to stdout instead of being raised).
  """
  try:
    csv_path = os.path.join(PROJ_DIRECTORY,fname)
    frame = pd.read_csv(csv_path, nrows=nrows)
  except Exception as e:
    print(f"An error occurred while reading the file: {e}")
    return None
  return frame

#Adapted from https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
def read_corpus(data, tokens_only = False):
    """Yield one corpus item per row of ``data['tokens']``.

    With ``tokens_only`` true the stored token lists are yielded as they are;
    otherwise each one is wrapped in a ``TaggedDocument`` tagged with the
    row's index label, which is the form used for training.
    """
    token_series = data['tokens']
    if tokens_only:
        yield from token_series
        return
    for tag, tokens in token_series.items():
        yield gensim.models.doc2vec.TaggedDocument(tokens, [tag])

def train_doc2vec(data, vector_size=50, min_count=1, epochs=40):
  """Train a Doc2Vec model on ``data['tokens']`` and save it to disk.

  Args:
    data: DataFrame with a ``tokens`` column (one token list per row).
    vector_size: Dimensionality of the document vectors.
    min_count: Words occurring fewer times than this are ignored.
    epochs: Number of training passes over the corpus.

  Returns:
    The trained model. A copy is saved as ``my_doc2vec_model`` inside
    ``PROJ_DIRECTORY``.
  """
  model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
  # Materialize the corpus once: it is iterated by build_vocab and again by train.
  train_corpus = list(read_corpus(data))
  model.build_vocab(train_corpus)
  print("Vocab built")
  print("Starting training")
  model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
  # Save straight into the project directory. gensim.test.utils.get_tmpfile is
  # deliberately not used: it creates a new temporary directory on every call
  # and joins the given name onto it, which is meant for throw-away test files.
  fname = os.path.join(PROJ_DIRECTORY,"my_doc2vec_model")
  model.save(fname)
  print("Model saved")
  return model

def calc_doc2vec_vectors(model, data):
  """Infer a vector for every row and store it in ``data['doc2vec_vectors']``.

  Args:
    model: Object exposing ``infer_vector(tokens)`` (a trained Doc2Vec model).
    data: DataFrame with a ``tokens`` column. It is modified in place.

  Returns:
    None. Progress messages are printed before and after the computation.
  """
  print("Calculating Doc2Vec vectors")
  data['doc2vec_vectors'] = data['tokens'].apply(model.infer_vector)
  print("Vectors calculated")


def load_model(fname):
  """Load and return the Doc2Vec model stored at ``fname``."""
  model = Doc2Vec.load(fname)
  return model






In [75]:
# Project root: safe_read_csv and train_doc2vec build their file paths from it.
PROJ_DIRECTORY ="/content/drive/MyDrive/HI744_Programming_Assignment_1"
# Read the CSV (load_dataset defaults to the first 1000 rows) and add the 'tokens' column.
data = load_dataset("PMC-Patients.csv")
# Train and save the Doc2Vec model, then store one inferred vector per row.
doc2vec_model = train_doc2vec(data)
calc_doc2vec_vectors(doc2vec_model,data)
# NOTE(review): calc_tfidf_vectors is not defined in any cell visible in this
# notebook — confirm it still exists, otherwise this line raises NameError on
# a fresh kernel (Restart & Run All).
calc_tfidf_vectors(data)
# Adds the 'top5_tfidf' column to data and prints data.head().
tfidf_similarity(data)
# --- scratch code kept from development (not executed) ---
# PROJ_DIRECTORY = os.getcwd()
# print(data.head())
# train_corpus = list(read_corpus(data))
# test_corpus = list(read_corpus(data, tokens_only=True))
# print(train_corpus[:3])
# model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=1, epochs=40)
# model.build_vocab(train_corpus)
# model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
# print(data['tokens'][1])
# vector = model.infer_vector(data['tokens'][1])
# print(vector)





<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 233817 stored elements and shape (1000, 22682)>
  Coords	Values
  (0, 20766)	0.11143435450245165
  (0, 1018)	0.04561520420845719
  (0, 22583)	0.016681030483014603
  (0, 14971)	0.016107544895133923
  (0, 12350)	0.035949987098507566
  (0, 22296)	0.04827438002199669
  (0, 9825)	0.06286041449671415
  (0, 6840)	0.03114373070050666
  (0, 20957)	0.21383889413556914
  (0, 13100)	0.049056769114834364
  (0, 2539)	0.08488007200799244
  (0, 8626)	0.04642679869460891
  (0, 5469)	0.061125912015148276
  (0, 350)	0.04530875434266244
  (0, 22458)	0.048467692563144406
  (0, 20300)	0.03355771061218345
  (0, 14946)	0.0479379587231723
  (0, 8172)	0.04202962781042252
  (0, 6813)	0.06625278820444788
  (0, 5429)	0.10591473886133504
  (0, 2174)	0.17524650765915553
  (0, 6928)	0.11408610498601464
  (0, 22333)	0.14137716502517184
  (0, 7275)	0.08753352991225126
  (0, 18993)	0.04830621254473597
  :	:
  (999, 17858)	0.08411498025188792
  (999, 4153)	0.0