# Identifying Similar Posts

This notebook performs a cosine-distance-based similarity analysis of the collected posts, using their NMF topic weights as the feature space. It also includes a simple 'recommender' that returns the posts most similar to a given post or to new, unseen text.

In [3]:
import pickle

In [4]:
import re

In [5]:
import string

In [6]:
import plotly.express as px

In [7]:
# Import other packages for examples
import pandas as pd
import numpy as np
import scipy.sparse as ss

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [8]:
from sklearn.metrics import pairwise_distances

In [9]:
# Make better use of Jupyter Notebook cell width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Pandas display options: allow long/wide frames to render in full, never
# truncate cell text, and print floats with thousands separators and
# 10 decimal places.
pd.set_option('display.max_rows', 660)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)
pd.options.display.float_format = '{:,.10f}'.format

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF
from sklearn.metrics import pairwise_distances

## Load the data

In [2]:
# Load the pickled posts DataFrame. The context manager guarantees the file
# handle is closed, even if unpickling raises.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("/Users/arcarter/Git_Repos/project_05/df_zipped_post3.p", "rb") as pickle_file:
    df_zipped_post3 = pickle.load(pickle_file)

df_zipped_post3.shape

(7824, 50)

## Vectorize

In [11]:
from sklearn.feature_extraction import text 

In [12]:
# Extra stop words layered on top of scikit-learn's built-in English list.
additional_stop_words = ['rt', 'vets', 'vet', 'veteran', 'veterans']

# The extended list begins with the same five base terms, followed by
# further tokens to exclude from the vocabulary.
additional_stop_words2 = additional_stop_words + [
    'Veterans_Benefits_Admin',
    'veterans_benefits_admin',
    'veterans_health_admin',
    'Veterans_Health_Admin',
    'department_of_va',
    'veteransbenefitsadmin departmentofva',
    'veteransbenefitsadmin departmentofva veterans_health_adminadminadminadminadmin',
    'departmentofva veterans_health_adminadminadminadminadmin',
    'veteransbenefitsadmin',
    'veterans_health_adminadminadminadminadmin',
    'departmentofva',
    'va',
    'cnn',
    'foxnews',
    'msnbc',
    'nbc',
    'abc',
    'fox',
    'tapatalk',
    'cruiser',
    'year',
    'ago',
    'old',
    'just',
    'know',
    'said',
    'dont',
    'don',
    't',
    'did',
    'like',
    'think',
    'told',
    'sent',
    'using',
    'wrote',
    'pm',
    'feb',
    'good_morning_evening',
]

# Merge each custom list with the built-in English stop words (frozensets).
stop_wordz = text.ENGLISH_STOP_WORDS | frozenset(additional_stop_words)
stop_wordz2 = text.ENGLISH_STOP_WORDS | frozenset(additional_stop_words2)

In [15]:
# Bag-of-words vectorizer (raw counts, binary=False) using the extended
# stop-word set.
# `stop_wordz2` is a frozenset, but scikit-learn documents `stop_words` as
# 'english', a list, or None, and recent releases reject other types during
# parameter validation. Passing a sorted list works on old and new versions
# and is deterministic.
vectorizer = CountVectorizer(
    # max_features=20000,
    # token_pattern="\\b[a-z][a-z]+\\b",
    stop_words=sorted(stop_wordz2),
    binary=False,
)

In [16]:
# Learn the vocabulary from the cleaned post text and build the
# document-term count matrix.
doc_word = vectorizer.fit_transform(df_zipped_post3.post_text)

# Vocabulary terms in column order. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out`
# and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = list(vectorizer.get_feature_names())

## Topic Modeling with NMF

In [17]:
# Factorize the document-term counts into 10 NMF components; `doc_topic`
# holds one row of component weights per document.
nmf_model = NMF(n_components=10)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(7824, 10)

In [18]:
# Component-by-term weight matrix learned by the fitted NMF model.
topic_word = nmf_model.components_
topic_word.shape

(10, 17977)

In [19]:
# Vocabulary terms in column order. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out`
# and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# For each component, the column indices of its six highest-weighted terms,
# largest first (ascending argsort, then the last six taken in reverse).
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]

# Translate the indices back into vocabulary terms, one list per component.
topic_words = [[words[term_idx] for term_idx in component_row] for component_row in t]
topic_words

[['time', 'years', 'im', 'got', 'days', 'good'],
 ['right', 'disability', 'service_connected', 'static', 'pain', 'extremity'],
 ['appeal', 'board_veterans_appeals', 'decision', 'appeals', 'remand', 'case'],
 ['claim', 'file', 'claims', 'letter', 'filed', 'denied'],
 ['evidence', 'service', 'cue', 'diagnosis', 'denied', 'current'],
 ['decision', 'board', 'court', 'vetapp', 'appeal', 'smc'],
 ['use', 'nsaid', 'ckd', 'service', 'opinion', 'nsaids'],
 ['evaluation', 'percent', 'assigned', 'effective', 'cfr', 'impairment'],
 ['rating', 'disability', 'vso', 'rule', 'condition', 'based'],
 ['iu', 'exam', 'dbq', 'rater', 'examiner', 'based']]

In [20]:
# Wrap the document-topic weights (rounded to 5 decimals) in a DataFrame
# with columns labelled component_1 ... component_10.
doc_topic_nmf = pd.DataFrame(
    doc_topic.round(5),
    columns=[f"component_{k}" for k in range(1, 11)],
)

doc_topic_nmf

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10
0,0.0000000000,0.0000000000,0.0029200000,0.0000000000,0.0297400000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000
1,0.0000000000,0.0000000000,0.0675900000,0.1228300000,0.0000000000,0.0051500000,0.0000000000,0.0085500000,0.0005800000,0.0000000000
2,0.0000000000,0.0000000000,0.3257600000,0.0944700000,0.0018400000,0.0054400000,0.0000000000,0.0293500000,0.0000000000,0.0000000000
3,0.0120500000,0.0670000000,0.0861000000,0.0000000000,0.2081700000,0.1666800000,0.0000000000,0.0000000000,0.0000000000,0.0198500000
4,0.0000000000,0.0000000000,0.0005400000,0.0014300000,0.0650600000,0.0001500000,0.0307300000,0.0000000000,0.0000000000,0.0000000000
...,...,...,...,...,...,...,...,...,...,...
7819,0.0533800000,0.0000000000,0.0373400000,0.0354000000,0.0295900000,0.0000000000,0.0006000000,0.1513600000,0.0225600000,0.0263400000
7820,0.0015300000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0002000000,0.0000000000
7821,0.1360500000,0.0196800000,0.0360600000,0.0228500000,0.0776100000,0.0000000000,0.0238000000,0.0353200000,0.1524300000,0.0239400000
7822,0.0275900000,0.0000000000,0.0033200000,0.0105800000,0.0000000000,0.0000000000,0.0013000000,0.0000000000,0.0000000000,0.0201800000


## Calculate Distance & Identify Similar Posts

In [21]:
# Cosine distance from row 4 of doc_topic (reshaped to a single-row 2-D
# array) to every row of doc_topic.
pairwise_distances(doc_topic[4].reshape(1,-1),doc_topic,metric='cosine')

array([[0.09959362, 0.97895237, 0.98244169, ..., 0.64582417, 0.97797935,
        0.99502321]])

In [22]:
# NOTE(review): this cell repeats the preceding cell's expression verbatim;
# it looks like a leftover and could be removed.
pairwise_distances(doc_topic[4].reshape(1,-1),doc_topic,metric='cosine')

array([[0.09959362, 0.97895237, 0.98244169, ..., 0.64582417, 0.97797935,
        0.99502321]])

In [23]:
# Row positions of doc_topic ordered from smallest to largest cosine
# distance to row 4 (result is a 1 x n_documents index array).
distance_measure = np.argsort(pairwise_distances(doc_topic[[4]], doc_topic, metric='cosine'))

In [24]:
# Keep the five closest row positions (first five entries of the ranking).
post_ids = list(distance_measure[0][0:5])

In [34]:
# Look up the matching posts.
# NOTE(review): post_ids are row positions produced by argsort, while .loc
# selects by index label; this is only equivalent if the frame has a default
# 0..n-1 index — confirm, or use .iloc.
similar_posts_list = df_zipped_post3.loc[post_ids]

In [38]:
# Show the first row of the result, i.e. the closest match in the ranking.
similar_posts_list.head(1)

Unnamed: 0,post_id_container,post_text,post_time,post_author,original_post_id_container,topic_title,post_hour_min,post_date,dow,week,week_alt,month,year2,Topic_Count,Topic_Name_or_Other,combined_time,post_text_original,service_connected,cp_exam,effective_date,service_connection,medical_record,coronaviruspandemic_vaccine,mental_health,good_luck,active_duty,sleep_apnea,rating_decision,hearing_loss,decision_letter,aid_attendance,medical_evidence,supplemental_claim,new_claim,caregiver_program,file_claim,gon_na,Covid_Docs,Disability_and_Appeals_Docs,Mental_Health_Docs,Aid_and_Attendance_Docs,Sleep_Docs,City_Docs,Waiting_Docs,College_Kids_Docs,Cold_Weather_Docs,Cancer_and_Disease_Docs,top_topic,Top Topic,Top Topic - Covid
4,post_content1418699,did he review your military medical_records and post service medical_records,"7:43 PM - Sep 04, 2019","[""POST_AUTHOR"", ""EKco22""]",post_content1511221,Legacy Appeals; Actual number of days from Appeal Certification,19:43:00,2019-09-04,2,2019-09-02/2019-09-08,2019-w36,2019-09,2019,306,Legacy Appeals; Actual number of days from Appeal Certification,2019-09-04 19:43:00,Did he review your military medical records and post service medical records?,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1e-06,9.58201e-05,3.65771e-05,1.08224e-05,1e-06,0.0009353297,0.0005996073,0.0006265097,0.000572627,0.0009576043,9,Cancer and Disease,Other


#### ...Create function

In [36]:
def similar_posts(df, id, n=5):
    """Return the rows of ``df`` closest to one post in NMF topic space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from. Rows are picked by position, so ``df`` is
        assumed to be in the same row order as the global ``doc_topic``
        matrix (TODO confirm for callers passing other frames).
    id : int
        Row position in ``doc_topic`` of the post to use as the query.
    n : int, default 5
        Number of rows to return, nearest first. The query post itself is
        typically among them, since its distance to itself is 0.

    Returns
    -------
    pandas.DataFrame
        The ``n`` rows of ``df`` with the smallest cosine distance to the
        query post, ordered from nearest to farthest.
    """
    # Use the requested post as the query (previously row 4 was hard-coded
    # and the ``id`` argument was ignored).
    query = doc_topic[id].reshape(1, -1)
    ranked = pairwise_distances(query, doc_topic, metric='cosine').argsort()
    nearest = list(ranked[0][:n])
    # argsort yields row positions, not index labels, so select positionally.
    return df.iloc[nearest]

In [37]:
# Display the posts returned by similar_posts for post 4.
similar_posts(df_zipped_post3,4)

Unnamed: 0,post_id_container,post_text,post_time,post_author,original_post_id_container,topic_title,post_hour_min,post_date,dow,week,week_alt,month,year2,Topic_Count,Topic_Name_or_Other,combined_time,post_text_original,service_connected,cp_exam,effective_date,service_connection,medical_record,coronaviruspandemic_vaccine,mental_health,good_luck,active_duty,sleep_apnea,rating_decision,hearing_loss,decision_letter,aid_attendance,medical_evidence,supplemental_claim,new_claim,caregiver_program,file_claim,gon_na,Covid_Docs,Disability_and_Appeals_Docs,Mental_Health_Docs,Aid_and_Attendance_Docs,Sleep_Docs,City_Docs,Waiting_Docs,College_Kids_Docs,Cold_Weather_Docs,Cancer_and_Disease_Docs,top_topic,Top Topic,Top Topic - Covid
4,post_content1418699,did he review your military medical_records and post service medical_records,"7:43 PM - Sep 04, 2019","[""POST_AUTHOR"", ""EKco22""]",post_content1511221,Legacy Appeals; Actual number of days from Appeal Certification,19:43:00,2019-09-04,2,2019-09-02/2019-09-08,2019-w36,2019-09,2019,306,Legacy Appeals; Actual number of days from Appeal Certification,2019-09-04 19:43:00,Did he review your military medical records and post service medical records?,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1e-06,9.58201e-05,3.65771e-05,1.08224e-05,1e-06,0.0009353297,0.0005996073,0.0006265097,0.000572627,0.0009576043,9,Cancer and Disease,Other
2215,post_content1503310,do your service medical_records show treatment for athletes feet how long have you been out from active military service,"4:14 PM - Jan 14, 2021","[""POST_AUTHOR"", ""USMCgruntrvn""]",post_content1503149,VA Eval Form for Athlete's Foot,16:14:00,2021-01-14,3,2021-01-11/2021-01-17,2021-w2,2021-01,2021,6,Other,2021-01-14 16:14:00,Do your Service Medical Records show treatment for Athletes Feet? How long have you been out from active military service?,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1e-06,4.1268e-06,8.16745e-05,1.08235e-05,1e-06,0.0009254017,0.0051637921,0.00062908,0.0005697683,0.0637882711,9,Cancer and Disease,Other
6558,post_content1510397,there are many things that can cause this it should be diagnosed before discarding the idea that it could be service related,"6:48 AM - Mar 08, 2021","[""POST_AUTHOR"", ""Charlie8d""]",post_content1510334,neuropathy of feet,06:48:00,2021-03-08,0,2021-03-08/2021-03-14,2021-w10,2021-03,2021,20,Other,2021-03-08 06:48:00,There are many things that can cause this & it should be diagnosed before discarding the idea that it could be service related.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1e-06,1e-06,0.0009729693,1.08355e-05,1e-06,0.0009380784,0.0024098002,5.1086e-06,0.000572532,0.0074294817,9,Cancer and Disease,Other
6599,post_content1510455,i forgot to welcome you to the forum so welcome and thank you for your service,"2:09 PM - Mar 08, 2021","[""POST_AUTHOR"", ""JNoxon""]",post_content1510449,VA Disability rating prior to decision being made,14:09:00,2021-03-08,0,2021-03-08/2021-03-14,2021-w10,2021-03,2021,8,Other,2021-03-08 14:09:00,I forgot to welcome you to the forum! So welcome and thank you for your service!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1e-06,1e-06,1e-06,1.10822e-05,1e-06,0.0009363238,0.0021889677,0.0006263334,0.0005724206,0.0102655947,9,Cancer and Disease,Other
5894,post_content1509556,there is no specific requirement that you have a dd however there must be some official verification of your service as brovet requested please provide some details on your service and your disabling conditions that you believe to be due to that service cruiser,"7:48 AM - Mar 02, 2021","[""POST_AUTHOR"", ""Cruiser""]",post_content1509524,Denial of VA claim,07:48:00,2021-03-02,1,2021-03-01/2021-03-07,2021-w9,2021-03,2021,11,Other,2021-03-02 07:48:00,"There is no specific requirement that you have a DD 214; however, there must be some official verification of your service. As BROVET requested, please provide some details on your service and your disabling conditions that you believe to be due to that service. Cruiser",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1e-06,0.999999,5.8334e-06,1.07234e-05,5.6988e-05,0.0009354844,0.0006009014,0.0006283567,0.0005729379,0.0010089558,1,Disability and Appeals,Other


## Create Function for Testing New, Unseen Text
...starting with test setup

In [42]:
# Sample unseen text, wrapped in a list because the vectorizer is given a
# collection of documents.
# NOTE(review): this rebinds `t`, which earlier held the top-term indices
# per topic.
t = ['diabetes']

In [43]:
# Convert the sample text to term counts using the already-fitted vocabulary.
# NOTE(review): assigning to `vt` replaces the `vis_topic` module alias that
# was imported as `vt` near the top of the notebook — consider another name.
vt = vectorizer.transform(t)

In [44]:
# Project the term counts onto the fitted NMF components.
tt = nmf_model.transform(vt)

In [45]:
# Rank every document by cosine distance to the sample text (nearest first).
pairwise_distances(tt,doc_topic,metric='cosine').argsort()

array([[5491, 1600, 1184, ..., 1245, 3762, 1705]])

In [46]:
# Same ranking with an explicit reshape of the query to a single row.
pairwise_distances(tt.reshape(1,-1),doc_topic,metric='cosine').argsort()

array([[5491, 1600, 1184, ..., 1245, 3762, 1705]])

In [47]:
# Store the ranking of document row positions, nearest to the sample text
# first. This overwrites the earlier `distance_measure`.
distance_measure = pairwise_distances(tt.reshape(1,-1),doc_topic,metric='cosine').argsort()

In [48]:
# Keep the 20 closest row positions.
post_ids = list(distance_measure[0][0:20])

In [49]:
# Look up the matching posts.
# NOTE(review): post_ids are row positions produced by argsort, while .loc
# selects by index label; this is only equivalent if the frame has a default
# 0..n-1 index — confirm, or use .iloc.
similar_posts_list = df_zipped_post3.loc[post_ids]

In [50]:
# Show the thread title and original text of the first (closest) match.
similar_posts_list[['topic_title','post_text_original']].head(1)

Unnamed: 0,topic_title,post_text_original
5491,Hypothryoidism and Agent Orange,"livelsberger wrote: 4:21 PM - Feb 21Years ago at least 35, I put in a claim for hypothyroidism caused by agent orange it was denied. Now since it is on the list do I file as a new claim or ask for a DRO review? Thanks Gents. My father in-law just got service connected for agent orange, but I don't see hypothyroidism on the list? I'm looking at this list. *Edited by EKco22 to remove law firm website*"


#### ...and now the function

In [39]:
def get_similar(df, user_text, n=20):
    """Return the posts in ``df`` most similar to a piece of new text.

    The text is vectorized with the global fitted ``vectorizer``, projected
    onto the global fitted ``nmf_model``, and compared by cosine distance
    against every row of the global ``doc_topic`` matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from; must contain the ``topic_title`` and
        ``post_text_original`` columns. Rows are picked by position, so
        ``df`` is assumed to be in the same row order as ``doc_topic``
        (TODO confirm for callers passing other frames).
    user_text : str
        The new, unseen text to match against the posts.
    n : int, default 20
        Maximum number of rows to return, nearest first.

    Returns
    -------
    pandas.DataFrame
        The ``topic_title`` and ``post_text_original`` columns of the
        nearest rows, ordered from nearest to farthest.
    """
    # The vectorizer is given a collection of documents, so wrap the string.
    text_counts = vectorizer.transform([user_text])
    text_topics = nmf_model.transform(text_counts)
    # NOTE(review): text with no in-vocabulary tokens presumably maps to an
    # all-zero topic vector, which would make the ranking uninformative —
    # confirm and consider guarding against it.
    ranked = pairwise_distances(text_topics, doc_topic, metric='cosine').argsort()
    nearest = list(ranked[0][:n])
    # argsort yields row positions, not index labels, so select positionally.
    return df.iloc[nearest][['topic_title', 'post_text_original']]

In [51]:
# Show the first four rows returned for the query text 'diabetes'.
get_similar(df_zipped_post3,'diabetes')[:4]

Unnamed: 0,topic_title,post_text_original
5491,Hypothryoidism and Agent Orange,"livelsberger wrote: 4:21 PM - Feb 21Years ago at least 35, I put in a claim for hypothyroidism caused by agent orange it was denied. Now since it is on the list do I file as a new claim or ask for a DRO review? Thanks Gents. My father in-law just got service connected for agent orange, but I don't see hypothyroidism on the list? I'm looking at this list. *Edited by EKco22 to remove law firm website*"
1600,Quesstions Re Filing Diabetes Clailm,I got message from my doctor and am diagnosed as a diabetic. I have to have an appointment with him to discuss mediation or trying to control things by diet and exercise. After we decide on a course of treatment I will file a claim.
1184,Chernobyl Illnesss From West Germany,"FarmerInTheDell wrote: Aug 21, 2020Hi Miguel, I lost the case. They said that Hypothyroidism is a common illness and not enough evidence to suggest Chernobyl. I disagree. I think it’s what triggered my hypothyroidism and my autoimmune disease! You could go on Google , and start with Chernobyl radiation poisoning cloud over Germany and start your search the Chernobyl Affects ,Chernobyl areas affected by radiation ,and Military vehicles and aircraft exhaust exposure, Chernobyl acciddent and its consequences, public health (fuels petroleum, oil,lubricant how the Chernobyl cloud affected cognitive abilities in Germany, a map of how far was your company from Chernobyl, a map of the border and your company ,Google the distance of the town to Chernobyl how many miles, Chernobyl Hypothyroidism , Chernobyl Autoimmune disease, Chernobyl radiation zone area the was affected , ▪︎▪︎the Chernobyl accident - an epidemiological perspective this the part the talks about THYRIOD CANCER FORM CHERNOBYL. and watch also YouTube on Chernobyl radiation. then re- file your claim and submit again. Do give up all on this website will give you a feed back .remember Chernobyl radius zone over Germany and chernobyl dates also all are important."
1012,Legacy Appeals; Actual number of days from Appeal Certification,"Just my $.02 I've been in the VA system for a decade. Had an incredibly complicated TBI claim. I've been a patient in the VA Poly Trauma unit for 8 years. Every one of my Dr's had notes detailing the tests, treatment, and diagnosis. These items were the basis of a successful claim. .eventually, from the VBA. I never had a Nexus letter written by one of my Dr's. I'm not exactly sure what additional value they would have provided when everything was in their notes. . .again. My $.02. ."
