# Show the pairwise document similarity between the input data sets and print the most common terms in each document

In [7]:
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re

def clean_text(raw_text, remove_stopwords=False, output_format="string"):
    """
    Strip HTML markup and every non-letter character from raw_text,
    lowercase the result and split it into words.

    Input:
            raw_text: raw text from input (may contain HTML markup)
            remove_stopwords: a boolean variable to indicate whether to remove
                              NLTK's English stop words
            output_format: if "string", return the cleaned words joined by
                           single spaces
                           if "list", return the list of cleaned words
    Output:
            Cleaned string or list.
    Raises:
            ValueError: if output_format is neither "string" nor "list".
    """

    # Fail fast on an unsupported format instead of silently returning None
    # after all the cleaning work has been done.
    if output_format not in ("string", "list"):
        raise ValueError(
            'output_format must be "string" or "list", got %r' % (output_format,))

    # Remove HTML markup
    markup_free = BeautifulSoup(raw_text, "lxml").get_text()

    # Keep only ASCII letters; every other character becomes a space
    letters_only = re.sub(r"[^a-zA-Z]", " ", markup_free)

    # Lowercase, then split on whitespace into a list of words
    words = letters_only.lower().split()

    if remove_stopwords:
        # Use set as it has O(1) lookup time
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # Return a cleaned string or list
    if output_format == "string":
        return " ".join(words)
    return words




In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Input corpora to compare: one comment dump and three tweet collections.
text_files = ['ucb_comments.csv',
              '../training_data/hope_tweets_test.txt',
              '../training_data/depressed_tweets_test.txt',
              '../training_data/fact_tweets_test.txt']

# Read and clean each file as one document; the with-block closes every
# file handle as soon as its contents have been read.
documents = []
for path in text_files:
    with open(path) as source:
        documents.append(clean_text(source.read()))

tfidf = TfidfVectorizer().fit_transform(documents)
# no need to normalize, since Vectorizer will return normalized tf-idf
pairwise_similarity = tfidf * tfidf.T
# Single-argument print() emits the same text under Python 2 and Python 3
print(pairwise_similarity)


  (0, 1)	0.757040959382
  (0, 2)	0.842257819759
  (0, 3)	0.685120659518
  (0, 0)	1.0
  (1, 0)	0.757040959382
  (1, 2)	0.854170354095
  (1, 3)	0.821569594602
  (1, 1)	1.0
  (2, 0)	0.842257819759
  (2, 1)	0.854170354095
  (2, 3)	0.78159738371
  (2, 2)	1.0
  (3, 0)	0.685120659518
  (3, 1)	0.821569594602
  (3, 2)	0.78159738371
  (3, 3)	1.0


In [9]:

# For every cleaned document, count term frequencies with stop words removed
# and print the 50 most frequent terms next to their occurrence counts.
for name, doc in zip(text_files, documents):
    wordlist = nltk.FreqDist(nltk.word_tokenize(clean_text(doc, True)))
    # Single-argument print() calls produce the same output under Python 2
    # and Python 3; the extra " " reproduces the separator the print
    # statement used to insert between its two arguments.
    print("\nmost common document terms/occurrences for " + " " + name)
    for term, count in wordlist.most_common(50):
        print("%s %s" % (term, count))


most common document terms/occurrences for  ucb_comments.csv
epilepsy 868
seizures 865
seizure 627
one 468
get 444
years 425
know 370
like 352
sleep 348
since 315
time 312
people 311
would 283
life 253
never 249
day 243
go 216
help 215
always 214
son 209
still 209
also 203
work 203
old 185
even 183
meds 183
take 179
going 179
brain 176
right 171
mal 170
good 170
partial 166
free 164
memory 163
feel 162
daughter 161
drive 159
grand 158
remember 156
back 154
need 153
many 152
family 151
could 150
thank 149
year 147
much 146
really 145
god 143

most common document terms/occurrences for  ../training_data/hope_tweets_test.txt
day 58
hope 51
god 30
people 28
tomorrow 26
try 25
today 23
end 23
good 23
life 23
one 22
give 21
sometimes 21
lord 20
voice 20
going 19
never 18
always 16
us 16
even 15
new 15
love 15
must 14
saying 14
get 14
courage 14
see 14
quiet 14
mary 14
things 14
like 14
radmacher 14
anne 14
live 13
thing 13
better 13
need 13
make 13
let 12
turn 12
die 11
world 11
everything 