## STEPS
Prepare csv datasets [x]<br>
- Consolidate tweets that belong to the same hospital to the same csv file [x] 
- It should end up with 20 institutional csv files and 20 individual csv files [x] 

Prepare the text data and preliminary analysis [x]<br>
- Import csv datasets to strings [x]<br>
- Tokenize the strings [x]<br>
- Filter out common words [x]<br>
- Filter out stop words [x]<br>
- Produce distribution information [x]<br>
- Wrap it in a function [x]<br>

Generate a raw text file for plotting a word cloud in R [x]<br>
- Write a function to produce a raw text file for each hospital [x]<br>
- Combine all raw files [x]<br>

## Requirements

In [62]:
import pandas as pd
import re
import glob
import nltk
from nltk import word_tokenize
import csv

## Prepare csv datasets

In [63]:
# Institutional tweet csv files by hospital
# Every csv under ./twitter_data/ whose name starts with one of these prefixes is merged
# into ./processed_data/inst_tweets_<N>.csv, where <N> is the number embedded in the prefix.
inst_file_list = ['MC_offi_1_', 'CC_offi_2_', 'JHop_offi_3_', 'MGH_offi_4_', 'UCSF_offi_5_', 'UnivMich_offi_6_',
                 'NYPH_offi_8_', 'Standford_offi_9_', 'UnivPenn_offi_10_', 'BJH_offi_12_', 'NMH_offi_13_',
                 'UPMC_offi_14_', 'UnivColo_offi_15_', 'TJ_Univ_offi_16_', 'Duke_offi_17_', 'MntSinai_offi_18_',
                 'NYUlang_offi_19_', 'MCP_offi_20_']
for inst_file in inst_file_list:
    # glob returns matches in arbitrary order; sorting makes the row order of the
    # merged csv reproducible from run to run.
    inst_files = sorted(glob.glob('./twitter_data/' + inst_file + '*.csv'))
    if not inst_files:
        # As before, no output file is written for a prefix without matches,
        # but the gap is now reported instead of passing silently.
        print('No csv files found for prefix ' + inst_file)
        continue
    number_match = re.search(r'\d+', inst_file)  # first run of digits in the prefix
    csv_name = 'inst_tweets_' + str(number_match.group(0)) + '.csv'
    new_dataframe = pd.DataFrame()
    for file in inst_files:
        data = pd.read_csv(file)
        # Only the 'tweet' column is kept. Files are appended one at a time so the
        # layout of the saved csv stays the same as before.
        new_dataframe = pd.concat([new_dataframe, data['tweet']], axis = 0)
    # Write once, after every file has been merged (previously the growing file
    # was rewritten on each pass of the inner loop).
    new_dataframe.to_csv(path_or_buf = './processed_data/' + csv_name)

In [64]:
# Individual tweet csv files by hospital
# Every csv under ./twitter_data/ whose name starts with one of these prefixes is merged
# into ./processed_data/indi_tweets_<N>.csv, where <N> is the number embedded in the prefix.
# NOTE(review): 'CSM_indi_11' has no trailing underscore, unlike every other prefix, so its
# glob pattern is 'CSM_indi_11*.csv' - confirm this matches the intended files only.
indi_file_list = ['MC_indi_1_', 'CC_indi_2_', 'JHop_indi_3_', 'MGH_indi_4_', 'UCSF_indi_5_', 'UnivMich_indi_6_',
                  'UCLA_indi_7_', 'Standford_indi_9_', 'UnivPenn_indi_10_', 'CSM_indi_11', 'BJH_indi_12_',
                  'NMH_indi_13_', 'UPMC_indi_14_', 'UnivColo_indi_15_', 'TJ_Univ_indi_16_', 'Duke_indi_17_',
                  'MntSinai_indi_18_', 'NYUlang_indi_19_', 'MCP_indi_20_']
for indi_file in indi_file_list:
    # glob returns matches in arbitrary order; sorting makes the row order of the
    # merged csv reproducible from run to run.
    indi_files = sorted(glob.glob('./twitter_data/' + indi_file + '*.csv'))
    if not indi_files:
        # As before, no output file is written for a prefix without matches,
        # but the gap is now reported instead of passing silently.
        print('No csv files found for prefix ' + indi_file)
        continue
    number_match = re.search(r'\d+', indi_file)  # first run of digits in the prefix
    csv_name = 'indi_tweets_' + str(number_match.group(0)) + '.csv'
    new_dataframe = pd.DataFrame()
    for file in indi_files:
        data = pd.read_csv(file)
        # Only the 'tweet' column is kept. Files are appended one at a time so the
        # layout of the saved csv stays the same as before.
        new_dataframe = pd.concat([new_dataframe, data['tweet']], axis = 0)
    # Write once, after every file has been merged (previously the growing file
    # was rewritten on each pass of the inner loop).
    new_dataframe.to_csv(path_or_buf = './processed_data/' + csv_name)

## Prepare the text data and preliminary analysis

In [65]:
def word_dist_in_pct(tweet_filename, top_n=10):
    '''
    Return the most frequently used words of a processed tweet file as a dataframe.

    The file ./processed_data/<tweet_filename>.csv is read as plain text (it is not
    parsed as csv, so everything in the file is tokenized). Tokens are lower-cased and
    kept only if they appear among the lower-case entries of nltk's English word list
    and are not English stop words.

    Parameters
    ----------
    tweet_filename : str
        File name without directory and without the '.csv' extension,
        e.g. 'inst_tweets_1'.
    top_n : int, optional
        Maximum number of rows to return. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with columns 'freq' (occurrence count) and 'pct'
        (share of all kept words, in percent), sorted by 'pct' in descending
        order and cut to at most top_n rows.
    '''
    # The context manager closes the handle; utf-8 is what pandas' to_csv writes by default.
    with open('./processed_data/' + tweet_filename + '.csv', encoding='utf-8') as file:
        raw = file.read()
    tokens = word_tokenize(raw) # Tokenize the text data
    words = [w.lower() for w in tokens] # Normalize all the words to lowercase
    # Sets instead of lists: membership is tested once per token, and a list lookup
    # would scan the whole vocabulary every time.
    wordlist = {w for w in nltk.corpus.words.words('en') if w.islower()} # Correctly spelt vocabulary
    stop_words = set(nltk.corpus.stopwords.words('english')) # All the stop words
    # Screen out incorrectly spelt words and stop words
    standard_word = [w for w in words if w in wordlist and w not in stop_words]
    fdist = nltk.FreqDist(standard_word) # Frequency distribution from the nltk package
    dist_table = pd.DataFrame.from_dict(fdist, orient='index', columns = ['freq']) # Convert the table to a dataframe
    dist_table['pct'] = (dist_table['freq']/len(standard_word)) * 100 # Calculate percentage
    sorted_dist_in_pct = dist_table.sort_values(by = ['pct'], ascending = False)
    return sorted_dist_in_pct[0:top_n]

#### 01 Mayo Clinic, Rochester, Minnesota Institutional Twitter Account

In [66]:
# Top-10 word table built from ./processed_data/inst_tweets_1.csv; the bare name below displays it.
MC_offi_1 = word_dist_in_pct('inst_tweets_1')
MC_offi_1

Unnamed: 0,freq,pct
clinic,1267,2.396671
learn,497,0.940131
cancer,496,0.938239
new,454,0.858791
health,418,0.790693
via,408,0.771777
use,391,0.73962
care,335,0.63369
help,316,0.597749
news,285,0.539109


#### 02 Cleveland Clinic Institutional Twitter Account

In [67]:
# Top-10 word table built from ./processed_data/inst_tweets_2.csv; the bare name below displays it.
CC_offi_2 = word_dist_in_pct('inst_tweets_2')
CC_offi_2

Unnamed: 0,freq,pct
new,532,1.562225
cancer,314,0.922065
heart,253,0.742938
research,253,0.742938
learn,250,0.734128
help,240,0.704763
health,238,0.69889
study,237,0.695953
disease,234,0.687144
may,203,0.596112


#### 03 Johns Hopkins Hospital, Baltimore Institutional Twitter Account

In [68]:
# Top-10 word table built from ./processed_data/inst_tweets_3.csv; the bare name below displays it.
JHop_offi_3 = word_dist_in_pct('inst_tweets_3')
JHop_offi_3

Unnamed: 0,freq,pct
cancer,145,0.973612
learn,128,0.859464
research,117,0.785604
new,117,0.785604
health,113,0.758746
us,96,0.644598
may,95,0.637884
great,85,0.570738
care,74,0.496878
risk,74,0.496878


#### 04 Massachusetts General Hospital Institutional Twitter Account

In [69]:
# Top-10 word table built from ./processed_data/inst_tweets_4.csv; the bare name below displays it.
MGH_offi_4 = word_dist_in_pct('inst_tweets_4')
MGH_offi_4

Unnamed: 0,freq,pct
new,1111,1.120637
research,1070,1.079282
health,905,0.912851
cancer,780,0.786766
today,688,0.693968
learn,649,0.65463
care,645,0.650595
help,641,0.64656
us,626,0.63143
center,622,0.627396


#### 05 UCSF Medical Center, San Francisco Institutional Twitter Account

In [70]:
# Top-10 word table built from ./processed_data/inst_tweets_5.csv; the bare name below displays it.
UCSF_offi_5 = word_dist_in_pct('inst_tweets_5')
UCSF_offi_5

Unnamed: 0,freq,pct
via,154,2.662978
study,89,1.538994
new,88,1.521702
health,64,1.106692
help,50,0.864603
care,46,0.795435
could,46,0.795435
cancer,46,0.795435
people,45,0.778143
research,44,0.760851


#### 06 University of Michigan Hospitals and Health Centers, Ann Arbor Institutional Twitter Account

In [71]:
# Top-10 word table built from ./processed_data/inst_tweets_6.csv; the bare name below displays it.
UnivMich_offi_6 = word_dist_in_pct('inst_tweets_6')
UnivMich_offi_6

Unnamed: 0,freq,pct
new,233,1.560094
research,156,1.044526
medicine,149,0.997657
michigan,127,0.850352
cancer,107,0.716438
help,92,0.616003
care,91,0.609307
health,86,0.575829
us,85,0.569133
one,84,0.562437


#### 08 New York-Presbyterian Hospital, New York Institutional Twitter Account

In [72]:
# Top-10 word table built from ./processed_data/inst_tweets_8.csv; the bare name below displays it.
NYPH_offi_8 = word_dist_in_pct('inst_tweets_8')
NYPH_offi_8

Unnamed: 0,freq,pct
care,113,1.387865
learn,103,1.265045
new,95,1.166789
us,89,1.093098
today,88,1.080816
hospital,80,0.98256
cancer,74,0.908868
join,73,0.896586
health,72,0.884304
see,69,0.847458


#### 09 Stanford Health Care-Stanford Hospital, Stanford, California Institutional Twitter Account

In [73]:
# Top-10 word table built from ./processed_data/inst_tweets_9.csv; the bare name below displays it.
# The 'Standford' spelling mirrors the raw-file prefix 'Standford_offi_9_'; the variable is
# referenced under this name when the raw text file is written, so it is kept as is.
Standford_offi_9 = word_dist_in_pct('inst_tweets_9')
Standford_offi_9

Unnamed: 0,freq,pct
health,457,1.419916
new,444,1.379525
care,257,0.798509
study,230,0.714619
medicine,204,0.633836
help,197,0.612086
could,191,0.593444
medical,188,0.584123
hospital,188,0.584123
via,176,0.546839


#### 10 Hospitals of the University of Pennsylvania-Penn Presbyterian, Philadelphia Institutional Twitter Account

In [74]:
# Top-10 word table built from ./processed_data/inst_tweets_10.csv; the bare name below displays it.
UnivPenn_offi_10 = word_dist_in_pct('inst_tweets_10')
UnivPenn_offi_10

Unnamed: 0,freq,pct
health,492,1.067315
new,490,1.062976
medicine,422,0.915461
cancer,415,0.900276
learn,408,0.88509
care,300,0.650802
today,292,0.633447
research,283,0.613923
medical,267,0.579213
center,260,0.564028


#### 12 Barnes-Jewish Hospital, St. Louis Institutional Twitter Account

In [75]:
# Top-10 word table built from ./processed_data/inst_tweets_12.csv; the bare name below displays it.
BJH_offi_12 = word_dist_in_pct('inst_tweets_12')
BJH_offi_12

Unnamed: 0,freq,pct
hospital,32,2.330663
free,27,1.966497
replacement,26,1.893664
construction,25,1.82083
watch,24,1.747997
new,19,1.383831
screening,18,1.310998
program,15,1.092498
steel,14,1.019665
click,13,0.946832


#### 13 Northwestern Memorial Hospital, Chicago Institutional Twitter Account

In [76]:
# Top-10 word table built from ./processed_data/inst_tweets_13.csv; the bare name below displays it.
NMH_offi_13 = word_dist_in_pct('inst_tweets_13')
NMH_offi_13

Unnamed: 0,freq,pct
northwestern,162,2.194527
medicine,120,1.625576
hospital,105,1.422379
new,70,0.948253
learn,70,0.948253
today,62,0.839881
health,53,0.717963
cancer,51,0.69087
one,48,0.65023
care,47,0.636684


#### 14 UPMC Presbyterian Shadyside, Pittsburgh Institutional Twitter Account

In [77]:
# Top-10 word table built from ./processed_data/inst_tweets_14.csv; the bare name below displays it.
UPMC_offi_14 = word_dist_in_pct('inst_tweets_14')
UPMC_offi_14

Unnamed: 0,freq,pct
health,746,1.343128
care,629,1.132476
learn,615,1.10727
new,491,0.884016
cancer,422,0.759785
today,380,0.684167
via,370,0.666163
us,360,0.648158
help,359,0.646358
center,318,0.57254


#### 15 University of Colorado Hospital, Aurora Institutional Twitter Account

In [78]:
# Top-10 word table built from ./processed_data/inst_tweets_15.csv; the bare name below displays it.
UnivColo_offi_15 = word_dist_in_pct('inst_tweets_15')
UnivColo_offi_15

Unnamed: 0,freq,pct
medicine,35,3.286385
medical,26,2.441315
wilderness,20,1.877934
school,17,1.596244
colorado,17,1.596244
emergency,14,1.314554
new,14,1.314554
us,11,1.032864
check,11,1.032864
experience,11,1.032864


#### 16 Thomas Jefferson University Hospitals, Philadelphia Institutional Twitter Account

In [79]:
# Top-10 word table built from ./processed_data/inst_tweets_16.csv; the bare name below displays it.
TJ_Univ_offi_16 = word_dist_in_pct('inst_tweets_16')
TJ_Univ_offi_16

Unnamed: 0,freq,pct
health,679,2.012985
cancer,457,1.354837
new,415,1.230322
learn,391,1.159171
us,324,0.960541
today,320,0.948682
care,312,0.924965
join,301,0.892354
center,300,0.88939
hospital,219,0.649254


#### 17 Duke University Hospital, Durham, North Carolina Institutional Twitter Account

In [80]:
# Top-10 word table built from ./processed_data/inst_tweets_17.csv; the bare name below displays it.
Duke_offi_17 = word_dist_in_pct('inst_tweets_17')
Duke_offi_17

Unnamed: 0,freq,pct
duke,127,2.404392
new,74,1.400984
help,33,0.624763
world,29,0.549034
first,29,0.549034
part,27,0.51117
learn,27,0.51117
research,27,0.51117
see,26,0.492238
read,23,0.435441


#### 18 Mount Sinai Hospital, New York Institutional Twitter Account

In [81]:
# Top-10 word table built from ./processed_data/inst_tweets_18.csv; the bare name below displays it.
# NOTE(review): the displayed counts are very small (highest frequency is 3), so these
# percentages rest on few words - confirm the source data for this account is complete.
MntSinai_offi_18 = word_dist_in_pct('inst_tweets_18')
MntSinai_offi_18

Unnamed: 0,freq,pct
new,3,2.307692
day,2,1.538462
ask,2,1.538462
intern,2,1.538462
hospital,2,1.538462
mount,2,1.538462
symposium,2,1.538462
value,2,1.538462
year,2,1.538462
communication,2,1.538462


#### 19 NYU Langone Medical Center, New York Institutional Twitter Account

In [82]:
# Top-10 word table built from ./processed_data/inst_tweets_19.csv; the bare name below displays it.
NYUlang_offi_19 = word_dist_in_pct('inst_tweets_19')
NYUlang_offi_19

Unnamed: 0,freq,pct
health,334,2.031136
show,244,1.483824
live,230,1.398686
new,202,1.228412
us,148,0.900024
talk,134,0.814887
talking,115,0.699343
call,108,0.656775
tune,107,0.650693
today,107,0.650693


#### 20 Mayo Clinic Phoenix Institutional Twitter Account

In [83]:
# Top-10 word table built from ./processed_data/inst_tweets_20.csv; the bare name below displays it.
MCP_offi_20 = word_dist_in_pct('inst_tweets_20')
MCP_offi_20

Unnamed: 0,freq,pct
clinic,213,2.847213
via,151,2.018447
learn,89,1.189681
care,70,0.935704
health,68,0.908969
heart,68,0.908969
new,61,0.815399
cancer,58,0.775297
research,56,0.748563
help,48,0.641625


#### Recreate a raw text file with all the words

In [84]:
def create_raw_text(filename, top_n=10):
    '''
    Expand a word-frequency table back into raw text for a word cloud.

    Each word is written out as many times as its frequency, every occurrence
    followed by a single space.

    Parameters
    ----------
    filename : pandas.DataFrame
        Despite the name, this is a frequency table (not a path): indexed by
        word, with a 'freq' column holding occurrence counts.
    top_n : int, optional
        Only the first top_n rows are expanded. Defaults to 10. Tables with
        fewer rows are expanded in full instead of raising an error.

    Returns
    -------
    str
        The repeated words in table order; an empty string for an empty table.
    '''
    # Positional slicing never runs past the end of a short table, and it avoids
    # integer look-ups on a word-labelled index.
    counts = filename['freq'].iloc[:top_n]
    return ''.join((str(word) + ' ') * int(freq) for word, freq in counts.items())

In [85]:
# Frequency tables of the institutional accounts, in hospital order.
inst_list = [MC_offi_1, CC_offi_2, JHop_offi_3, MGH_offi_4, UCSF_offi_5, UnivMich_offi_6, NYPH_offi_8,
             Standford_offi_9, UnivPenn_offi_10, BJH_offi_12, NMH_offi_13, UPMC_offi_14, UnivColo_offi_15,
             TJ_Univ_offi_16, Duke_offi_17, MntSinai_offi_18, NYUlang_offi_19, MCP_offi_20]
# The list is built before the file is opened, so a missing table no longer leaves a
# truncated, still-open file behind. The context manager closes the file even if
# create_raw_text raises part-way through.
with open('./inst_raw_text.txt', 'w') as file:
    for top_words in inst_list:
        # The expanded texts are written back to back; each one ends with a space.
        file.write(create_raw_text(top_words))

#### 01 Mayo Clinic, Rochester, Minnesota Individual Twitter Account

In [86]:
# Top-10 word table built from ./processed_data/indi_tweets_1.csv; the bare name below displays it.
MC_indi_1 = word_dist_in_pct('indi_tweets_1')
MC_indi_1

Unnamed: 0,freq,pct
mindfulness,34,1.56682
us,28,1.290323
life,24,1.105991
help,21,0.967742
compassion,20,0.921659
brain,19,0.875576
day,18,0.829493
time,18,0.829493
resilience,17,0.78341
every,15,0.691244


#### 02 Cleveland Clinic Individual Twitter Account

In [87]:
# Top-10 word table built from ./processed_data/indi_tweets_2.csv; the bare name below displays it.
# NOTE(review): the displayed top words (owner, manual, rate, weekly, ...) look unlike those of
# the other accounts - worth checking the source tweets for repeated, templated posts.
CC_indi_2 = word_dist_in_pct('indi_tweets_2')
CC_indi_2

Unnamed: 0,freq,pct
owner,430,2.77706
manual,415,2.680186
us,375,2.421855
please,345,2.228106
rate,334,2.157065
week,330,2.131232
weekly,312,2.014983
ca,290,1.872901
wrong,290,1.872901
hope,282,1.821235


#### 03 Johns Hopkins Hospital, Baltimore Individual Twitter Account

In [88]:
# Top-10 word table built from ./processed_data/indi_tweets_3.csv; the bare name below displays it.
JHop_indi_3 = word_dist_in_pct('indi_tweets_3')
JHop_indi_3

Unnamed: 0,freq,pct
great,57,1.324042
trauma,35,0.813008
care,35,0.813008
cancer,34,0.789779
thanks,32,0.743322
see,26,0.603949
bladder,26,0.603949
surgery,25,0.58072
us,24,0.557491
work,23,0.534262


#### 04 Massachusetts General Hospital Individual Twitter Account

In [89]:
# Top-10 word table built from ./processed_data/indi_tweets_4.csv; the bare name below displays it.
MGH_indi_4 = word_dist_in_pct('indi_tweets_4')
MGH_indi_4

Unnamed: 0,freq,pct
study,109,1.149304
new,107,1.128216
care,97,1.022775
health,69,0.727541
via,67,0.706453
may,53,0.558836
piece,49,0.51666
people,49,0.51666
work,49,0.51666
use,48,0.506116


#### 05 UCSF Medical Center, San Francisco Individual Twitter Account

In [90]:
# Top-10 word table built from ./processed_data/indi_tweets_5.csv; the bare name below displays it.
UCSF_indi_5 = word_dist_in_pct('indi_tweets_5')
UCSF_indi_5

Unnamed: 0,freq,pct
care,62,1.319711
great,47,1.000426
geriatrics,38,0.808855
older,35,0.744998
people,33,0.702427
health,32,0.681141
time,31,0.659855
work,28,0.595998
many,27,0.574713
need,26,0.553427


#### 06 University of Michigan Hospitals and Health Centers, Ann Arbor Individual Twitter Account

In [91]:
# Top-10 word table built from ./processed_data/indi_tweets_6.csv; the bare name below displays it.
UnivMich_indi_6 = word_dist_in_pct('indi_tweets_6')
UnivMich_indi_6

Unnamed: 0,freq,pct
cancer,134,0.919509
pathology,127,0.871475
great,82,0.562684
new,81,0.555822
people,71,0.487202
cell,67,0.459754
prostate,62,0.425444
day,62,0.425444
one,62,0.425444
church,61,0.418582


#### 07 Ronald Reagan UCLA Medical Center, Los Angeles Individual Twitter Account

In [92]:
# Top-10 word table built from ./processed_data/indi_tweets_7.csv; the bare name below displays it.
UCLA_indi_7 = word_dist_in_pct('indi_tweets_7')
UCLA_indi_7

Unnamed: 0,freq,pct
cancer,26,2.728227
disease,18,1.888772
today,18,1.888772
day,13,1.364113
batten,11,1.15425
awareness,10,1.049318
us,10,1.049318
please,9,0.944386
work,9,0.944386
meeting,9,0.944386


#### 09 Stanford Health Care-Stanford Hospital, Stanford, California Individual Twitter Account

In [93]:
# Top-10 word table built from ./processed_data/indi_tweets_9.csv; the bare name below displays it.
# The 'Standford' spelling mirrors the raw-file prefix 'Standford_indi_9_'; the variable is
# referenced under this name when the raw text file is written, so it is kept as is.
Standford_indi_9 = word_dist_in_pct('indi_tweets_9')
Standford_indi_9

Unnamed: 0,freq,pct
health,96,1.494861
care,60,0.934288
data,54,0.84086
us,50,0.778574
via,44,0.685145
clinical,41,0.63843
medical,38,0.591716
work,38,0.591716
one,37,0.576145
medicine,35,0.545002


#### 10 Hospitals of the University of Pennsylvania-Penn Presbyterian, Philadelphia Individual Twitter Account

In [94]:
# Top-10 word table built from ./processed_data/indi_tweets_10.csv; the bare name below displays it.
UnivPenn_indi_10 = word_dist_in_pct('indi_tweets_10')
UnivPenn_indi_10

Unnamed: 0,freq,pct
great,29,0.924745
new,24,0.765306
first,22,0.701531
ablation,21,0.669643
patient,18,0.57398
today,17,0.542092
one,16,0.510204
good,16,0.510204
via,15,0.478316
see,14,0.446429


#### 11 Cedars-Sinai Medical Center, Los Angeles Individual Twitter Account

In [95]:
# Top-10 word table built from ./processed_data/indi_tweets_11.csv; the bare name below displays it.
CSM_indi_11 = word_dist_in_pct('indi_tweets_11')
CSM_indi_11

Unnamed: 0,freq,pct
great,26,0.976342
work,26,0.976342
us,26,0.976342
thanks,22,0.826136
cancer,20,0.751033
get,14,0.525723
help,14,0.525723
meeting,13,0.488171
via,13,0.488171
love,12,0.45062


#### 12 Barnes-Jewish Hospital, St. Louis Individual Twitter Account

In [96]:
# Top-10 word table built from ./processed_data/indi_tweets_12.csv; the bare name below displays it.
# NOTE(review): the single letter 'w' appears in the displayed output, so one-letter tokens
# pass the vocabulary filter - decide whether they should be dropped.
BJH_indi_12 = word_dist_in_pct('indi_tweets_12')
BJH_indi_12

Unnamed: 0,freq,pct
great,6,1.704545
w,6,1.704545
meeting,4,1.136364
new,4,1.136364
prostate,4,1.136364
leadership,4,1.136364
need,3,0.852273
center,3,0.852273
leader,3,0.852273
hole,3,0.852273


#### 13 Northwestern Memorial Hospital, Chicago Individual Twitter Account

In [97]:
# Word table built from ./processed_data/indi_tweets_13.csv; the bare name below displays it.
# The displayed table has only two rows (each word occurs once), and this table is left out
# of the list that is written to the individual raw text file.
NMH_indi_13 = word_dist_in_pct('indi_tweets_13')
NMH_indi_13

Unnamed: 0,freq,pct
birthday,1,50.0
spending,1,50.0


#### 14 UPMC Presbyterian Shadyside, Pittsburgh Individual Twitter Account

In [98]:
# Top-10 word table built from ./processed_data/indi_tweets_14.csv; the bare name below displays it.
UPMC_indi_14 = word_dist_in_pct('indi_tweets_14')
UPMC_indi_14

Unnamed: 0,freq,pct
via,130,1.538097
pain,85,1.005679
week,83,0.982016
see,83,0.982016
new,66,0.78088
thank,65,0.769049
care,61,0.721723
reach,53,0.627071
great,51,0.603407
twitter,48,0.567913


#### 15 University of Colorado Hospital, Aurora Individual Twitter Account

In [99]:
# Top-10 word table built from ./processed_data/indi_tweets_15.csv; the bare name below displays it.
UnivColo_indi_15 = word_dist_in_pct('indi_tweets_15')
UnivColo_indi_15

Unnamed: 0,freq,pct
innovation,23,3.958692
great,12,2.065404
health,10,1.72117
care,10,1.72117
work,9,1.549053
digital,8,1.376936
thanks,7,1.204819
via,6,1.032702
excited,6,1.032702
virtual,5,0.860585


#### 16 Thomas Jefferson University Hospitals, Philadelphia Individual Twitter Account

In [100]:
# Top-10 word table built from ./processed_data/indi_tweets_16.csv; the bare name below displays it.
TJ_Univ_indi_16 = word_dist_in_pct('indi_tweets_16')
TJ_Univ_indi_16

Unnamed: 0,freq,pct
cancer,232,1.57139
great,138,0.934706
new,124,0.839881
health,94,0.636684
today,92,0.623137
us,79,0.535085
team,78,0.528312
care,74,0.501219
research,68,0.46058
work,60,0.406394


#### 17 Duke University Hospital, Durham, North Carolina Individual Twitter Account

In [101]:
# Top-10 word table built from ./processed_data/indi_tweets_17.csv; the bare name below displays it.
Duke_indi_17 = word_dist_in_pct('indi_tweets_17')
Duke_indi_17

Unnamed: 0,freq,pct
great,260,1.37953
duke,255,1.353
new,170,0.902
research,132,0.700377
work,121,0.642012
cancer,111,0.588953
prostate,107,0.56773
urology,99,0.525283
us,88,0.466918
team,80,0.424471


#### 18 Mount Sinai Hospital, New York Individual Twitter Account

In [102]:
# Top-10 word table built from ./processed_data/indi_tweets_18.csv; the bare name below displays it.
MntSinai_indi_18 = word_dist_in_pct('indi_tweets_18')
MntSinai_indi_18

Unnamed: 0,freq,pct
heart,82,1.584541
new,68,1.31401
great,42,0.811594
day,41,0.792271
time,37,0.714976
health,36,0.695652
us,30,0.57971
check,30,0.57971
surgery,29,0.560386
disease,27,0.521739


#### 19 NYU Langone Medical Center, New York Individual Twitter Account

In [103]:
# Top-10 word table built from ./processed_data/indi_tweets_19.csv; the bare name below displays it.
NYUlang_indi_19 = word_dist_in_pct('indi_tweets_19')
NYUlang_indi_19

Unnamed: 0,freq,pct
diabetes,621,3.101898
health,547,2.732268
diet,197,0.984016
new,177,0.884116
may,164,0.819181
risk,139,0.694306
cancer,130,0.649351
one,130,0.649351
nutrition,118,0.589411
people,116,0.579421


#### 20 Mayo Clinic Phoenix Individual Twitter Account

In [104]:
# Top-10 word table built from ./processed_data/indi_tweets_20.csv; the bare name below displays it.
MCP_indi_20 = word_dist_in_pct('indi_tweets_20')
MCP_indi_20

Unnamed: 0,freq,pct
migraine,33,6.626506
headache,14,2.811245
patient,6,1.204819
join,6,1.204819
advocacy,6,1.204819
concussion,5,1.004016
us,5,1.004016
global,5,1.004016
research,5,1.004016
day,5,1.004016


#### Recreate a raw text file with all the words

In [107]:
# Frequency tables of the individual accounts, in hospital order.
indi_list = [MC_indi_1, CC_indi_2, JHop_indi_3, MGH_indi_4, UCSF_indi_5, UnivMich_indi_6, UCLA_indi_7,
             Standford_indi_9, UnivPenn_indi_10, CSM_indi_11, BJH_indi_12, UPMC_indi_14,
             UnivColo_indi_15, TJ_Univ_indi_16, Duke_indi_17, MntSinai_indi_18, NYUlang_indi_19, MCP_indi_20]
# No. 13 is not included because there is only 1 tweet in the file
# The list is built before the file is opened, so a missing table no longer leaves a
# truncated, still-open file behind. The context manager closes the file even if
# create_raw_text raises part-way through.
with open('./indi_raw_text.txt', 'w') as file:
    for top_words in indi_list:
        # The expanded texts are written back to back; each one ends with a space.
        file.write(create_raw_text(top_words))