## Text search with English example
### Change these to reflect your data folder and search terms

In [1]:
document_folder = "documents/sotu/*.txt"

search_terms = ["war", "fight", "battle", "conflict", "clash", "defen", 
                "ceasefire", "attack", "seige", "front line", "frontline", 
                "troop", "army", "navy", "air force", "marine", "bomb"]

#This must be an excel/xlsx filename
results_filename = "results_sotu.xlsx"

#To show the full text, put "True" (capitalize 'T')
show_all_data = False 

### Install and import dependencies

In [2]:
!pip install openpyxl



In [3]:
import glob
import pandas as pd

if show_all_data is True:
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    
else:
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_columns', 20)
    pd.set_option('display.width', 50)
    pd.set_option('display.max_colwidth', 500)

### Scan and load all text files

In [4]:
glob.glob(document_folder)

['documents/sotu/1912_Taft.txt',
 'documents/sotu/1907_Roosevelt.txt',
 'documents/sotu/1899_McKinley.txt',
 'documents/sotu/1913_Wilson.txt',
 'documents/sotu/1858_Buchanan.txt',
 'documents/sotu/1847_Polk.txt',
 'documents/sotu/1916_Wilson.txt',
 'documents/sotu/1928_Coolidge.txt',
 'documents/sotu/1904_Roosevelt.txt',
 'documents/sotu/1985_Reagan.txt',
 'documents/sotu/1936_Roosevelt.txt',
 'documents/sotu/1999_Clinton.txt',
 'documents/sotu/1938_Roosevelt.txt',
 'documents/sotu/1791_Washington.txt',
 'documents/sotu/1932_Hoover.txt',
 'documents/sotu/1921_Harding.txt',
 'documents/sotu/1799_Adams.txt',
 'documents/sotu/1815_Madison.txt',
 'documents/sotu/1934_Roosevelt.txt',
 'documents/sotu/1819_Monroe.txt',
 'documents/sotu/1948_Truman.txt',
 'documents/sotu/2017_Trump.txt',
 'documents/sotu/1931_Hoover.txt',
 'documents/sotu/1883_Arthur.txt',
 'documents/sotu/1872_Grant.txt',
 'documents/sotu/2006_Bush.txt',
 'documents/sotu/1811_Madison.txt',
 'documents/sotu/2002_Bush.txt',
 '

In [5]:
all_texts = {}

for filename in glob.glob(document_folder):
    with open(filename) as file:
        all_texts[filename] = file.read().lower()

### Convert to pandas dataframe

In [6]:
all_texts_df = pd.DataFrame(all_texts,index=[0]).T
all_texts_df.columns = ["text"]
all_texts_df.sample(5)

Unnamed: 0,text
documents/sotu/1833_Jackson.txt,"fellow citizens of the senate and of the house of representatives:\n\non your assembling to perform the high trusts which the people of the\nunited states have confided to you, of legislating for their common\nwelfare, it gives me pleasure to congratulate you upon the happy\ncondition of our beloved country. by the favor of divine providence\nhealth is again restored to us, peace reigns within our borders,\nabundance crowns the labors of our fields, commerce and domestic\nindustry flourish a..."
documents/sotu/1813_Madison.txt,"fellow-citizens of the senate and house of representatives:\n\nin meeting you at the present interesting conjuncture it would have been\nhighly satisfactory if i could have communicated a favorable result to the\nmission charged with negotiations for restoring peace. it was a just\nexpectation, from the respect due to the distinguished sovereign who had\ninvited them by his offer of mediation, from the readiness with which the\ninvitation was accepted on the part of the united states, and fr..."
documents/sotu/1791_Washington.txt,"fellow-citizens of the senate and house of representatives:\n\n""in vain may we expect peace with the indians on our frontiers so long as a\nlawless set of unprincipled wretches can violate the rights of hospitality,\nor infringe the most solemn treaties, without receiving the punishment they\nso justly merit.""\n\ni meet you upon the present occasion with the feelings which are naturally\ninspired by a strong impression of the prosperous situations of our common\ncountry, and by a persuasion ..."
documents/sotu/1819_Monroe.txt,"fellow-citizens of the senate and house of representatives:\n\nthe public buildings being advanced to a stage to afford accommodation for\ncongress, i offer you my sincere congratulations on the recommencement of\nyour duties in the capitol.\n\nin bringing you to view the incidents most deserving attention which have\noccurred since your last session, i regret to have to state that several of\nour principal cities have suffered by sickness, that an unusual drought has\nprevailed in the middl..."
documents/sotu/1906_Roosevelt.txt,"to the senate and house of representatives:\n\nas a nation we still continue to enjoy a literally unprecedented\nprosperity; and it is probable that only reckless speculation and\ndisregard of legitimate business methods on the part of the business\nworld can materially mar this prosperity.\n\nno congress in our time has done more good work of importance than the\npresent congress. there were several matters left unfinished at your\nlast session, however, which i most earnestly hope you will..."


### Calculate word counts

In [7]:
def word_count(text):
    return len(text.split(" "))

In [8]:
all_texts_df['text'].apply(word_count)[0:5]

documents/sotu/1912_Taft.txt         22933
documents/sotu/1907_Roosevelt.txt    24968
documents/sotu/1899_McKinley.txt     13779
documents/sotu/1913_Wilson.txt        3265
documents/sotu/1858_Buchanan.txt     14946
Name: text, dtype: int64

In [9]:
all_texts_df['word count'] = all_texts_df['text'].apply(word_count)
all_texts_df.sample(5)

Unnamed: 0,text,word count
documents/sotu/1817_Monroe.txt,"fellow-citizens of the senate and house of representatives:\n\nat no period of our political existence had we so much cause to felicitate\nourselves at the prosperous and happy condition of our country. the\nabundant fruits of the earth have filled it with plenty. an extensive and\nprofitable commerce has greatly augmented our revenue. the public credit\nhas attained an extraordinary elevation. our preparations for defense in\ncase of future wars, from which, by the experience of all nations...",4045
documents/sotu/1892_Harrison.txt,"to the senate and house of representatives:\n\nin submitting my annual message to congress i have great satisfaction in\nbeing able to say that the general conditions affecting the commercial and\nindustrial interests of the united states are in the highest degree\nfavorable. a comparison of the existing conditions with those of the most\nfavored period in the history of the country will, i believe, show that so\nhigh a degree of prosperity and so general a diffusion of the comforts of\nlife...",12510
documents/sotu/1864_Lincoln.txt,"fellow-citizens of the senate and house of representatives:\n\nagain the blessings of health and abundant harvests claim our\nprofoundest gratitude to almighty god.\n\nthe condition of our foreign affairs is reasonably satisfactory.\n\nmexico continues to be a theater of civil war. while our political\nrelations with that country have undergone no change, we have at the\nsame time strictly maintained neutrality between the belligerents.\n\nat the request of the states of costa rica and nicar...",5424
documents/sotu/1925_Coolidge.txt,"members of the congress:\n\nin meeting the constitutional requirement of informing the congress upon\nthe state of the union, it is exceedingly gratifying to report that the\ngeneral condition is one of progress and prosperity. here and there are\ncomparatively small and apparently temporary difficulties needing\nadjustment and improved administrative methods, such as are always to be\nexpected, but in the fundamentals of government and business the results\ndemonstrate that we are going in ...",9870
documents/sotu/1861_Lincoln.txt,"fellow-citizens of the senate and house of representatives:\n\nin the midst of unprecedented political troubles we have cause of great\ngratitude to god for unusual good health and most abundant harvests.\n\nyou will not be surprised to learn that in the peculiar exigencies of\nthe times our intercourse with foreign nations has been attended with\nprofound solicitude, chiefly turning upon our own domestic affairs.\n\na disloyal portion of the american people have during the whole year\nbeen ...",6340


### Calculate counts for search terms

In [10]:
search_terms

['war',
 'fight',
 'battle',
 'conflict',
 'clash',
 'defen',
 'ceasefire',
 'attack',
 'seige',
 'front line',
 'frontline',
 'troop',
 'army',
 'navy',
 'air force',
 'marine',
 'bomb']

In [11]:
for term in search_terms:
    all_texts_df[term] = ""
    
all_texts_df.sample(5)

Unnamed: 0,text,word count,war,fight,battle,conflict,clash,defen,ceasefire,attack,seige,front line,frontline,troop,army,navy,air force,marine,bomb
documents/sotu/1896_Cleveland.txt,"to the congress of the united states:\n\nas representatives of the people in the legislative branch of their\ngovernment, you have assembled at a time when the strength and excellence of\nour free institutions and the fitness of our citizens to enjoy popular rule\nhave been again made manifest. a political contest involving momentous\nconsequences, fraught with feverish apprehension, and creating aggressiveness\nso intense as to approach bitterness and passion has been waged throughout our\n...",14109,,,,,,,,,,,,,,,,,
documents/sotu/1811_Madison.txt,"fellow-citizens of the senate and house of representatives:\n\nin calling you together sooner than a separation from your homes would\notherwise have been required i yielded to considerations drawn from the\nposture of our foreign affairs, and in fixing the present for the time of\nyour meeting regard was had to the probability of further developments of\nthe policy of the belligerent powers toward this country which might the\nmore unite the national councils in the measures to be pursued.\...",2072,,,,,,,,,,,,,,,,,
documents/sotu/1883_Arthur.txt,"to the congress of the united states:\n\nat the threshold of your deliberations i congratulate you upon the\nfavorable aspect of the domestic and foreign affairs of this government.\n\nour relations with other countries continue to be upon a friendly footing.\nwith the argentine republic, austria, belgium, brazil, denmark, hayti,\nitaly, santo domingo, and sweden and norway no incident has occurred which\ncalls for special comment. the recent opening of new lines of telegraphic\ncommunicatio...",3413,,,,,,,,,,,,,,,,,
documents/sotu/1920_Wilson.txt,"gentlemen of the congress:\n\nwhen i addressed myself to performing the duty laid upon the president by\nthe constitution to present to you an annual report on the state of the\nunion, i found my thought dominated by an immortal sentence of abraham\nlincoln's--""let us have faith that right makes might, and in that faith let\nus dare to do our duty as we understand it""--a sentence immortal because it\nembodies in a form of utter simplicity and purity the essential faith of\nthe nation, the fa...",2474,,,,,,,,,,,,,,,,,
documents/sotu/1869_Grant.txt,"to the senate and house of representatives:\n\nin coming before you for the first time as chief magistrate of this great\nnation, it is with gratitude to the giver of all good for the many benefits\nwe enjoy. we are blessed with peace at home, and are without entangling\nalliances abroad to forebode trouble; with a territory unsurpassed in\nfertility, of an area equal to the abundant support of 500,000,000 people,\nand abounding in every variety of useful mineral in quantity sufficient to\ns...",7040,,,,,,,,,,,,,,,,,


In [12]:
for filename, text in all_texts_df.iterrows():
    #print(filename)
    for term in search_terms:
        #print(term, text[0].count(term))
        all_texts_df.loc[filename,term] = text[0].count(term)

In [13]:
all_texts_df.sample(5)

Unnamed: 0,text,word count,war,fight,battle,conflict,clash,defen,ceasefire,attack,seige,front line,frontline,troop,army,navy,air force,marine,bomb
documents/sotu/1845_Polk.txt,"fellow-citizens of the senate and of the house of representatives:\n\nit is to me a source of unaffected satisfaction to meet the representatives\nof the states and the people in congress assembled, as it will be to\nreceive the aid of their combined wisdom in the administration of public\naffairs. in performing for the first time the duty imposed on me by the\nconstitution of giving to you information of the state of the union and\nrecommending to your consideration such measures as in my j...",14755,35,0,0,1,0,16,0,2,0,0,0,3,10,8,0,0,0
documents/sotu/1988_Reagan.txt,"mr. speaker, mr. president, and distinguished members of the house and\nsenate: when we first met here 7 years ago--many of us for the first\ntime--it was with the hope of beginning something new for america. we meet\nhere tonight in this historic chamber to continue that work. if anyone\nexpects just a proud recitation of the accomplishments of my\nadministration, i say let's leave that to history; we're not finished yet.\nso, my message to you tonight is put on your work shoes; we're still...",4443,14,6,1,0,0,12,0,1,0,0,0,1,1,0,0,0,0
documents/sotu/1870_Grant.txt,"to the senate and house of representatives:\n\na year of peace and general prosperity to this nation has passed since the\nlast assembling of congress. we have, through a kind providence, been\nblessed with abundant crops, and have been spared from complications and\nwar with foreign nations. in our midst comparative harmony has been\nrestored. it is to be regretted, however, that a free exercise of the\nelective franchise has by violence and intimidation been denied to citizens\nin exceptio...",7994,31,0,0,1,0,0,0,0,0,0,0,0,6,6,0,4,0
documents/sotu/1878_Hayes.txt,"fellow-citizens of the senate and house of representatives:\n\nour heartfelt gratitude is due to the divine being who holds in his hands\nthe destinies of nations for the continued bestowal during the last year of\ncountless blessings upon our country.\n\nwe are at peace with all other nations. our public credit has greatly\nimproved, and is perhaps now stronger than ever before. abundant harvests\nhave rewarded the labors of those who till the soil, our manufacturing\nindustries are revivin...",7181,14,0,0,2,0,1,0,0,0,0,0,3,5,2,0,1,0
documents/sotu/1806_Jefferson.txt,"the senate and house of representatives of the united states:\n\nit would have given me, fellow citizens, great satisfaction to announce in\nthe moment of your meeting that the difficulties in our foreign relations\nexisting at the time of your last separation had been amicably and justly\nterminated. i lost no time in taking those measures which were most likely\nto bring them to such a termination--by special missions charged with such\npowers and instructions as in the event of failure co...",2622,14,0,0,0,0,6,0,0,0,0,0,1,0,0,0,0,0


### Counts of all keywords

In [14]:
all_texts_df.iloc[:,2:].sample(5)

Unnamed: 0,war,fight,battle,conflict,clash,defen,ceasefire,attack,seige,front line,frontline,troop,army,navy,air force,marine,bomb
documents/sotu/1814_Madison.txt,17,0,0,0,0,4,0,2,0,0,0,2,1,0,0,0,0
documents/sotu/1791_Washington.txt,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
documents/sotu/1845_Polk.txt,35,0,0,1,0,16,0,2,0,0,0,3,10,8,0,0,0
documents/sotu/1906_Roosevelt.txt,69,5,5,5,0,14,0,6,0,0,0,6,15,24,0,2,0
documents/sotu/1821_Monroe.txt,2,0,0,0,0,3,0,0,0,0,0,1,0,3,0,0,0


In [15]:
all_texts_df.iloc[:,2:].apply(sum,axis=1)[0:5]

documents/sotu/1912_Taft.txt         150
documents/sotu/1907_Roosevelt.txt    179
documents/sotu/1899_McKinley.txt      73
documents/sotu/1913_Wilson.txt         8
documents/sotu/1858_Buchanan.txt      56
dtype: int64

In [16]:
all_texts_df['all keyword count'] = all_texts_df.iloc[:,2:].apply(sum,axis=1)
all_texts_df.sample(5)

Unnamed: 0,text,word count,war,fight,battle,conflict,clash,defen,ceasefire,attack,seige,front line,frontline,troop,army,navy,air force,marine,bomb,all keyword count
documents/sotu/1883_Arthur.txt,"to the congress of the united states:\n\nat the threshold of your deliberations i congratulate you upon the\nfavorable aspect of the domestic and foreign affairs of this government.\n\nour relations with other countries continue to be upon a friendly footing.\nwith the argentine republic, austria, belgium, brazil, denmark, hayti,\nitaly, santo domingo, and sweden and norway no incident has occurred which\ncalls for special comment. the recent opening of new lines of telegraphic\ncommunicatio...",3413,8,1,0,1,0,0,0,0,0,0,0,1,0,3,0,0,0,14
documents/sotu/1795_Washington.txt,"fellow-citizens of the senate and house of representatives:\n\ni trust i do not deceive myself when i indulge the persuasion that i have\nnever met you at any period when more than at the present the situation of\nour public affairs has afforded just cause for mutual congratulation, and\nfor inviting you to join with me in profound gratitude to the author of all\ngood for the numerous and extraordinary blessings we enjoy.\n\nthe termination of the long, expensive, and distressing war in whic...",1806,6,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,8
documents/sotu/1894_Cleveland.txt,"to the congress of the united states:\n\nthe assemblage within the nation's legislative halls of those charged with the\nduty of making laws for the benefit of a generous and free people impressively\nsuggests the exacting obligation and inexorable responsibility involved in\ntheir task. at the threshold of such labor now to be undertaken by the\ncongress of the united states, and in the discharge of an executive duty\nenjoined by the constitution, i submit this communication, containing a b...",14509,42,0,6,1,0,3,0,0,0,0,0,3,15,18,0,1,0,89
documents/sotu/1850_Fillmore.txt,"fellow-citizens of the senate and of the house of representatives:\n\nbeing suddenly called in the midst of the last session of congress by a\npainful dispensation of divine providence to the responsible station which\ni now hold, i contented myself with such communications to the legislature\nas the exigency of the moment seemed to require. the country was shrouded\nin mourning for the loss of its venerable chief magistrate and all hearts\nwere penetrated with grief. neither the time nor th...",7605,12,0,0,2,0,1,0,0,0,0,0,0,3,8,0,0,0,26
documents/sotu/1889_Harrison.txt,"to the senate and house of representatives:\n\nthere are few transactions in the administration of the government that are\neven temporarily held in the confidence of those charged with the conduct\nof the public business. every step taken is under the observation of an\nintelligent and watchful people. the state of the union is known from day\nto day, and suggestions as to needed legislation find an earlier voice than\nthat which speaks in these annual communications of the president to\nco...",11861,24,2,3,2,0,5,0,1,0,0,0,3,4,6,0,3,0,53


### Counts of any keywords (1 if any, 0 if none)

In [17]:
def not_zero(number):
    if number == 0:
        return 0
    else:
        return 1

In [18]:
all_texts_df['all keyword count'].apply(not_zero)

documents/sotu/1912_Taft.txt         1
documents/sotu/1907_Roosevelt.txt    1
documents/sotu/1899_McKinley.txt     1
documents/sotu/1913_Wilson.txt       1
documents/sotu/1858_Buchanan.txt     1
                                    ..
documents/sotu/1876_Grant.txt        1
documents/sotu/1805_Jefferson.txt    1
documents/sotu/1864_Lincoln.txt      1
documents/sotu/1869_Grant.txt        1
documents/sotu/1840_Buren.txt        1
Name: all keyword count, Length: 190, dtype: int64

In [19]:
all_texts_df['has any keyword'] = all_texts_df['all keyword count'].apply(not_zero)
all_texts_df

Unnamed: 0,text,word count,war,fight,battle,conflict,clash,defen,ceasefire,attack,...,front line,frontline,troop,army,navy,air force,marine,bomb,all keyword count,has any keyword
documents/sotu/1912_Taft.txt,"part i\n\nto the senate and house of representatives:\n\nthe foreign relations of the united states actually and potentially affect\nthe state of the union to a degree not widely realized and hardly surpassed\nby any other factor in the welfare of the whole nation. the position of the\nunited states in the moral, intellectual, and material relations of the\nfamily of nations should be a matter of vital interest to every patriotic\ncitizen. the national prosperity and power impose upon us dut...",22933,59,0,8,4,0,10,0,2,...,0,0,4,31,23,0,8,1,150,1
documents/sotu/1907_Roosevelt.txt,"to the senate and house of representatives:\n\nno nation has greater resources than ours, and i think it can be\ntruthfully said that the citizens of no nation possess greater energy\nand industrial ability. in no nation are the fundamental business\nconditions sounder than in ours at this very moment; and it is foolish,\nwhen such is the case, for people to hoard money instead of keeping it\nin sound banks; for it is such hoarding that is the immediate occasion\nof money stringency. moreove...",24968,80,2,17,2,0,5,0,3,...,0,0,1,36,31,0,2,0,179,1
documents/sotu/1899_McKinley.txt,"to the senate and house of representatives:\n\nat the threshold of your deliberations you are called to mourn with your\ncountrymen the death of vice-president hobart, who passed from this life on\nthe morning of november 21 last. his great soul now rests in eternal peace.\nhis private life was pure and elevated, while his public career was ever\ndistinguished by large capacity, stainless integrity, and exalted motives.\nhe has been removed from the high office which he honored and dignified...",13779,35,0,1,3,0,1,0,0,...,0,0,4,14,9,0,6,0,73,1
documents/sotu/1913_Wilson.txt,"gentlemen of the congress:\n\nin pursuance of my constitutional duty to ""give to the congress information\nof the state of the union,"" i take the liberty of addressing you on several\nmatters which ought, as it seems to me, particularly to engage the\nattention of your honorable bodies, as of all who study the welfare and\nprogress of the nation.\n\ni shall ask your indulgence if i venture to depart in some degree from the\nusual custom of setting before you in formal review the many matters...",3265,7,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,8,1
documents/sotu/1858_Buchanan.txt,"fellow-citizens of the senate and house of representatives:\n\nwhen we compare the condition of the country at the present day with what\nit was one year ago at the meeting of congress, we have much reason for\ngratitude to that almighty providence which has never failed to interpose\nfor our relief at the most critical periods of our history. one year ago\nthe sectional strife between the north and the south on the dangerous\nsubject of slavery had again become so intense as to threaten the...",14946,34,0,0,1,0,1,0,2,...,0,0,7,5,6,0,0,0,56,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
documents/sotu/1876_Grant.txt,"to the senate and house of representatives:\n\nin submitting my eighth and last annual message to congress it seems proper\nthat i should refer to and in some degree recapitulate the events and\nofficial acts of the past eight years.\n\nit was my fortune, or misfortune, to be called to the office of chief\nexecutive without any previous political training. from the age of 17 i had\nnever even witnessed the excitement attending a presidential campaign but\ntwice antecedent to my own candidacy...",6197,21,0,0,1,0,1,0,0,...,0,0,2,6,7,0,0,0,38,1
documents/sotu/1805_Jefferson.txt,"the senate and house of representatives of the united states:\n\nat a moment when the nations of europe are in commotion and arming against\neach other, and when those with whom we have principal intercourse are\nengaged in the general contest, and when the countenance of some of them\ntoward our peaceable country threatens that even that may not be unaffected\nby what is passing on the general theater, a meeting of the representatives\nof the nation in both houses of congress has become mor...",2698,12,0,0,1,0,2,0,1,...,0,0,2,0,0,0,0,0,18,1
documents/sotu/1864_Lincoln.txt,"fellow-citizens of the senate and house of representatives:\n\nagain the blessings of health and abundant harvests claim our\nprofoundest gratitude to almighty god.\n\nthe condition of our foreign affairs is reasonably satisfactory.\n\nmexico continues to be a theater of civil war. while our political\nrelations with that country have undergone no change, we have at the\nsame time strictly maintained neutrality between the belligerents.\n\nat the request of the states of costa rica and nicar...",5424,36,1,2,0,0,1,0,0,...,0,0,0,4,11,0,0,0,55,1
documents/sotu/1869_Grant.txt,"to the senate and house of representatives:\n\nin coming before you for the first time as chief magistrate of this great\nnation, it is with gratitude to the giver of all good for the many benefits\nwe enjoy. we are blessed with peace at home, and are without entangling\nalliances abroad to forebode trouble; with a territory unsurpassed in\nfertility, of an area equal to the abundant support of 500,000,000 people,\nand abounding in every variety of useful mineral in quantity sufficient to\ns...",7040,25,0,0,1,0,0,0,0,...,0,0,3,8,3,0,2,0,42,1


### Export to Excel

In [20]:
all_texts_df.to_excel(results_filename)