In [1]:
#a function for cleaning up and counting the words of text data from multiple source files
def popular_words():
    """Count stemmed, non-stopword tokens across the ``*.txt`` files in the current directory.

    Every line of every matching file is stripped of HTML tags, each run of
    characters other than ASCII letters and digits is collapsed to a single
    space, and the result is lower-cased.  Tokens that appear in NLTK's
    English stopword list are dropped; the remaining tokens are reduced to
    their Porter stem and tallied.

    Returns:
        list[tuple[str, int]]: ``(stem, count)`` pairs sorted by descending
        count.  Stems with equal counts keep the order in which they were
        first encountered.

    Requires NLTK together with its ``stopwords`` corpus.
    """
    # Imports stay function-local, matching the original notebook cell.
    import glob
    import re
    from collections import Counter

    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer  # reduces words to their root form

    tag_pattern = re.compile(r'<[^<]+?>')
    non_alnum_pattern = re.compile(r'[^a-zA-Z0-9]+')

    # Loop invariants: the stopword set used to be rebuilt for every single
    # word and the stemmer for every line, which dominated the running time.
    english_stopwords = frozenset(stopwords.words('english'))
    stemmer = PorterStemmer()

    counts = Counter()
    for filename in glob.glob("*.txt"):
        # Any character outside [a-zA-Z0-9] is discarded by the regex below, so
        # undecodable bytes can safely be replaced instead of aborting the run.
        with open(filename, encoding="utf-8", errors="replace") as text_file:
            for line in text_file:
                line = tag_pattern.sub('', line)
                line = non_alnum_pattern.sub(' ', line).lower()
                # NOTE(review): HTML entities are not decoded, so "&amp;"
                # presumably surfaces as the token 'amp' in the results.
                # Consider html.unescape() here; left unchanged so previously
                # recorded counts remain reproducible.
                for word in line.split():
                    # Stopwords are matched on the raw token, before stemming.
                    if word not in english_stopwords:
                        counts[stemmer.stem(word)] += 1

    # most_common() sorts by descending count and keeps first-seen order for ties.
    return counts.most_common()

# Run the word count over the *.txt files in the current working directory.
# As the last expression in the cell, the returned list of (word, count)
# tuples is what the notebook displays below.
popular_words()

[('call', 19),
 ('amp', 18),
 ('girl', 17),
 ('asian', 16),
 ('new', 14),
 ('bodi', 12),
 ('real', 11),
 ('hot', 9),
 ('relax', 9),
 ('sexi', 9),
 ('time', 9),
 ('come', 8),
 ('guarante', 7),
 ('100', 7),
 ('special', 7),
 ('sweet', 6),
 ('pleas', 6),
 ('massag', 6),
 ('7', 6),
 ('text', 6),
 ('nice', 6),
 ('avail', 6),
 ('best', 5),
 ('5', 5),
 ('24', 5),
 ('beauti', 5),
 ('clean', 5),
 ('incal', 5),
 ('day', 5),
 ('satisfi', 5),
 ('look', 5),
 ('sensual', 5),
 ('alway', 5),
 ('block', 5),
 ('im', 5),
 ('pic', 4),
 ('u', 4),
 ('back', 4),
 ('play', 4),
 ('rush', 4),
 ('pictur', 4),
 ('2', 4),
 ('x', 4),
 ('nude', 4),
 ('nuru', 4),
 ('skill', 4),
 ('ot', 4),
 ('readi', 4),
 ('wait', 4),
 ('let', 4),
 ('perfect', 4),
 ('make', 4),
 ('discreet', 4),
 ('servic', 4),
 ('334', 3),
 ('10', 3),
 ('donat', 3),
 ('200', 3),
 ('hr', 3),
 ('place', 3),
 ('spa', 3),
 ('455', 3),
 ('open', 3),
 ('shower', 3),
 ('leav', 3),
 ('hey', 3),
 ('fantasi', 3),
 ('provid', 3),
 ('get', 3),
 ('experi', 3),
 

##### Results displayed as a list of (stemmed word, count) tuples, sorted by descending count

In [None]:
'''[('call', 19),
 ('amp', 18),
 ('girl', 17),
 ('asian', 16),
 ('new', 14),
 ('bodi', 12),
 ('real', 11),
 ('hot', 9),
 ('relax', 9),
 ('sexi', 9),
 ('time', 9),
 ('come', 8),
 ('guarante', 7),
 ('100', 7),
 ('special', 7),
 ('sweet', 6),
 ('pleas', 6),
 ('massag', 6),
 ('7', 6),
 ('text', 6),
 ('nice', 6),
 ('avail', 6),
 ('best', 5),
 ('5', 5),
 ('24', 5),
 ('beauti', 5),
 ('clean', 5),
 ('incal', 5),
 ('day', 5),
 ('satisfi', 5),
 ('look', 5),
 ('sensual', 5),
 ('alway', 5),
 ('block', 5),
 ('im', 5),
 ('pic', 4),
 ('u', 4),
 ('back', 4),
 ('play', 4),
 ('rush', 4),
 ('pictur', 4),
 ('2', 4),
 ('x', 4),
 ('nude', 4),
 ('nuru', 4),
 ('skill', 4),
 ('ot', 4),
 ('readi', 4),
 ('wait', 4),
 ('let', 4),
 ('perfect', 4),
 ('make', 4),
 ('discreet', 4),
 ('servic', 4),
 ('334', 3),
 ('10', 3),
 ('donat', 3),
 ('200', 3),
 ('hr', 3),
 ('place', 3),
 ('spa', 3),
 ('455', 3),
 ('open', 3),
 ('shower', 3),
 ('leav', 3),
 ('hey', 3),
 ('fantasi', 3),
 ('provid', 3),
 ('get', 3),
 ('experi', 3),
 ('need', 3),
 ('outcal', 3),
 ('com', 3),
 ('japanes', 3),
 ('korean', 3),
 ('eleg', 3),
 ('like', 3),
 ('ladi', 3),
 ('unforgett', 3),
 ('take', 3),
 ('care', 3),
 ('session', 3),
 ('fetish', 3),
 ('vip', 3),
 ('escort', 3),
 ('well', 3),
 ('925', 2),
 ('0752', 2),
 ('locat', 2),
 ('33', 2),
 ('love', 2),
 ('cici', 2),
 ('natur', 2),
 ('34d', 2),
 ('exclus', 2),
 ('environ', 2),
 ('respect', 2),
 ('00pm', 2),
 ('140', 2),
 ('arriv', 2),
 ('pine', 2),
 ('fantast', 2),
 ('678', 2),
 ('8840', 2),
 ('town', 2),
 ('relief', 2),
 ('upscal', 2),
 ('facil', 2),
 ('refresh', 2),
 ('promis', 2),
 ('b', 2),
 ('map', 2),
 ('sister', 2),
 ('free', 2),
 ('desir', 2),
 ('met', 2),
 ('us', 2),
 ('ask', 2),
 ('great', 2),
 ('stop', 2),
 ('2girl', 2),
 ('contact', 2),
 ('kisskisspop', 2),
 ('212', 2),
 ('websit', 2),
 ('video', 2),
 ('chines', 2),
 ('style', 2),
 ('bodyrub', 2),
 ('24hr', 2),
 ('life', 2),
 ('408', 2),
 ('san', 2),
 ('jose', 2),
 ('guy', 2),
 ('347', 2),
 ('woman', 2),
 ('give', 2),
 ('set', 2),
 ('tiffani', 2),
 ('85o', 2),
 ('380', 2),
 ('person', 2),
 ('keep', 2),
 ('littl', 2),
 ('seriou', 2),
 ('someth', 2),
 ('independ', 2),
 ('soft', 2),
 ('atmospher', 2),
 ('g', 2),
 ('man', 2),
 ('satisfact', 2),
 ('4', 2),
 ('choos', 2),
 ('908', 2),
 ('688', 2),
 ('0666', 2),
 ('559', 2),
 ('835', 2),
 ('9240', 2),
 ('ny', 2),
 ('welcom', 2),
 ('amaz', 2),
 ('eye', 2),
 ('true', 2),
 ('http', 2),
 ('60', 2),
 ('bit', 2),
 ('good', 2),
 ('openmind', 2),
 ('l', 2),
 ('r', 2),
 ('unrush', 2),
 ('fun', 2),
 ('izzibella', 2),
 ('review', 2),
 ('offer', 2),
 ('sizzil', 1),
 ('brand', 1),
 ('yg', 1),
 ('super', 1),
 ('star', 1),
 ('susan', 1),
 ('23', 1),
 ('athelat', 1),
 ('frame', 1),
 ('32c', 1),
 ('22', 1),
 ('27', 1),
 ('magic', 1),
 ('hand', 1),
 ('big', 1),
 ('breast', 1),
 ('polit', 1),
 ('gentlemen', 1),
 ('00am', 1),
 ('11', 1),
 ('2xvip', 1),
 ('160', 1),
 ('half', 1),
 ('break', 1),
 ('negoci', 1),
 ('trade', 1),
 ('anykind', 1),
 ('plain', 1),
 ('view', 1),
 ('upon', 1),
 ('health', 1),
 ('hottest', 1),
 ('today', 1),
 ('9am', 1),
 ('dri', 1),
 ('sauna', 1),
 ('stress', 1),
 ('art', 1),
 ('feel', 1),
 ('complet', 1),
 ('renew', 1),
 ('reliev', 1),
 ('pamper', 1),
 ('102', 1),
 ('coloni', 1),
 ('park', 1),
 ('dr', 1),
 ('ste', 1),
 ('500', 1),
 ('c', 1),
 ('um', 1),
 ('ga', 1),
 ('30040', 1),
 ('googl', 1),
 ('ky', 1),
 ('limit', 1),
 ('boy', 1),
 ('fullfil', 1),
 ('wildest', 1),
 ('friend', 1),
 ('drama', 1),
 ('mind', 1),
 ('tri', 1),
 ('anyth', 1),
 ('enjoy', 1),
 ('watch', 1),
 ('happi', 1),
 ('money', 1),
 ('513', 1),
 ('658', 1),
 ('1473', 1),
 ('eboni', 1),
 ('cuti', 1),
 ('fella', 1),
 ('know', 1),
 ('miss', 1),
 ('chocol', 1),
 ('carmen', 1),
 ('323', 1),
 ('7666', 1),
 ('game', 1),
 ('photo', 1),
 ('tableshow', 1),
 ('354', 1),
 ('1279', 1),
 ('see', 1),
 ('shot', 1),
 ('midtown', 1),
 ('43', 1),
 ('street', 1),
 ('ave', 1),
 ('9girl', 1),
 ('tabl', 1),
 ('steami', 1),
 ('bodyslid', 1),
 ('extra', 1),
 ('beyond', 1),
 ('fusion', 1),
 ('bring', 1),
 ('sooth', 1),
 ('ach', 1),
 ('nourish', 1),
 ('soul', 1),
 ('rainbow', 1),
 ('favorit', 1),
 ('ambianc', 1),
 ('rejuven', 1),
 ('spirit', 1),
 ('scam', 1),
 ('tel', 1),
 ('448', 1),
 ('6868', 1),
 ('daili', 1),
 ('10am', 1),
 ('10pm', 1),
 ('week', 1),
 ('direct', 1),
 ('4650', 1),
 ('pearl', 1),
 ('avenu', 1),
 ('ca', 1),
 ('95136', 1),
 ('lo', 1),
 ('gato', 1),
 ('campbel', 1),
 ('santa', 1),
 ('clara', 1),
 ('saratoga', 1),
 ('40', 1),
 ('dominicana', 1),
 ('hola', 1),
 ('masiel', 1),
 ('phone', 1),
 ('299', 1),
 ('4436', 1),
 ('9990', 1),
 ('never', 1),
 ('amswer', 1),
 ('ok', 1),
 ('exactli', 1),
 ('longer', 1),
 ('7078150278', 1),
 ('sarah', 1),
 ('n0w', 1),
 ('9045', 1),
 ('38o', 1),
 ('9o45', 1),
 ('carvisit', 1),
 ('passion', 1),
 ('personalizecompanionship', 1),
 ('finest', 1),
 ('classi', 1),
 ('young', 1),
 ('bubbl', 1),
 ('eas', 1),
 ('got', 1),
 ('tight', 1),
 ('curv', 1),
 ('right', 1),
 ('explicit', 1),
 ('talk', 1),
 ('4255985946', 1),
 ('class', 1),
 ('stylish', 1),
 ('tender', 1),
 ('skin', 1),
 ('companion', 1),
 ('warm', 1),
 ('proud', 1),
 ('figur', 1),
 ('425', 1),
 ('598', 1),
 ('5946', 1),
 ('cr', 1),
 ('curvac', 1),
 ('freaki', 1),
 ('everi', 1),
 ('mission', 1),
 ('accomplish', 1),
 ('experienc', 1),
 ('far', 1),
 ('reserv', 1),
 ('kelli', 1),
 ('8432899488', 1),
 ('8', 1),
 ('therapist', 1),
 ('erot', 1),
 ('mutual', 1),
 ('touch', 1),
 ('roleplay', 1),
 ('minut', 1),
 ('away', 1),
 ('newark', 1),
 ('airport', 1),
 ('appoint', 1),
 ('voluptu', 1),
 ('marina', 1),
 ('myale', 1),
 ('babi', 1),
 ('curvi', 1),
 ('juici', 1),
 ('pleasur', 1),
 ('breath', 1),
 ('1hour', 1),
 ('120', 1),
 ('30min', 1),
 ('15min100', 1),
 ('treat', 1),
 ('law', 1),
 ('enforc', 1),
 ('babe', 1),
 ('5598359240', 1),
 ('greet', 1),
 ('smile', 1),
 ('unmatch', 1),
 ('qualiti', 1),
 ('much', 1),
 ('better', 1),
 ('www', 1),
 ('nyvipasian', 1),
 ('646', 1),
 ('596', 1),
 ('5068', 1),
 ('show', 1),
 ('taken', 1),
 ('cold', 1),
 ('night', 1),
 ('lollipop', 1),
 ('also', 1),
 ('kimmi', 1),
 ('shi', 1),
 ('say', 1),
 ('hi', 1),
 ('compani', 1),
 ('sure', 1),
 ('disappoint', 1),
 ('one', 1),
 ('laugh', 1),
 ('3', 1),
 ('lb', 1),
 ('36c', 1),
 ('blue', 1),
 ('green', 1),
 ('light', 1),
 ('brown', 1),
 ('hair', 1),
 ('friendli', 1),
 ('inquir', 1),
 ('4015451501', 1),
 ('honey', 1),
 ('lisa', 1),
 ('genuin', 1),
 ('extrem', 1),
 ('proven', 1),
 ('stand', 1),
 ('info', 1),
 ('991', 1),
 ('3881', 1),
 ('deep', 1),
 ('french', 1),
 ('kiss', 1),
 ('f', 1),
 ('140hr', 1),
 ('120hhr', 1),
 ('sorri', 1),
 ('black', 1),
 ('gentleman', 1),
 ('pl', 1),
 ('biggest', 1),
 ('agent', 1),
 ('york', 1),
 ('citi', 1),
 ('china', 1),
 ('japan', 1),
 ('korea', 1),
 ('hong', 1),
 ('kong', 1),
 ('taiwan', 1),
 ('train', 1),
 ('differ', 1),
 ('requir', 1),
 ('area', 1),
 ('includ', 1),
 ('manhattan', 1),
 ('queen', 1),
 ('brooklyn', 1),
 ('long', 1),
 ('island', 1),
 ('jersey', 1),
 ('rate', 1),
 ('nypinkasian', 1),
 ('7outcal', 1),
 ('7335', 1),
 ('student', 1),
 ('parti', 1),
 ('read', 1),
 ('ad', 1),
 ('rn', 1),
 ('ng', 1),
 ('ulti', 1),
 ('fr', 1),
 ('k', 1),
 ('juic', 1),
 ('ty', 1),
 ('companionship', 1),
 ('total', 1),
 ('packag', 1),
 ('secret', 1),
 ('recent', 1),
 ('hygien', 1),
 ('must', 1),
 ('diamond', 1),
 ('570', 1),
 ('415', 1),
 ('9143', 1),
 ('handl', 1),
 ('next', 1),
 ('door', 1),
 ('wild', 1),
 ('side', 1),
 ('find', 1),
 ('italian', 1),
 ('milf', 1),
 ('lot', 1),
 ('headach', 1),
 ('safe', 1),
 ('role', 1),
 ('125lb', 1),
 ('ght', 1),
 ('tan', 1),
 ('privat', 1),
 ('8134082387', 1)]'''