# Information Retrieval: Programming Assignment \#1

### Sheetal Parikh
EN.605.744.81<br>
September 7, 2021
***
***

## Source Code and Notes

In [1]:
#imports for notebook
import os 
import re
import nltk
import os
import string
import numpy as np
import pandas as pd
import re
import math
#import num2words
#pip install num2words

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')

# change the current directory 
# to specified directory 
os.chdir(r"C:\Users\Sheetal\Documents\Sheetal\datasets") 

#checking current directory
#print(os.getcwd() + "\n")

#Read in file
#path1 = 'datasets/headlines.txt'
#path2 = 'datasets/yelp.txt'

In [2]:
#the raw text file includes HTML headings and tags - the function below will remove these tags

def cleanHtmlTags(raw_text):
    clean = re.compile('<.*?>')
    fixedText = re.sub(clean, '', raw_text)
    return fixedText

In [3]:
#this will open/close read the text file, clean the HTML tags, and copy the resulting data into a list

def openFile(textFile):
    content = []
    with open(textFile, "r") as f:
        for line in f:
            text = line.strip()
            cleantext = cleanHtmlTags(text)
            content.append(cleantext)
    return content

In [4]:
# copying the list into a dataframe so that it is easier to use

def createDataFrame(data):
    #removing extra spacing
    data = list(filter(None,data))
    
    #the dictionary column will contain the original wording of each document/paragraph
    df = pd.DataFrame(np.array(data), columns = ['dictionary'])
    return df

In [5]:
#will convert contractions into it's original word combination

def removeContractions(s):
    s = re.sub(r"can\'t", "can not",s)
    s = re.sub(r"couldn't", "can not",s)
    s = re.sub(r"won't", "will not",s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"wouldn't", "would not",s)
    s = re.sub(r"\'d", "would",s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'re", " are", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    s = re.sub(r"n\'t", " not", s)
    s = re.sub(r"\'t", " not", s)
    return s

In [6]:
#def stemming(data):
#    stemmer= PorterStemmer()
#    
#    tokens = word_tokenize(str(data))
#    new_text = ""
#    for w in tokens:
#        new_text = new_text + " " + stemmer.stem(w)
#    return new_text

In [7]:
#preprocessing data - making text all lowercase, removing contractions, removing punctuation, lemmatizing text
## each preprocessing step will add a column to the dataframe showing the updated text
###there is a column showing the text if stop words were removed - this column is not used other than for calculating
####the dictionary percentage in the function below

def preprocess(df):
    #lowercase
    df['fix_lowercase'] = df['dictionary'].apply(lambda x: " ".join(word.lower() for word in x.split()))
    
    #contractions
    df['fix_contractions'] = df['fix_lowercase'].apply(lambda x:removeContractions(x))
    
    #punctuation
    df['fix_punctuation'] = df['fix_contractions'].str.replace('[^\w\s]', '')
    
    #lemmatized
    lemmatizer = WordNetLemmatizer()
    df['lemmatized'] = df['fix_punctuation'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
    
    stop_words = stopwords.words('english')
    #df['num_stopwords'] = df['lemmatized'].apply(lambda x: len([word for word in x.split() if word in stop_words]))
    df['fix_stopwords'] = df['lemmatized'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

    return df
    

In [8]:
#adds the analysis steps as columns to the dataframe
##columns will contain the word count(after preprocessing), number of stopwords, number of unique words, and percentage
###of words in the document that are dictionary words(or unique words divided by the word count)

def analyze(df):
    stop_words = stopwords.words('english')
    
    #column for number of stopwords
    df['num_stopwords'] = df['lemmatized'].apply(lambda x: len([word for word in x.split() if word in stop_words]))
    
    #word count - using the final preprocessed text
    words = df['lemmatized'].str.lower().str.split()
    df['word_count'] = words.apply(len)
    
    #unique word count
    noStopWords = df['fix_stopwords'].str.lower().str.split()
    df['unique_words'] = noStopWords.apply(set).apply(len)
    
    #dictionary percentage or the % of unique words in the document
    percent_column = df['unique_words'] / df['word_count']
    df['dictionary_percentage'] = percent_column
    
    return df

In [9]:
def frequentWords(df):
    #getting a list of the total counts of each word in the preprocessed text - in descending order
    s = pd.Series(" ".join(df['lemmatized']).split()).value_counts()
    df1 = s.to_frame()
    
    #extracting the word from the index
    index = df1.index
    allWords = list(index)

    #extracting the total word count
    counts = df1.iloc[:,0]
    countsList = list(counts)
    
    #getting the document set
    set = df['lemmatized']
    
    #creating a dictionary of the words and word count which is then converted to a dataframe
    dict = {'Word': allWords, 'Word_Count': countsList} 
    df_dict = pd.DataFrame(dict)
    
    #calculating the number of documents each word is found in and saving the word and document count to dictionary DocFreq
    wordsets = [ frozenset(document.split(' ')) for document in set ]
    DocFreq = {}
    for i in range(len(wordsets)):
        tokens = wordsets[i]
        for w in tokens:
            try:
                DocFreq[w].add(i)
            except:
                DocFreq[w] = {i}
    
    for i in DocFreq:
        DocFreq[i] = len(DocFreq[i])
    
    #organizing document frequency output into a dataframe
    df_DocFreq = pd.DataFrame(list(DocFreq.items()), columns=['Word', 'Document_Frequency'])
    df_output = df_DocFreq.sort_values(by='Document_Frequency', ascending = False)
    
    #merging results to the dataframe of total word counts using the word as the id
    df_final = pd.merge(df_output, df_dict, left_on = 'Word', right_on = 'Word', how ='left')
    
    return df_final

In [10]:
#displaying other stats of documents: total words, total unique words, total paragraphs, 500th word, 1000th word,5000th word

def calcStats(df):
    #total paragraphs/dictionaries processed
    total_paragraphs = len(df.index)
    
    #total words after proprocessing
    total_words = df['word_count'].sum()
    
    #total unique words
    total_unique = df['unique_words'].sum()
    
    #500th word
    fivehund = pd.Series(" ".join(df['lemmatized']).split()).value_counts()[499:500].index[0]
    df_500 = df[df['lemmatized'].str.contains(fivehund)]
    
    #1000th word
    thousand = pd.Series(" ".join(df['lemmatized']).split()).value_counts()[999:1000].index[0]
    df_1000 = df[df['lemmatized'].str.contains(thousand)]
    
    #5000th word
    fivethou = pd.Series(" ".join(df['lemmatized']).split()).value_counts()[4999:5000].index[0]
    df_5000 = df[df['lemmatized'].str.contains(fivethou)]
    
    #display results
    print(f'Total Paragraphs: {total_paragraphs}')
    print(f'Total Words: {total_words}')
    print(f'Total Unique Words: {total_unique}')
    print(f'500th Most Frequent Word: {fivehund}, # of times Seen: {len(df_500)}')
    print(f'1000th Most Frequent Word: {thousand}, # of times Seen: {len(df_1000)}')
    print(f'5000th Most Frequent Word: {fivethou}, # of times Seen: {len(df_5000)}')
    

In [11]:
#from IPython.display import display
#def displayMostFrequent(df):
#    with pd.option_context('display.max_rows', 100, 'display.max_columns', None):
#        print("The 100 most frequent words: ")
#        display(df) 

In [12]:
#displaying the word, document frequency and word count filtered by a specific document frequency
def documentFreq(df, x):
    df_mask=df['Document_Frequency']== x
    filtered_df = df[df_mask]
    return filtered_df

### Yelp Results

In [13]:
content = openFile("yelp.txt")

In [14]:
df_yelp = createDataFrame(content)
df_yelp = preprocess(df_yelp)

In [15]:
df_yelp = analyze(df_yelp)
df_yelp

Unnamed: 0,dictionary,fix_lowercase,fix_contractions,fix_punctuation,lemmatized,fix_stopwords,num_stopwords,word_count,unique_words,dictionary_percentage
0,Seen this restaurant on 25 best places in Pitt...,seen this restaurant on 25 best places in pitt...,seen this restaurant on 25 best places in pitt...,seen this restaurant on 25 best places in pitt...,seen this restaurant on 25 best place in pitts...,seen restaurant 25 best place pittsburgh rick ...,32,68,34,0.500000
1,Grew up near here. the family would always go ...,grew up near here. the family would always go ...,grew up near here. the family would always go ...,grew up near here the family would always go o...,grew up near here the family would always go o...,grew near family would always go month stopped...,27,52,23,0.442308
2,I have never seen a restaurant that has a frow...,i have never seen a restaurant that has a frow...,i have never seen a restaurant that has a frow...,i have never seen a restaurant that has a frow...,i have never seen a restaurant that ha a frown...,never seen restaurant ha frowning brownie aka ...,163,338,131,0.387574
3,Stick to basics and this is the best place in ...,stick to basics and this is the best place in ...,stick to basics and this is the best place in ...,stick to basics and this is the best place in ...,stick to basic and this is the best place in o...,stick basic best place around burgh first time...,73,156,73,0.467949
4,I like the food at Denny's more than meals ser...,i like the food at denny's more than meals ser...,i like the food at denny is more than meals se...,i like the food at denny is more than meals se...,i like the food at denny is more than meal ser...,like food denny meal served pricier restaurant...,35,75,37,0.493333
...,...,...,...,...,...,...,...,...,...,...
8887,"Well, it's clear they are newly opened and the...","well, it's clear they are newly opened and the...","well, it is clear they are newly opened and th...",well it is clear they are newly opened and the...,well it is clear they are newly opened and the...,well clear newly opened staff working figure 2...,85,164,59,0.359756
8888,This place sucks! Wtf is this?! A fish Taco th...,this place sucks! wtf is this?! a fish taco th...,this place sucks! wtf is this?! a fish taco th...,this place sucks wtf is this a fish taco that ...,this place suck wtf is this a fish taco that i...,place suck wtf fish taco solely fish tortilla ...,133,246,82,0.333333
8889,I hope this place is just experiencing some hi...,i hope this place is just experiencing some hi...,i hope this place is just experiencing some hi...,i hope this place is just experiencing some hi...,i hope this place is just experiencing some hi...,hope place experiencing hiccup since openedbut...,51,121,59,0.487603
8890,Meh. I was less than impressed by my meal here...,meh. i was less than impressed by my meal here...,meh. i was less than impressed by my meal here...,meh i was less than impressed by my meal here ...,meh i wa le than impressed by my meal here it ...,meh wa le impressed meal cute concept sure goi...,61,123,53,0.430894


The table above summarizes much of the data that will be used for the calculations below.  The dictionary column is the original text and the subsequent 5 columns display the text changes after applying the normalization technique in the title. The lemmatized text was used for the calculations below.

In [16]:
calcStats(df_yelp)

Total Paragraphs: 8892
Total Words: 1286762
Total Unique Words: 541021
500th Most Frequent Word: girl, # of times Seen: 401
1000th Most Frequent Word: view, # of times Seen: 1108
5000th Most Frequent Word: remotely, # of times Seen: 10


In [17]:
df_yelp_freq = frequentWords(df_yelp)
#df_yelp_freq

In [18]:
df_yelp_100 = df_yelp_freq.head(100)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print("Top 100 Most Frequent Words: ")
    print(df_yelp_100)

Top 100 Most Frequent Words: 
          Word  Document_Frequency  Word_Count
0          the                8530       65257
1          and                8273       41324
2            a                8045       39029
3            i                7631       37811
4           to                7425       27964
5           is                7104       25401
6           it                6884       22204
7          not                6543       17225
8           of                6501       19543
9          for                6272       14782
10          wa                6043       23347
11          in                5966       13861
12         but                5559       10988
13        this                5389       10146
14        have                5208       10295
15        that                5081       11955
16        with                4983       10426
17          my                4871       10234
18        food                4811        8062
19          on                

In [19]:
df_yelp_freq

Unnamed: 0,Word,Document_Frequency,Word_Count
0,the,8530,65257
1,and,8273,41324
2,a,8045,39029
3,i,7631,37811
4,to,7425,27964
...,...,...,...
34358,thingbecause,1,1
34359,overhang,1,2
34360,policing,1,1
34361,experimented,1,1


In [20]:
filtered_df = documentFreq(df_yelp_freq, 1)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(f'Number of words with a document frequency of one: {len(filtered_df)}')
    print("")
    print(f' 100 words with a document frequency of one: ')
    print(filtered_df.head(100))

Number of words with a document frequency of one: 19716

 100 words with a document frequency of one: 
                     Word  Document_Frequency  Word_Count
14647                1027                   1           1
14648             babelon                   1           3
14649          epitomizes                   1           1
14650         loudspeaker                   1           1
14651            moralize                   1           1
14652        amphitheater                   1           1
14653               yimmy                   1           1
14654             willies                   1           1
14655         implication                   1           1
14656            breathes                   1           1
14657             ruglach                   1           1
14658             phyllis                   1           1
14659              noches                   1           1
14660               hidef                   1           1
14661       problemschewy  

### Headlines Results

In [21]:
content_2 = openFile("headlines.txt")

In [22]:
df_headlines = createDataFrame(content_2)
df_headlines = preprocess(df_headlines)

In [23]:
df_headlines = analyze(df_headlines)
df_headlines

Unnamed: 0,dictionary,fix_lowercase,fix_contractions,fix_punctuation,lemmatized,fix_stopwords,num_stopwords,word_count,unique_words,dictionary_percentage
0,Worcester breakfast club for veterans gives hu...,worcester breakfast club for veterans gives hu...,worcester breakfast club for veterans gives hu...,worcester breakfast club for veterans gives hu...,worcester breakfast club for veteran give hung...,worcester breakfast club veteran give hunger m...,2,10,8,0.800000
1,Jumpshot Gives Marketers Renewed Visibility In...,jumpshot gives marketers renewed visibility in...,jumpshot gives marketers renewed visibility in...,jumpshot gives marketers renewed visibility in...,jumpshot give marketer renewed visibility into...,jumpshot give marketer renewed visibility paid...,4,15,10,0.666667
2,This New Dating App Will Ruin Your Internet Game,this new dating app will ruin your internet game,this new dating app will ruin your internet game,this new dating app will ruin your internet game,this new dating app will ruin your internet game,new dating app ruin internet game,3,9,6,0.666667
3,Pay up or face legal action: DBKL,pay up or face legal action: dbkl,pay up or face legal action: dbkl,pay up or face legal action dbkl,pay up or face legal action dbkl,pay face legal action dbkl,2,7,5,0.714286
4,"Euro up; USD, Pound and Yen down","euro up; usd, pound and yen down","euro up; usd, pound and yen down",euro up usd pound and yen down,euro up usd pound and yen down,euro usd pound yen,3,7,4,0.571429
...,...,...,...,...,...,...,...,...,...,...
499995,YWP: Apples,ywp: apples,ywp: apples,ywp apples,ywp apple,ywp apple,0,2,2,1.000000
499996,Microsoft Corporation's Latest Android App Cou...,microsoft corporation's latest android app cou...,microsoft corporation is latest android app co...,microsoft corporation is latest android app co...,microsoft corporation is latest android app co...,microsoft corporation latest android app could...,4,11,7,0.636364
499997,Crumbs! Use your loaf and save dough by recycl...,crumbs! use your loaf and save dough by recycl...,crumbs! use your loaf and save dough by recycl...,crumbs use your loaf and save dough by recycli...,crumb use your loaf and save dough by recyclin...,crumb use loaf save dough recycling bread say ...,3,13,10,0.769231
499998,New Living Wage 'to benefit one-in-three worki...,new living wage 'to benefit one-in-three worki...,new living wage noto benefit one-in-three wor...,new living wage noto benefit oneinthree worki...,new living wage noto benefit oneinthree workin...,new living wage noto benefit oneinthree workin...,0,8,8,1.000000


The table above summarizes much of the data that will be used for the calculations below. The dictionary column is the original text and the subsequent 5 columns display the text changes after applying the normalization technique in the title. The lemmatized text was used for the calculations below.

In [24]:
calcStats(df_headlines)

Total Paragraphs: 500000
Total Words: 4587539
Total Unique Words: 3512166
500th Most Frequent Word: agreement, # of times Seen: 1320
1000th Most Frequent Word: grow, # of times Seen: 5295
5000th Most Frequent Word: surrender, # of times Seen: 127


In [25]:
df_head_freq = frequentWords(df_headlines)

In [26]:
df_head_100 = df_head_freq.head(100)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print("Top 100 Most Frequent Words: ")
    print(df_head_100)

Top 100 Most Frequent Words: 
         Word  Document_Frequency  Word_Count
0          to              109786    118824.0
1          in               83834     88373.0
2         the               72269     83558.0
3          of               70060     76129.0
4         for               65773     67595.0
5          is               56690     61706.0
6         and               51216     55417.0
7           a               49498     54744.0
8          on               40822     41841.0
9        with               31320     31812.0
10         at               31141     31640.0
11        new               26321     26830.0
12       2015               20555     20903.0
13         by               16766     17339.0
14       from               16021     16160.0
15      after               12922     12967.0
16          u               11875     12001.0
17         it               11673     12132.0
18     market               11374     13187.0
19        not               10430     10635.0
20  

In [27]:
filtered_df2 = documentFreq(df_head_freq, 1)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(f'Number of words with a document frequency of one: {len(filtered_df2)}')
    print("")
    print(f' 100 words with a document frequency of one: ')
    print(filtered_df2.head(100))

Number of words with a document frequency of one: 85192

 100 words with a document frequency of one: 
                      Word  Document_Frequency  Word_Count
80310            prajapati                   1         1.0
80311              dinos22                   1         1.0
80312              rmb400m                   1         1.0
80313            clamoring                   1         1.0
80314              reporoa                   1         1.0
80315             brazoria                   1         1.0
80316                  x5r                   1         1.0
80317                 terk                   1         1.0
80318              kilcock                   1         1.0
80319             firstgen                   1         1.0
80320            pulsatile                   1         1.0
80321             soccerfa                   1         1.0
80322            walkabout                   1         1.0
80323              macnica                   1         1.0
80324       

### Summary and Write-Up

(a) Describe how you normalized the text and determined what a word is

The text was normalized first making all the words lowercase, substituting the contractions for the full word group, removing any punctuations and lemmatizing the text using the NLTK WordNetLemmatizer. Multiple columns of preprocessed data output displays how the text changes with each normalization step.  The remaining text was tokenized and each token was considered a word. 

Additional methods could have been stemming as well as substituting numbers for the written words.  
I attempted stemming using the PorterStemmmer however it resulting in overstemming where much of the words were cut off. Although stop words were not removed for this assignment, a column of the text after removing the stopwords were displayed.  Stop words could have been removed be as well as other very common words such as "when", "get", or "with" to allow for more important words to be in our dictionary. 

(b) summarize any similarities and differences in the top-100 terms from Yelp and Headlines

Similarities:
<br>
Since stopwords weren't removed, both datasets had stopwords such as "the", "and", "to", as well as others in the top 100 most frequent words. Most of the words included in the top 100 terms also were words of a shorter length.  Also, both lists of words included terms that were not complete words such as "ha" in Headlines and "col" in in Yelp.

Differences:
<br>
The top words in Yelp appear to be more food related such as "food", "restaurant", "ordered", "chicken", as well as other words. Also the Yelp terms, since they are from reviews, are more descriptive words.  However, the top words in Headlines include more numbers and more nouns.