In [1]:
# importing libraries

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.corpus import inaugural

### Importing Data

The data used is the NLTK Inaugural Address Corpus, specifically the inaugural speeches of Obama (2009) and Trump (2017). These two speeches were chosen because both address national affairs, so they are likely to share vocabulary.

In [2]:
# Load each speech from the inaugural corpus as a sequence of word tokens.
obama = inaugural.words(fileids='2009-Obama.txt')
trump = inaugural.words(fileids='2017-Trump.txt')

# Re-join the tokens with single spaces and lowercase them in one step, so
# later comparisons are case-insensitive.
X = " ".join(obama).lower()
Y = " ".join(trump).lower()

### Calculate Cosine Similarity

The function defined below performs the following steps: 
- Tokenise the words
- Remove stopwords
- Remove duplicate words
- Vectorise each document as a binary (present/absent) vector over the combined vocabulary
- Calculate the Cosine Similarity

In [3]:
def processWords(X,Y):
    """Print and return the binary cosine similarity of two texts.

    Each text is tokenised with ``word_tokenize``, English stop words are
    dropped, and the remaining tokens are reduced to a set of distinct
    tokens. The two sets are then compared as 0/1 presence vectors over
    their combined vocabulary.

    Parameters
    ----------
    X, Y : str
        The two texts to compare. No case folding is done here; callers
        should lowercase beforehand if they want case-insensitive matching.

    Returns
    -------
    float
        ``|X ∩ Y| / sqrt(|X| * |Y|)`` over the distinct non-stop-word
        tokens, or ``0.0`` when either text has no tokens left after
        stop-word removal (the cosine is undefined there, and the original
        code raised ``ZeroDivisionError``).
    """
    # A set gives constant-time membership tests; the corpus reader hands
    # back a list, which would be scanned linearly for every token.
    sw = set(stopwords.words('english'))

    # Distinct non-stop-word tokens of each text.
    X_set = {w for w in word_tokenize(X) if w not in sw}
    Y_set = {w for w in word_tokenize(Y) if w not in sw}

    # For 0/1 vectors the dot product is the number of shared tokens and
    # each squared norm is the number of distinct tokens, so no explicit
    # vectors are needed.
    shared = len(X_set & Y_set)
    norm = float((len(X_set) * len(Y_set)) ** 0.5)

    # Guard the degenerate case where one side is empty.
    cosine = shared / norm if norm else 0.0
    print("similarity: ", cosine)
    return cosine

In [4]:
# Baseline: similarity of the two lowercased speeches, without stemming.
processWords(X,Y)

similarity:  0.279620880680984


### Stemming Method

The stemming function splits each input string on spaces and applies the Lancaster stemmer to every resulting token, returning the stemmed tokens re-joined into a string.

In [5]:
def stemmingProc(X,Y):
    """Stem every space-separated token of two strings with the Lancaster stemmer.

    Each string is split on single spaces, every piece is passed through
    ``LancasterStemmer.stem``, and the stems are written back out with one
    space after each stem, so each result ends with a trailing space.

    Returns
    -------
    tuple of (str, str)
        The stemmed versions of ``X`` and ``Y``, in that order.
    """
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()

    def _stem_text(text):
        # One space follows every stem, including the last one.
        return "".join(stemmer.stem(token) + " " for token in text.split(" "))

    return _stem_text(X), _stem_text(Y)

In [6]:
# Stem both lowercased speeches; each result is a single space-joined string.
X_ans,Y_ans=stemmingProc(X,Y)

In [7]:
# Similarity of the stemmed speeches, for comparison with the unstemmed baseline.
processWords(X_ans,Y_ans)

similarity:  0.38593973808344356


## Inference

The cosine similarity of the documents **increases** when the stemming algorithm is applied before calculating the cosine similarity.

- Similarity before stemming  : 0.2796
- Similarity after stemming   : 0.3859