In [13]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# Use the fully-qualified option key: pandas only resolves abbreviated keys by
# pattern matching, which can become ambiguous if similarly named options are added.
pd.set_option('display.max_colwidth', 100)

In [2]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def processdoc(document, stem):
    """Lower-case, tokenize, drop English stopwords, then stem or lemmatize.

    Parameters
    ----------
    document : str
        Raw text to normalise.
    stem : bool
        If truthy, each remaining token is reduced with the module-level
        Porter ``stemmer``; otherwise it is passed through the module-level
        WordNet ``lemmatizer``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    tokens = word_tokenize(document.lower())

    # Build the stopword set once per call. The original evaluated
    # stopwords.words('english') for every token and scanned the resulting
    # list linearly each time.
    english_stopwords = set(stopwords.words('english'))
    kept = [token for token in tokens if token not in english_stopwords]

    # stemming or lemmatization, chosen by the `stem` flag
    normalise = stemmer.stem if stem else lemmatizer.lemmatize
    return " ".join(normalise(token) for token in kept)

In [6]:
# Load the tab-separated SMS corpus; column names are supplied here
# rather than read from the file.
spam = pd.read_csv(
    "SMSSpamCollection.txt",
    sep="\t",
    names=["label", "message"],
)
spam.head()



Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [7]:
# Keep only the first 50 rows as a small working sample
spam = spam.head(50)

In [8]:
# Plain Python list of the message texts, in row order
messages = list(spam["message"])

In [11]:
# Run every message through the processdoc function with stemming enabled
processed_messages = list(map(lambda text: processdoc(text, stem=True), messages))

In [14]:
# Build the document-term matrix with a TF-IDF vectorizer: fit_transform is
# given the pre-processed messages and its result is kept in `bow_model`.
# `vectorizer` is kept as well because a later cell asks it for the feature names.
vectorizer = TfidfVectorizer()
bow_model = vectorizer.fit_transform(processed_messages)
# Printing the matrix lists its stored entries as "(row, column)  weight" pairs.
print(bow_model)

  (0, 136)	0.17268973181326394
  (0, 165)	0.2783983565162954
  (0, 233)	0.2783983565162954
  (0, 87)	0.2783983565162954
  (0, 44)	0.2783983565162954
  (0, 53)	0.2783983565162954
  (0, 143)	0.23287212987052175
  (0, 348)	0.2783983565162954
  (0, 169)	0.2783983565162954
  (0, 52)	0.2783983565162954
  (0, 71)	0.2783983565162954
  (0, 140)	0.25176722112918554
  (0, 34)	0.2783983565162954
  (0, 330)	0.2783983565162954
  (1, 220)	0.4017341108428282
  (1, 170)	0.4343305520327375
  (1, 164)	0.4343305520327375
  (1, 339)	0.4802726555443259
  (1, 221)	0.4802726555443259
  (2, 129)	0.15597999112748157
  (2, 114)	0.39799631049869955
  (2, 342)	0.19899815524934977
  (2, 80)	0.19899815524934977
  (2, 340)	0.17996231437533816
  (2, 116)	0.39799631049869955
  :	:
  (46, 216)	0.5117896425850017
  (47, 136)	0.3695791767553826
  (47, 112)	0.5388156049690961
  (47, 37)	0.4670114049941646
  (47, 118)	0.5958098048504885
  (48, 42)	0.31103854183504276
  (48, 355)	0.31103854183504276
  (48, 152)	0.31103854183

In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Dense view of the TF-IDF matrix with one labelled column per vocabulary term
df_model = pd.DataFrame(bow_model.toarray(), columns=feature_names)

In [16]:
# Display the dense TF-IDF table (one row per message, one column per vocabulary term)
df_model

Unnamed: 0,000,07732584351,08000930705,08002986030,08452810075over18,09061701461,100,11,12,150p,...,worri,www,xuhui,xxx,xxxmobilemovieclub,ye,yeah,yummi,yup,ú1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.198998,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.257878,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.231227,0.0,0.0,0.231227,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.217862,0.0,0.0,0.0,0.217862,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Description
You are given a set of documents in the code below. Calculate the tf-idf matrix and output the score of the term 'belt' in document two.

In [25]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# The five raw documents used for the exercise (document two is the one mentioning 'belt')
documents = [
    "The coach lumbered on again, with heavier wreaths of mist closing round it as it began the descent.",
    "The guard soon replaced his blunderbuss in his arm-chest, and, having looked to the rest of its contents, and having looked to the supplementary pistols that he wore in his belt, looked to a smaller chest beneath his seat, in which there were a few smith's tools, a couple of torches, and a tinder-box.",
    "For he was furnished with that completeness that if the coach-lamps had been blown and stormed out, which did occasionally happen, he had only to shut himself up inside, keep the flint and steel sparks well off the straw, and get a light with tolerable safety and ease (if he were lucky) in five minutes.",
    "Jerry, left alone in the mist and darkness, dismounted meanwhile, not only to ease his spent horse, but to wipe the mud from his face, and shake the wet out of his hat-brim, which might be capable of holding about half a gallon.",
    "After standing with the bridle over his heavily-splashed arm, until the wheels of the mail were no longer within hearing and the night was quite still again, he turned to walk down the hill.",
]


# preprocess document
def preprocess(document):
    """Lower-case a document, remove English stopwords and stem the rest.

    Parameters
    ----------
    document : str
        Raw text to normalise.

    Returns
    -------
    str
        The Porter-stemmed, non-stopword tokens joined by single spaces.
    """
    # change sentence to lower case and tokenize into words
    tokens = word_tokenize(document.lower())

    # remove stop words; the list is loaded once per call and held in a set,
    # whereas the original re-evaluated stopwords.words("english") for every
    # token and scanned the list linearly each time
    english_stopwords = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in english_stopwords]

    # stem
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in tokens]

    # join words to make sentence
    return " ".join(stems)

# preprocess documents using the preprocess function and store the documents again in a list
documents = [preprocess(document) for document in documents]

# create tf-idf matrix
vectorizer = TfidfVectorizer()
bow_model = vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
bow_df = pd.DataFrame(bow_model.toarray(), columns=feature_names)

# extract score: read it from the matrix instead of hard-coding a value copied
# from the display. Document two is row index 1; a KeyError here means 'belt'
# is missing from the learned vocabulary.
score = bow_df.loc[1, "belt"]

# print the score -- don't change the following piece of code, it's used to evaluate your code
print(round(score, 4))

0.175
