# Solutions to Checkpoint 5

## 1. Converting words or sentences into numeric vectors is fundamental when working with text data. To make sure you are solid on how these vectors work, please generate the tf-idf vectors for the last three sentences of the example we gave at the beginning of this checkpoint. If you are feeling uncertain, have your mentor walk you through it.

The last three sentences:
- 4 "The Lumberjack Song is the funniest Monty Python bit: I can't think of it without laughing."
- 5 "I would rather put strawberries on my ice cream for dessert, they have the best taste."
- 6 "The taste of caramel is a fantastic accompaniment to tasty mint ice cream."

In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
warnings.filterwarnings("ignore")

In [2]:
# The three sentences to vectorize (sentences 4-6 of the checkpoint example).
# NOTE(review): sentence_4 has no trailing period here, unlike the quoted
# sentence in the prose above and the `corpus` list used later -- confirm
# whether that is intentional (it changes the last token when splitting on
# spaces).
sentence_4 = "The Lumberjack Song is the funniest Monty Python bit: I can't think of it without laughing"
sentence_5 = "I would rather put strawberries on my ice cream for dessert, they have the best taste."
sentence_6 = "The taste of caramel is a fantastic accompaniment to tasty mint ice cream."

In [3]:
# Tokenize each sentence by splitting on single spaces. Punctuation stays
# attached to its word and case is preserved.
bagOfWords4, bagOfWords5, bagOfWords6 = (
    text.split(' ') for text in (sentence_4, sentence_5, sentence_6)
)

#### From Scratch

In [4]:
# Shared vocabulary: the set union of all three bags drops duplicate tokens.
uniqueWords = set().union(bagOfWords4, bagOfWords5, bagOfWords6)

In [5]:
# One count dict per sentence, each pre-filled with a zero for every word of
# the shared vocabulary so that all three dicts have the same keys.
numOfWords4 = dict.fromkeys(uniqueWords, 0)
numOfWords5 = dict.fromkeys(uniqueWords, 0)
numOfWords6 = dict.fromkeys(uniqueWords, 0)

# Tally how often each token occurs in its own sentence.
for counts, bag in (
    (numOfWords4, bagOfWords4),
    (numOfWords5, bagOfWords5),
    (numOfWords6, bagOfWords6),
):
    for token in bag:
        counts[token] += 1

In [6]:
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Each count in ``wordDict`` is divided by the number of tokens in
    ``bagOfWords``; the result maps the same words to those ratios.
    """
    total_tokens = float(len(bagOfWords))
    return {
        term: occurrences / total_tokens
        for term, occurrences in wordDict.items()
    }

In [7]:
# Term frequencies of each sentence, computed from its count dict and its
# token list.
tf4, tf5, tf6 = (
    computeTF(counts, bag)
    for counts, bag in (
        (numOfWords4, bagOfWords4),
        (numOfWords5, bagOfWords5),
        (numOfWords6, bagOfWords6),
    )
)

In [8]:
def computeIDF(documents):
    """Return the inverse document frequency of every word in the corpus.

    ``documents`` is a sequence of word -> count dicts, one per document.
    The IDF of a word is ``log(N / df)`` (natural logarithm), where ``N`` is
    the number of documents and ``df`` is the number of documents in which
    the word's count is positive.

    Every word that appears as a key in any document is included. An empty
    ``documents`` yields an empty dict, and a word whose count is zero in
    every document gets an IDF of 0.0 instead of raising ZeroDivisionError.
    """
    import math

    N = len(documents)

    # Document frequency: in how many documents does each word occur?
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    return {
        word: math.log(N / float(df)) if df else 0.0
        for word, df in docFrequency.items()
    }

In [9]:
# Inverse document frequencies, computed once over the count dicts of all
# three sentences.
idfs = computeIDF([numOfWords4, numOfWords5, numOfWords6])

In [10]:
def computeTFIDF(tfBagOfWords, idfs):
    """Return the tf-idf score of every word in ``tfBagOfWords``.

    Each term frequency is multiplied by the matching entry of ``idfs``;
    a word missing from ``idfs`` raises ``KeyError``.
    """
    return {
        term: frequency * idfs[term]
        for term, frequency in tfBagOfWords.items()
    }

In [11]:
# tf-idf vector of each sentence, all weighted by the shared idfs.
tfidf4, tfidf5, tfidf6 = (
    computeTFIDF(term_freqs, idfs) for term_freqs in (tf4, tf5, tf6)
)

# One row per sentence, one column per vocabulary word.
df = pd.DataFrame(data=[tfidf4, tfidf5, tfidf6])

In [12]:
df

Unnamed: 0,laughing,is,for,taste.,fantastic,Lumberjack,on,they,bit:,cream.,...,strawberries,funniest,accompaniment,it,my,tasty,of,best,taste,The
0,0.068663,0.025342,0.0,0.0,0.0,0.068663,0.0,0.0,0.068663,0.0,...,0.0,0.068663,0.0,0.068663,0.0,0.0,0.025342,0.0,0.0,0.025342
1,0.0,0.0,0.068663,0.068663,0.0,0.0,0.068663,0.068663,0.0,0.0,...,0.068663,0.0,0.0,0.0,0.068663,0.0,0.0,0.068663,0.0,0.0
2,0.0,0.03119,0.0,0.0,0.084509,0.0,0.0,0.0,0.0,0.084509,...,0.0,0.0,0.084509,0.0,0.0,0.084509,0.03119,0.0,0.084509,0.03119


#### Using the vectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the three raw sentences; the result has one row per
# sentence and one column per extracted term.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([sentence_4, sentence_5, sentence_6])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Convert the sparse matrix to plain nested lists for the DataFrame.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [15]:
df

Unnamed: 0,accompaniment,best,bit,can,caramel,cream,dessert,fantastic,for,funniest,...,song,strawberries,taste,tasty,the,they,think,to,without,would
0,0.0,0.0,0.271642,0.271642,0.0,0.0,0.0,0.0,0.0,0.271642,...,0.271642,0.0,0.0,0.0,0.320872,0.0,0.271642,0.0,0.271642,0.0
1,0.0,0.276458,0.0,0.0,0.0,0.210254,0.276458,0.0,0.276458,0.0,...,0.0,0.276458,0.210254,0.0,0.163281,0.276458,0.0,0.0,0.0,0.276458
2,0.328961,0.0,0.0,0.0,0.328961,0.250183,0.0,0.328961,0.0,0.0,...,0.0,0.0,0.250183,0.328961,0.19429,0.0,0.0,0.328961,0.0,0.0


* 4: 1.585, 1, 0, 1, 1.585, 0,0,0,0
* 5: 0,0,0,0,0, .585, 1, 1.585, 1
* 6: 0,0,0,0,0,0, 1, 0, 2

In [16]:
# Parse the three example sentences with spaCy.
# The model is loaded by its package name: the 'en' shortcut link only
# exists on spaCy 2.x, while 'en_core_web_sm' is the name the spaCy
# download step itself reports as loadable.
nlp = spacy.load('en_core_web_sm')
sentence4_doc = nlp(sentence_4)
sentence5_doc = nlp(sentence_5)
sentence6_doc = nlp(sentence_6)

In [17]:
# Load the English model by its package name (the 'en' shortcut link only
# exists on spaCy 2.x) and parse sentence 4.
nlp = spacy.load('en_core_web_sm')
sentence4_doc = nlp(sentence_4)

In [18]:
# Text of every token of sentence 4 that spaCy does not flag as punctuation.
sentences = [tok.text for tok in sentence4_doc if not tok.is_punct]

In [19]:
from sklearn.model_selection import train_test_split
vectorizer = TfidfVectorizer(stop_words='english')


# Apply the vectorizer. `sentences` holds the individual tokens of
# sentence 4, so every token is treated as its own document.
sentences_tfidf = vectorizer.fit_transform(sentences)
print("Number of features: %d" % sentences_tfidf.get_shape()[1])

# Reshape the vectorizer output into something people can read
sentences_tfidf_csr = sentences_tfidf.tocsr()

# number of documents (rows of the tf-idf matrix)
n = sentences_tfidf_csr.shape[0]
# a list of dictionaries, one per document
tfidf_bypara = [{} for _ in range(n)]
# list of features; get_feature_names() was removed in scikit-learn 1.2, so
# use its replacement when it exists and fall back on older releases
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()
# for each document, record its feature words and their tf-idf scores
for i, j in zip(*sentences_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = sentences_tfidf_csr[i, j]

# Only non-zero scores are stored, so an empty dict means no feature was
# extracted for that token (for example because it was dropped as an English
# stop word). A zero tf-idf score means the term is absent, not "present once".
print('Tf_idf vector:', tfidf_bypara)


Number of features: 9
Tf_idf vector: [{}, {'lumberjack': 1.0}, {'song': 1.0}, {}, {}, {'funniest': 1.0}, {'monty': 1.0}, {'python': 1.0}, {'bit': 1.0}, {}, {'ca': 1.0}, {}, {'think': 1.0}, {}, {}, {}, {'laughing': 1.0}]


In [20]:
# The three example sentences as a small corpus: one document per sentence.
corpus = [
    "The Lumberjack Song is the funniest Monty Python bit: I can't think of it without laughing.",
    "I would rather put strawberries on my ice cream for dessert, they have the best taste.",
    "The taste of caramel is a fantastic accompaniment to tasty mint ice cream.",
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists. list() keeps the printed output a plain Python list.
if hasattr(vectorizer, "get_feature_names_out"):
    print(list(vectorizer.get_feature_names_out()))
else:
    print(vectorizer.get_feature_names())

# (number of documents, number of extracted terms)
print(X.shape)


['accompaniment', 'best', 'bit', 'can', 'caramel', 'cream', 'dessert', 'fantastic', 'for', 'funniest', 'have', 'ice', 'is', 'it', 'laughing', 'lumberjack', 'mint', 'monty', 'my', 'of', 'on', 'put', 'python', 'rather', 'song', 'strawberries', 'taste', 'tasty', 'the', 'they', 'think', 'to', 'without', 'would']
(3, 34)


## 2. In the 2-grams example above, we only used 2-grams as our features. This time, use both 1-grams and 2-grams together as your feature set. Run the same models in the example and compare the results.

In [21]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
warnings.filterwarnings("ignore")

# Fetch the NLTK Gutenberg corpus used below.
nltk.download('gutenberg')
# IPython shell escape (not plain Python): downloads the spaCy English model.
# NOTE(review): the 'en' shortcut presumably only links on spaCy 2.x --
# confirm before running this on a newer spaCy.
!python -m spacy download en

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/sajithgowthaman/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/sajithgowthaman/opt/anaconda3/lib/python3.7/site-packages/en_core_web_sm
-->
/Users/sajithgowthaman/opt/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [22]:
# utility function for standard text cleaning
def text_cleaner(text):
    """Return ``text`` with dashes, bracketed notes and numbers removed.

    Steps, applied in order:

    1. replace every double dash ``--`` with a space;
    2. delete ``[...]`` spans (non-greedy; ``.`` does not cross newlines);
    3. replace standalone integers and decimals, together with any
       whitespace and minus sign directly before them, with a space;
    4. collapse every run of whitespace into one space and strip the ends.
    """
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string on purpose: "\[" in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    return ' '.join(text.split())

In [23]:
# Load each novel from the NLTK Gutenberg corpus, remove its chapter
# headings, then run the shared text cleaner.
# The chapter indicator is idiosyncratic: "Chapter <number>" in Persuasion,
# "CHAPTER" followed by the rest of the line in Alice.
persuasion = text_cleaner(
    re.sub(r'Chapter \d+', '', gutenberg.raw('austen-persuasion.txt'))
)
alice = text_cleaner(
    re.sub(r'CHAPTER .*', '', gutenberg.raw('carroll-alice.txt'))
)

In [24]:
# parse the cleaned novels. this can take a bit
# The model is loaded by its package name: the 'en' shortcut link only
# exists on spaCy 2.x, while 'en_core_web_sm' is the name the download step
# reports as loadable.
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [25]:
# label every spaCy sentence span with the author of its novel
alice_sents = [[span, "Carroll"] for span in alice_doc.sents]
persuasion_sents = [[span, "Austen"] for span in persuasion_doc.sents]

# stack both novels (Alice first) into a single two-column data frame
sentences = pd.DataFrame(
    data=alice_sents + persuasion_sents,
    columns=["text", "author"],
)
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [26]:
# Replace every parsed sentence with a space-joined string of its lemmas,
# leaving out punctuation tokens and stop words.
sentences["text"] = [
    " ".join(
        [tok.lemma_ for tok in span if not (tok.is_punct or tok.is_stop)]
    )
    for span in sentences["text"]
]

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2) extracts 1-grams and 2-grams together as features.
vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=2, use_idf=True, norm='l2', smooth_idf=True, ngram_range=(1, 2))


# applying the vectorizer
X = vectorizer.fit_transform(sentences["text"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# one column per feature, followed by the cleaned text and the author label
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
sentences = pd.concat([tfidf_df, sentences[["text", "author"]]], axis=1)

# A tf-idf score of 0 means the term does not occur in that sentence:
# scikit-learn's smoothed idf is ln((1 + n) / (1 + df)) + 1, which is never
# zero, so any term that is present gets a non-zero score.
sentences.head()

Unnamed: 0,abide,ability,able,able bear,able persuade,abominate,abroad,absence,absence home,absent,...,young people,young person,young sister,young woman,youth,youth say,zeal,zealous,text,author
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Alice begin tired sit sister bank have twice p...,Carroll
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,remarkable Alice think way hear Rabbit,Carroll
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,oh dear,Carroll
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,oh dear,Carroll


In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Target: the author label. Features: every tf-idf column.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument to drop().
Y = sentences['author']
X = np.array(sentences.drop(columns=['text', 'author']))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

# Report accuracy on the training and test sets for each fitted model.
for title, model in (
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
):
    print("----------------------%s Scores----------------------" % title)
    print('Training set score:', model.score(X_train, y_train))
    print('\nTest set score:', model.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9014162732574285

Test set score: 0.870887130362349
----------------------Random Forest Scores----------------------
Training set score: 0.964454318244932

Test set score: 0.870887130362349
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8625381838378229

Test set score: 0.8513119533527697


As can be seen above, using 1-grams together with 2-grams improved the performance of all the models compared with the 2-gram-only example in the checkpoint.