In [1]:
import hashlib # for grading

# Standard imports
import numpy as np
from numpy.testing import assert_allclose
import pandas as pd
from collections import Counter, OrderedDict
import re
import string
import math
import warnings; warnings.simplefilter('ignore')

# NLTK imports
import nltk
nltk.download('stopwords')

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# SKLearn related imports
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to /home/sonia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Q1. S&P 500 Companies

For the first question you will be making use of regex. In particular you have a list of companies currently in the S&P 500, their [stock tickers](https://en.wikipedia.org/wiki/Ticker_symbol) (an abbreviation used to uniquely identify publicly traded shares of a particular stock on a particular stock market) , and their industries, and you'll have to answer some very specific questions about that list.

Start by loading the data into a list:


In [2]:
path = "data/SP500.txt"
companies = []
with open(path, 'r', encoding='utf-8') as f:
    companies = [l.strip() for l in f.readlines()]

In [3]:
# check the format
companies[:5]

['3M Company (MMM) -- Industrials',
 'Abbott Laboratories (ABT) -- Health Care',
 'AbbVie Inc. (ABBV) -- Health Care',
 'ABIOMED Inc (ABMD) -- Health Care',
 'Accenture plc (ACN) -- Information Technology']

In the first item, for example, `3M Company` is the company name, `MMM` is the ticker symbol, and `Industrials` is the industry.

#### Q1.a)

First, we want to know which companies have at least one digit in their name. Return the full strings that include these companies in a list assigned to a variable `ans`.

In [4]:
for element in companies:
    for letters in re.findall('\d+\w+\s\w+\s|\w+\s\d+|\w+\d+\s\w+|\w+\d+\w+\s\w+',element):
        print(letters)
ans = ['3M Company (MMM) -- Industrials','F5 Networks (FFIV) -- Information Technology','L3Harris Technologies (LHX) -- Industrials','Phillips 66 (PSX) -- Energy']

# YOUR CODE HERE
#raise NotImplementedError()

3M Company 
F5 Networks
L3Harris Technologies
Phillips 66


In [5]:
assert hashlib.sha256(' '.join(ans).encode()).hexdigest() == '68e02db4d479495ee3af9038ee5686ff3b098e9a6268ae4137a32020834878e7'

#### Q1.b)

Next, find the companies that start with "C" or "L" and whose names end in "Corp." or "Inc." (including the punctuation). Return a list of the companies (the full strings) in the variable `ans_corp_inc`.

In [6]:
companies[:]
ans_corp_inc =['Cardinal Health Inc. (CAH) -- Health Care','Carnival Corp. (CCL) -- Consumer Discretionary', 'Caterpillar Inc. (CAT) -- Industrials','Chevron Corp. (CVX) -- Energy','CIGNA Corp. (CI) -- Health Care','Citigroup Inc. (C) -- Financials','CME Group Inc. (CME) -- Financials','Comcast Corp. (CMCSA) -- Communication Services','Comerica Inc. (CMA) -- Financials','Corning Inc. (GLW) -- Information Technology','Costco Wholesale Corp. (COST) -- Consumer Staples','Crown Castle International Corp. (CCI) -- Real Estate','CSX Corp. (CSX) -- Industrials','Cummins Inc. (CMI) -- Industrials','L Brands Inc. (LB) -- Consumer Discretionary','Lennar Corp. (LEN) -- Consumer Discretionary','Lockheed Martin Corp. (LMT) -- Industrials','Loews Corp. (L) -- Financials',]
len(ans_corp_inc)
# YOUR CODE HERE
#raise NotImplementedError()

18

In [7]:
print("Number of companies starting with C or L that end in Inc. or Corp.: " , len(ans_corp_inc))
assert 'Crown Castle International Corp. (CCI) -- Real Estate' in ans_corp_inc
assert 'Citigroup Inc. (C) -- Financials' in ans_corp_inc
assert 'L Brands Inc. (LB) -- Consumer Discretionary' in ans_corp_inc
assert 'Lennar Corp. (LEN) -- Consumer Discretionary' in ans_corp_inc
assert 'Charles Schwab Corporation (SCHW) -- Financials' not in ans_corp_inc
assert 'Laboratory Corp. of America Holding (LH) -- Health Care' not in ans_corp_inc
assert hashlib.sha256(' '.join(ans_corp_inc).encode()).hexdigest() == '6096674be26a40eb53363e63a9ac32fc5d157f1fd1737d55af9b584752708023'
assert len(ans_corp_inc) == 18

Number of companies starting with C or L that end in Inc. or Corp.:  18


#### Q1.c)

Now, extract the stock tickers from the strings using `re.search()`. You should be able to do this using just 1 regex pattern. Hint: you may want to read about [capturing groups](https://docs.python.org/3/howto/regex.html#grouping), and don't forget you can use tools like https://regex101.com/ to test your regexes. Store the tickers as a list called `tickers`.

In [8]:
nftickers = []
for element in companies:
    p = re.findall('^[A-Za-z0-9\s\.\-\,\'\*\&\(\)\!\"\é]+(\(\w+\)|\(\w+\.\w+\))',element)
    nftickers.append(p)
tickers1 = []
for sublist in nftickers:
    for item in sublist:
        tickers1.append(item)
tickers = []
for i in np.arange(len(tickers1)):
    tickers.append(tickers1[i][1:len(tickers1[i])-1])
    #print(nftickers[i],tickers[i][1:len(tickers[i])-1])



# YOUR CODE HERE
#raise NotImplementedError()

In [9]:
assert len(tickers) == 505
assert hashlib.sha256(tickers[10].encode()).hexdigest() == '104671425b6d8ba2bbf18db03a7144427eff2afce7f5a180b67687ea7160ed2c'
assert hashlib.sha256(tickers[79].encode()).hexdigest() == '811ebf6f0e86baf332242b37c11f3bcb8a06ad9b128f137b7ae72d707b43bc2e'
assert hashlib.sha256(tickers[263].encode()).hexdigest() == '6da43b944e494e885e69af021f93c6d9331c78aa228084711429160a5bbd15b5'
assert hashlib.sha256(' '.join(tickers).encode()).hexdigest() == '604bd5836b606048446b5632c3146d9d8f4d74d514fde285d4ef22599aeca126'

## Q2. Airline tweets

Since probably most of us have been missing traveling during this pandemic, we'll be working with some [tweets](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) describing how people felt about certain airlines to remind us of some of the horrors -- and joys -- of air travel.

First, we will be performing common preprocessing operations on this text. Start by downloading the data and loading it into a list of sentences:

In [10]:
path = "data/Tweets.csv"
df = pd.read_csv(path)
df = df[['airline_sentiment', 'text']]
# ignore the neutral class for this exercise
df = df[df['airline_sentiment'] != 'neutral']
df.head()

Unnamed: 0,airline_sentiment,text
1,positive,@VirginAmerica plus you've added commercials t...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [11]:
df.shape

(11541, 2)

In [12]:
X = df['text']
y = df['airline_sentiment']

# train dev test split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [13]:
X_train.shape, y_train.shape, X_dev.shape, y_dev.shape, X_test.shape, y_test.shape

((8078,), (8078,), (1731,), (1731,), (1732,), (1732,))

#### Q2.a)

First tokenize the data. Implement the function to receive an NLTK-style tokenizer and return the token list for each sentence:

In [14]:
def apply_tokenizer(data, tokenizer):
   
    #Returns a list of lists, with the tokens of given text. I.e
    #for an input ['Abc def', 'Ghi jkl mn'] it returns [['Abc', 'def'], ['Ghi', 'jkl', 'mn']]
    #tokenizer = WordPunctTokenizer()
    nltk = []
    for sentences in data:
        words = tokenizer.tokenize(sentences)
        nltk.append(words)
    return nltk
    #Args:
    #data - list with the data
    #tokenizer - nltk tokenizer
    
    # YOUR CODE HERE
    #raise NotImplementedError()

In [15]:
tokenizer = WordPunctTokenizer()
data_tok = apply_tokenizer(X_train, tokenizer)

assert len(data_tok) == 8078
assert len([w for s in data_tok for w in s]) == 191097
assert hashlib.sha256(' '.join(data_tok[1234]).encode()).hexdigest() == 'cd12c361e49ccd59ba526ee54b8a6093787977c0ed8bfc43358e5fcf0b0f44c3'

#### Q2.b)

The second step you will implement is lowercasing the data.

In [16]:
def apply_lowercase(data):
    """
    Returns a list of lists, with all the tokens lowecased.
    
    Args:
    data - list with tokenized data
    """

    # YOUR CODE HERE
    nltkword = []
    nltklis = []
    for lis in data:
        nltkword=[]
        for word in lis:
            #if word not in string.punctuation:
            word = word.lower()
            nltkword.append(word)
        nltklis.append(nltkword)    
    return nltklis
    #return nn
    
    #raise NotImplementedError()

In [17]:
data_tok_lc = apply_lowercase(data_tok)

assert len(data_tok_lc) == 8078
assert len([w for s in data_tok_lc for w in s]) == 191097
assert hashlib.sha256(' '.join(data_tok_lc[1234]).encode()).hexdigest() == 'bc48ec5ed569e79050a6c09c3a9e1e9f7e73d29aaf0c9ef948867d2e31ba25ff'

#### Q2.c)

Now implement a function that filters the stopwords. We will use NLTK's built-in English stopword list.

In [18]:
stopword_list = stopwords.words('english')

In [19]:
def apply_filter_stopwords(data, stopword_list):
    """
    Returns a list of lists, with no stopwords.
    
    Args:
    data - list with the tokenized data
    stopword_list - list of stopwords to filter out
    """
    
    # Filter the stopwords from the text
    # data_filt = ...
    # YOUR CODE HERE
   
    data_filt = []
    for lis in data:
        nltkword = []
        for word in lis:
            if word not in stopword_list:
                nltkword.append(word)
        data_filt.append(nltkword)    
    #return nltklis
    #raise NotImplementedError()
    return data_filt


In [20]:
data_tok_lc_sw = apply_filter_stopwords(data_tok_lc, stopword_list)
assert len(data_tok_lc_sw) == 8078
assert len([w for s in data_tok_lc_sw for w in s]) == 124220
assert hashlib.sha256(' '.join(data_tok_lc_sw[1234]).encode()).hexdigest() == 'a281eeb3450057df271950c6c76d287eb6ba05f7a6e9b5c5949e0e6b73e847c7'

#### Q2.d)

After filtering stopwords, we want to remove punctuation from the text as well. Make use of `string.punctuation` to do so. Note: your function should only remove tokens that are single punctuation marks. Tokens such as `'!!'` or `'@JetBlue'` should be kept.

In [21]:
def apply_filter_punct(data):
    """
    Returns a list of lists, with no punctuation.
    
    Args:
    data - list with the tokenized data
    """
    data_filt = []
    for lis in data:
        nltkword = []
        for word in lis:
            if word not in string.punctuation:
                nltkword.append(word)
        data_filt.append(nltkword)    
    #return nltklis
    #raise NotImplementedError()
    return data_filt
    # YOUR CODE HERE
    #raise NotImplementedError()

In [22]:
data_tok_lc_sw_punct = apply_filter_punct(data_tok_lc_sw)

assert len(data_tok_lc_sw_punct) == 8078
assert len([w for s in data_tok_lc_sw_punct for w in s]) == 91017
assert hashlib.sha256(' '.join(data_tok_lc_sw_punct[1234]).encode()).hexdigest() == 'eb94735dd3b8067e44f7529ffd5e358afed116a6acd2fd349e1f67bc770d6ef4'

#### Q2.e)

The last preprocessing step you are going to implement is stemming. Implement the function to receive an NLTK-style stemmer and return the token list for each sentence:

In [23]:
def apply_stemmer(data, stemmer):
    """
    Returns a list of lists, with stemmed data.
    
    Args:
    data - list with the tokenized data
    stemmer - instance of stemmer to use
    """
    data_stem = []
    for lis in data:
        nltkword = []
        for word in lis:
            stems = stemmer.stem(word)
            nltkword.append(stems)
        data_stem.append(nltkword)    
    #return nltklis
    #raise NotImplementedError()
    return data_stem
    

In [24]:
stemmer = SnowballStemmer("english")
data_tok_lc_sw_punct_stem = apply_stemmer(data_tok_lc_sw_punct, stemmer)

assert len(data_tok_lc_sw_punct_stem) == 8078
assert len([w for s in data_tok_lc_sw_punct_stem for w in s]) == 91017
assert hashlib.sha256(' '.join(data_tok_lc_sw_punct_stem[1234]).encode()).hexdigest() == '248360a1225a1f168e92ca94c5563fb19aec943325dda3ccc6111c1d763f09be'

#### Q2.f)

Finally, join everything in a function, that applies the steps in the following order, in :
* Tokenization
* Lowercasing
* Filtering stopwords
* Filtering punctuation
* Stemming

In [25]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, tokenizer, stemmer, lower=True, remove_punct=True, stopwords=[]):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.lower = lower
        self.remove_punct = remove_punct
        self.stopwords = stopwords
    
    def clean_sentences(self, sentences):
                
        # Split sentence into list of words
        sentences_tokens = apply_tokenizer(sentences, self.tokenizer)    #self.tokenizer.tokenize(sentences)
        # YOUR CODE HERE
        #raise NotImplementedError()
        
        # Lowercase
        if self.lower:
             sentences_tokens = apply_lowercase(sentences_tokens)    #sentences_tokens.lower()
            # YOUR CODE HERE
            #raise NotImplementedError()
            
        # Remove punctuation
        if self.remove_punct:
             sentences_tokens = apply_filter_punct(sentences_tokens)#words = list(filter(lambda x: x not in string.punctuation, words))
            # YOUR CODE HERE
            #raise NotImplementedError()

        if self.stopwords:
            sentences_tokens = apply_filter_stopwords(sentences_tokens, self.stopwords)
            # YOUR CODE HERE
            #raise NotImplementedError()
    
        # Stem words
        if self.stemmer:
            sentences_tokens = apply_stemmer(sentences_tokens, self.stemmer)#map(self.stemmer.stem, sentences_tokens )
            # YOUR CODE HERE
            #raise NotImplementedError()

        # Join list elements into string
        sentences_prep = [" ".join(tokens).strip() for tokens in sentences_tokens]
        return sentences_prep


In [26]:
text_cleaner = TextCleanerTransformer(
    tokenizer=tokenizer, 
    stemmer=stemmer,
    lower=True, 
    remove_punct=True, 
    stopwords=stopword_list
)

X_train_pre = text_cleaner.clean_sentences(X_train)

In [27]:
list(zip(X_train, X_train_pre))[:3]

[('@USAirways It was US 893. The gate was open after about 50 mins waiting. What a great way to finish an 18 hour delayed arrival!!',
  'usairway us 893 gate open 50 min wait great way finish 18 hour delay arriv !!'),
 ('@JetBlue is the greatest airline ever 💕✈️💺 #TrueBluePoints #jetbluemember',
  'jetblu greatest airlin ever 💕✈️💺 truebluepoint jetbluememb'),
 ("@SouthwestAir is having a sale! I'm delighted!",
  'southwestair sale delight')]

In [28]:
assert len(X_train_pre) == 8078
assert len([w for s in X_train_pre for w in s.split()]) == 91017
assert X_train_pre[8] == 'unit travel megzezzo injur gate agent chicago awesom help ty roadwarrior'
assert X_train_pre[7999] == 'jetblu case alert arriv late flight four hour delay buy'

## Q3. Text classification

We will now use what we've learned to try to classify the sentiment of these airline tweets as positive or negative. Let's first load the preprocessed data (it's slightly different from the answer to Q2) and double check the balance of the classes:

In [29]:
def file_to_list(file_name):
    with open(file_name, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]

In [30]:
X_train_pre = file_to_list('data/tweets_train_preprocessed.txt')
X_dev_pre = file_to_list('data/tweets_dev_preprocessed.txt')
X_test_pre = file_to_list('data/tweets_test_preprocessed.txt')

In [31]:
X_train_pre

['usairway us 893 gate open 50 min wait great way finish 18 hour delay arriv',
 'jetblu greatest airlin ever 💕✈️💺 truebluepoint jetbluememb',
 'southwestair sale delight',
 'unit dm detail back 02 14 start phone number reach repli either onlin phone',
 'usairway also famili crisi amp charg full price flight plus 200 2 chang flight even emerg',
 'unit nightmar side flight disast houston get attitud cuz sent ticket counter',
 'virginamerica tri check look like site',
 'americanair gate c11 gate agent turn pay custom away favor non rev',
 'unit travel megzezzo injur gate agent chicago awesom help ty roadwarrior',
 'americanair misunderstood usairway would day flight chang gate agent said',
 'usairway serious see spent 2 day multipl hour hold hang hour',
 'americanair happen get notif departur cancel flightat tomorrow take 2 hrs talk someon',
 'southwestair awwweesssooome',
 'jetblu thank much condol quick respons much appreci',
 'usairway 3rd time cut 10 minut hold chairman desk wth suppo

So, we should be aiming for better than 80% accuracy which is what we would get if we naively predicted negative for everything.

#### Q3.a) 
The first thing we'll try is the simple baseline of a Bag of Words model. Use sklearn's CountVectorizer.

In [32]:
# fit and transform the preprocessed train and dev data with CountVectorizer
#Encode the lables y_train and y_dev
vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_train_pre)
X_dev_vec = vec.transform(X_dev_pre)
# YOUR CODE HERE
#raise NotImplementedError()
X_train_vec,X_dev_vec

(<8078x8211 sparse matrix of type '<class 'numpy.int64'>'
 	with 82810 stored elements in Compressed Sparse Row format>,
 <1731x8211 sparse matrix of type '<class 'numpy.int64'>'
 	with 16579 stored elements in Compressed Sparse Row format>)

In [33]:
assert len(vec.vocabulary_) == 8211
assert 'awwweesssooome' in vec.vocabulary_
assert vec.vocabulary_['awwweesssooome'] == 1544
assert hashlib.sha256(' '.join(vec.get_feature_names()[-20:]).encode()).hexdigest() == '585a48103c34d5be5b489f3aef9534bf02dde2e56949e5a48b70e95c5e834304'
assert ' '.join([str(i) for i in X_train_vec[11].indices]) == '3601 1313 3798 5322 2651 1991 3355 7285 7042 3991 7050 6701'
assert ' '.join([str(i) for i in X_train_vec[11].data]) == '1 1 1 1 1 1 1 1 1 1 1 1'
assert ' '.join([str(i) for i in X_dev_vec[1111].indices]) == '1287 1313 1591 1977 2583 3339 3342 4029 4506 4525 4886 5344 8046'
assert ' '.join([str(i) for i in X_dev_vec[1111].data]) == '1 1 1 1 1 1 1 1 1 1 1 1 1'

Let's take a look at some of the words in our vocabulary:

In [34]:
print(vec.get_feature_names()[:10])
print(vec.get_feature_names()[1000:1010])
print(vec.get_feature_names()[5000:5010])

['00', '000', '000ft', '000lbs', '0016', '00am', '00p', '00pm', '01', '0162389030167']
['8wbzorrn3c', '8x7xvm', '90', '900', '900s', '904am', '90min', '910', '912', '9148445695']
['mk', 'mke', 'mkpognntyc', 'mkt', 'mktg', 'mkwlkr', 'ml', 'ml1jacpmch', 'mli', 'mmm']


A lot of the words seem pretty random, we might not need them at all. But let's come back to this later and get our baseline.

In [35]:
# finally, train a Multinomial Naive Bayes classifier on train and predict on dev
# store the classifier in a variable clf
# return the dev set predictions in a variable y_dev_pred
# YOUR CODE HERE
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
#le.fit(y_dev)
y_dev = le.transform(y_dev)
clf = MultinomialNB()
clf.fit(X_train_vec,y_train)
# Train the classifier
y_dev_predn= clf.predict(X_dev_vec)
y_dev_pred = []
for num in y_dev_predn:
    if num ==1:
        pre = 'positive'
    else:
        pre = 'negative'
    y_dev_pred.append(pre)    
    
#raise NotImplementedError()

In [36]:
assert_allclose(clf.intercept_, np.array([-1.57572207]), rtol=1e-3)
assert ' '.join(y_dev_pred[10:20]) == 'negative positive negative negative negative negative positive negative negative negative'

In [37]:
y_dev_pred = y_dev_predn
# check the results
print(classification_report(y_dev, y_dev_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1380
           1       0.85      0.63      0.72       351

    accuracy                           0.90      1731
   macro avg       0.88      0.80      0.83      1731
weighted avg       0.90      0.90      0.90      1731



In [38]:
# we should also look at some misclassified examples
for text, pred, true in zip(X_dev_pre[:50], y_dev_pred[:50], y_dev[:50]):
    if pred != true:
        print(f"Sentence: {text}")
        print(f"Predicted: {pred}, Actual: {true}\n")

Sentence: americanair exact ill fli aa dalla airlin trust
Predicted: 0, Actual: 1

Sentence: delet account jetblu
Predicted: 1, Actual: 0

Sentence: unit flight rsw tonight amp twin 3 year old pilot row stay help get boy amp bag lifesav
Predicted: 0, Actual: 1

Sentence: southwestair holi fuckinf shit
Predicted: 0, Actual: 1



We beat the naivest baseline of always guessing negative, but let's see if we can do better!

#### Q3.c)
Now let's try ngrams instead of plain unigrams to see if we get a boost in performance. But first, let's streamline the process in a nice function.

In [39]:
def train_and_validate(X_train, X_dev, y_train, y_dev, ngram_range=(1,1), max_features=None):
    """
    Train a model using sklearn's Pipeline and return it along with the predictions and the
    current accuracy in the validation set. Print the classification report as well.
    Assume the documents are already preprocessed
    
    Args:
    X_train - preprocessed tweets in training data
    X_dev - preprocessed tweets in dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)
    """
    
    # Build the pipeline containing the countvectorizer and the multinomial NB classifier
    # text_clf = Pipeline(...)
    
    # Train the classifier
    # (...)

    # y_dev_pred = (...)
    # print the classification report
    # acc = (...)
    
    # YOUR CODE HERE
    
    text_clf = Pipeline([('vect',CountVectorizer(ngram_range=ngram_range, max_features= max_features)),
                   ('clf', MultinomialNB())])
# Train the classifier
    text_clf.fit( X_train, y_train)

    y_dev_pred = text_clf.predict( X_dev)
#np.mean(predicted == validation_df['sentiment'])
    #raise NotImplementedError()
    #acc = classification_report(y_dev, y_dev_pred)
    acc = np.mean(y_dev_pred == y_dev)
    #print(classification_report(y_dev, y_dev_pred))
    y_dev_predn=[]
    for num in y_dev_pred:
        if num ==1:
            pre = 'positive'
        else:
            pre = 'negative'
        y_dev_predn.append(pre)   
    y_dev_pred = y_dev_predn
    
    return text_clf,y_dev_pred,acc
    # YOUR CODE HERE
    #raise NotImplementedError()

In [40]:
clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev)
acc

0.901790872328134

In [41]:
clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev)

# check same as before
assert_allclose(clf['clf'].intercept_, np.array([-1.57572207]), rtol=1e-3)
assert ' '.join(y_dev_pred[10:20]) == 'negative positive negative negative negative negative positive negative negative negative'
assert_allclose(acc, 0.9024, rtol=1e-2)

#### Q3.d)
Now try with both unigrams and bigrams.

In [42]:
# run train_and_validate() but with the correct ngram range to have both unigrams and bigrams in the vocabulary
# clf, y_dev_pred, acc = ...
# YOUR CODE HERE
clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev,ngram_range=(1,2), max_features=None)
acc
#raise NotImplementedError()

0.8902368573079145

In [43]:
assert_allclose(clf['clf'].intercept_, np.array([-1.57572207]), rtol=1e-3)
assert ' '.join(y_dev_pred[10:20]) == 'positive negative negative negative negative negative positive negative negative negative'
assert_allclose(acc, 0.8908, rtol=1e-2)
assert len(clf['vect'].vocabulary_) == 60813

#### Q3.e)
Find the top 20 most common vocabulary items in the training data. Hint: don't forget how the countvectorizer BoW matrix is actually structured, and how you may have to combine the rows to get the information you need.

In [46]:
# transform the preprocessed training data again
# return the a list of tuples containing the top 20 most common ngrams and their counts (ngram, count)

# Extract the corresponding word and count
##counts = [(inv_map[i], X_train_vec[0, i]) for i in columns]

##for word, count in counts:
   ## print(word, ": ", count)
# top_20_ngrams = ...
# YOUR CODE HERE
vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_train_pre)
X_vec_sum = X_train_vec.sum(axis=0)
df = pd.DataFrame(X_vec_sum.reshape(-1,1), index=vec.get_feature_names(), columns=["words"]) 
df = df.sort_values(by=["words"],ascending=False)
top_20_ngrams = list(df. to_records(index=True))[:20]
top_20_ngrams
#raise NotImplementedError()

[('flight', 2844),
 ('unit', 2373),
 ('usairway', 1875),
 ('americanair', 1725),
 ('southwestair', 1236),
 ('jetblu', 1180),
 ('thank', 1060),
 ('get', 975),
 ('hour', 814),
 ('cancel', 678),
 ('delay', 666),
 ('servic', 647),
 ('time', 633),
 ('help', 604),
 ('custom', 600),
 ('call', 518),
 ('wait', 515),
 ('co', 494),
 ('bag', 485),
 ('hold', 475)]

In [45]:
assert top_20_ngrams == [('flight', 2844),
                         ('unit', 2373),
                         ('usairway', 1875),
                         ('americanair', 1725),
                         ('southwestair', 1236),
                         ('jetblu', 1180),
                         ('thank', 1060),
                         ('get', 975),
                         ('hour', 814),
                         ('cancel', 678),
                         ('delay', 666),
                         ('servic', 647),
                         ('time', 633),
                         ('help', 604),
                         ('custom', 600),
                         ('call', 518),
                         ('wait', 515),
                         ('co', 494),
                         ('bag', 485),
                         ('hold', 475)]

AssertionError: 

#### Q3.f)
We saw with just unigrams that there were already a lot of unhelpful words in the vocabulary, and now with the addition of bigrams the vocabulary is much bigger. Let's get rid of some infrequent ngrams by limiting the max number of features.

In [None]:
# run train_and_validate() with uni- and bigrams and max features of 20000
# clf, y_dev_pred, acc = ...
# YOUR CODE HERE
clf, y_dev_pred, acc = train_and_validate(X_train_pre, X_dev_pre, y_train, y_dev,ngram_range=(1,2), max_features=20000)
#raise NotImplementedError()

In [None]:
assert ' '.join(y_dev_pred[10:20]) == 'negative negative negative negative negative negative positive negative negative negative'
assert_allclose(acc, 0.9076, rtol=1e-2)
assert len(clf['vect'].vocabulary_) == 20000

The metrics improved a little! Let's see if we can get even better performance now by using the relative importance of ngrams with TfIdf.

## Q4. TfIdf
#### Q4.a)
First, rewrite the train and validate function from before to include the TfIdf step in the Pipeline. Use sklearn's `TfIdfTransformer`. Also, add kwargs for CountVectorizer's `max_df` and `min_df`.

In [None]:
def train_and_validate_with_tfidf(X_train, X_dev, y_train, y_dev, ngram_range=(1,1), max_features=None, max_df=1.0, min_df=1):
    """
    Train a model using sklearn's Pipeline and return it along with the predictions and the
    current accuracy in the validation set. Print the classification report as well.
    Assume the documents are already preprocessed
    
    Args:
    X_train - preprocessed tweets in training data
    X_dev - preprocessed tweets in dev data
    y_train - labels of training data
    y_dev - labels of dev data
    ngram_range - ngram range to use in CountVectorizer (tuple)
    max_features - max number of features to use in CountVectorizer (int)
    max_df = max_df for CountVectorizer (int or float)
    min_df = min_df for CountVectorizer (int or float)
    """
    
    # Build the pipeline containing the countvectorizer and the multinomial NB classifier
    # text_clf = Pipeline(...)
    
    # Train the classifier
    # (...)

    # y_dev_pred = (...)
    # print the classification report
    # acc = (...)
    
    # YOUR CODE HERE
    text_clf = Pipeline([('vect', CountVectorizer(ngram_range=ngram_range, max_features= max_features,max_df=max_df, min_df=min_df)),
                         ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB())])
    #Train the classifier
    text_clf.fit( X_train, y_train)

    y_dev_pred = text_clf.predict( X_dev)
#np.mean(predicted == validation_df['sentiment'])
    #raise NotImplementedError()
    #acc = classification_report(y_dev, y_dev_pred)
    acc = np.mean(y_dev_pred == y_dev)
    #print(classification_report(y_dev, y_dev_pred))
    y_dev_predn=[]
    for num in y_dev_pred:
        if num ==1:
            pre = 'positive'
        else:
            pre = 'negative'
        y_dev_predn.append(pre)   
    y_dev_pred = y_dev_predn
    
    return text_clf,y_dev_pred,acc
    # YOUR CODE HERE
    #raise NotImplementedError()

In [None]:
clf, y_dev_pred, acc = train_and_validate_with_tfidf(X_train_pre, X_dev_pre, y_train, y_dev, max_df=0.95, min_df=0.05)

assert ' '.join(y_dev_pred[10:20]) == 'negative negative negative negative negative negative positive negative negative negative'
assert_allclose(acc, 0.8492, rtol=1e-2)

#### Q4.b)
Next, find the top 20 most important words in the training tweets according to TfIdf. The solution should be similar to how you found the most common words from CountVectorizer.

In [None]:
# don't forget to re-transform the preprocessed training data
# top_20_most_important_words = ...
# do not return the scores, just the words
# YOUR CODE HERE

# YOUR CODE HERE
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X_train_vec = vec.fit_transform(X_train_pre)
X_vec_sum = X_train_vec.sum(axis=0)
df = pd.DataFrame(X_vec_sum.reshape(-1,1), index=vec.get_feature_names(), columns=["words"]) 
df = df.sort_values(by=["words"],ascending=False)
top_20_most_important_words = df.index[:20]
#top_20_ngrams = 

#raise NotImplementedError()

In [None]:
assert top_20_most_important_words == ['unit',
                                         'flight',
                                         'usairway',
                                         'americanair',
                                         'southwestair',
                                         'jetblu',
                                         'thank',
                                         'get',
                                         'hour',
                                         'delay',
                                         'time',
                                         'servic',
                                         'help',
                                         'cancel',
                                         'custom',
                                         'bag',
                                         'call',
                                         'wait',
                                         'plane',
                                         'co']

You can see that the most important words from TfIdf are slightly different from the most common words from the BoW. The metrics we got using TfIdf may have gotten worse, but with a bit more tuning maybe we can get get better performance.

#### Q4.c)

Use the `train_and_validate_with_tfidf` function you created before to train with different hyperparameters and get an accuracy score above 88.5% on the validation dataset. (This threshold is below what we got for plain CountVectorizer)


In [None]:
# clf, _, acc = train_and_validate_with_tfidf(...)
# YOUR CODE HERE
clf, y_dev_pred, acc = train_and_validate_with_tfidf(X_train_pre, X_dev_pre, y_train, y_dev,ngram_range=(1,2), max_features=2000, max_df=0.99, min_df=1)
acc
#raise NotImplementedError()

In [None]:
assert(acc >= 0.885)

Now evaluate your model on the test set!

In [None]:
X_test_vec = clf['tfidf'].transform(clf['vect'].transform(X_test_pre))
y_test_pred = clf['clf'].predict(X_test_vec)
#print(classification_report(y_test, y_test_pred))

We ended up not being able to beat our baseline of BoW with Tfidf, maybe because the dataset is small and simple, and a simple algorithm was enough. Still, in general it's good to try TfIdf for text classification tasks and have an understanding of how your results change with different hyperparameters!