# Baseline Models

In [1]:
# imports

# dataframe
import pandas as pd

# sklearn

from sklearn.model_selection import train_test_split
# vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# cross validation
from sklearn.model_selection import KFold, cross_validate
# classification report
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
# baseline models
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression

# classification report
from tabulate import tabulate

In [2]:
dataset = pd.read_csv('../Dataset/Tweets.csv', encoding='ISO-8859-1')

### Preprocessing code from exploration.ipynb

In [3]:
# imports for preprocessing

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

nltk.download(['stopwords', 'wordnet', 'omw-1.4'], download_dir='.')
# print('before:', nltk.data.path)
if '.' not in nltk.data.path:
    nltk.data.path.append('.')
# print('after:', nltk.data.path)

# preprocessing text
import regex as re


# drop unneeded columns
dataset.drop(['textID', 'selected_text'], axis=1, inplace=True)


# don't remove some stop words that might make a difference in positive/negative classification
stop_words_keep = {'against', 'before', 'after', 'up', 'down', 'in', 'out', 'on', 'off', 
                    'no', 'nor', 'not', 'only', 'don\'t', 'aren\'t', 'couldn\'t', 'didn\'t', 
                    'doesn\'t', 'hadn\'t', 'hasn\'t', 'isn\'t', 'mightn\'t', 'mustn\'t', 
                    'needn\'t', 'shan\'t', 'shouldn\'t', 'wasn\'t', 'weren\'t', 'won\'t', 
                    'wouldn\'t'}

sw = stopwords.words("english")
stop_words = [w for w in sw if w not in stop_words_keep]


# preprocessing function

def preprocess(text, remove_apos_backtick=True, lemmatize=True, stem=True, rem_len_1=True, rem_stop_words=True):
    
    # create lemmatizer and stemmer.
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    
    # regex patterns.
    url_pattern         = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern        = '@[^\s]+'
    alpha_pattern       = "[^a-zA-Z0-9`\']" # keep back ticks (` used instead of ' in the dataset) and apostrophes (')
    sequence_pattern    = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"
    
    # lowercasing
    text = str(text).lower()
    
    # replace apostrophes and backticks with empty string
    if remove_apos_backtick:
        text = re.sub('[\'`]', '', text)

    # replace all URls with 'URL'
    text = re.sub(url_pattern, ' URL ',text)
         
    # replace @USERNAME to 'USER'.
    text = re.sub(user_pattern, ' USER ', text)        

    # replace all non letters, non numbers (except backticks and apostrophes)
    text = re.sub(alpha_pattern, ' ', text)

    # replace 3 or more consecutive characters by 2 of that character
    text = re.sub(sequence_pattern, seq_replace_pattern, text)

    preproc_text = ''

    # for each word in text
    for word in text.split():
        
        # ignore words of length 1
        if len(word) > 1 or not rem_len_1:

            # lemmatize
            if lemmatize:
                word = lemmatizer.lemmatize(word)

            # check if stopword
            if word not in stop_words or not rem_stop_words:

                # stem 
                if stem:
                    word = stemmer.stem(word)
                    
                preproc_text += (word + ' ')
        
    return preproc_text

[nltk_data] Downloading package stopwords to ....
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ....
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to ....
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# apply preprocessing to dataset 

dataset['text'] = dataset['text'].apply(lambda t: preprocess(t, stem=True))

### Apply bag-of-words and TF-IDF to generate new features

In [5]:
# split dataset
# remember, don't peek at (evaluate on) the test set!

X_train, X_test, y_train, y_test = train_test_split(dataset.drop(['sentiment'], axis='columns'), dataset['sentiment'], test_size=0.2, random_state=21)

In [6]:
# transform data text feature into new features using bag of words, tf-idf

count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# represent examples as the counts for each word they contain
bag_of_words = count_vectorizer.fit_transform(X_train['text'])

# represent examples as the tf-idf score for each word they contain
tfidf_features = tfidf_vectorizer.fit_transform(X_train['text'])

### K-fold cross validation and classification reports for baseline models

In [7]:
# set up for k-fold cross validation and metric gathering

scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision_neut' : make_scorer(precision_score, average=None, labels=['neutral']),
           'precision_pos'  : make_scorer(precision_score, average=None, labels=['positive']),
           'precision_neg'  : make_scorer(precision_score, average=None, labels=['negative']),
           'recall_neut'    : make_scorer(recall_score,    average=None, labels=['neutral']), 
           'recall_pos'     : make_scorer(recall_score,    average=None, labels=['positive']), 
           'recall_neg'     : make_scorer(recall_score,    average=None, labels=['negative']), 
           'f1_score_neut'  : make_scorer(f1_score,        average=None, labels=['neutral']),
           'f1_score_pos'   : make_scorer(f1_score,        average=None, labels=['positive']),
           'f1_score_neg'   : make_scorer(f1_score,        average=None, labels=['negative'])}

k_fold = KFold(n_splits=10)

In [8]:
# function to print out a classification report
# can't use normal classification_report since we want to display our values from cross validation
def custom_classif_report(metric_scores, just_acc=False):

  scores_df = pd.DataFrame(metric_scores)
  
  # average of each metric over the splits of cross validation
  scores_mean = scores_df.mean() 
  
  # overall averages for precision, recall, and f1 across class labels
  avg_precision = scores_mean[['test_precision_neut', 'test_precision_pos', 'test_precision_neg']].mean()
  avg_recall = scores_mean[['test_recall_neut', 'test_recall_pos', 'test_recall_neg']].mean()
  avg_f1 = scores_mean[['test_f1_score_neut', 'test_f1_score_pos', 'test_f1_score_neg']].mean()
  
  print('Classification Report:')

  # precision, recall, f1 metrics in table printable form
  metric_info = {
    'precision': [scores_mean['test_precision_neut'], scores_mean['test_precision_pos'], scores_mean['test_precision_neg'], avg_precision], 
    'recall': [scores_mean['test_recall_neut'], scores_mean['test_recall_pos'], scores_mean['test_recall_neg'], avg_recall], 
    'f1-score': [scores_mean['test_f1_score_neut'], scores_mean['test_f1_score_pos'], scores_mean['test_f1_score_neg'], avg_f1]}
  
  # print table for precision, recall, f1
  if not just_acc:
    print(tabulate(metric_info, headers='keys', tablefmt='fancy_grid', showindex=['neutral', 'positive', 'negative', 'average']))
  
  # print table for accuracy
  acc_info = {'accuracy': [scores_mean['test_accuracy']]}
  print(tabulate(acc_info, headers='keys', tablefmt='fancy_grid'))

In [9]:
mnb_model = MultinomialNB() 

print('Naive bayes with bag of words features ', end='')
scores = cross_validate(mnb_model, bag_of_words, y_train, cv=k_fold, scoring=scoring)
custom_classif_report(scores)

print('\nNaive bayes with tf-idf features ', end='')
scores = cross_validate(mnb_model, tfidf_features, y_train, cv=k_fold, scoring=scoring)
custom_classif_report(scores)

Naive bayes with bag of words features Classification Report:
╒══════════╤═════════════╤══════════╤════════════╕
│          │   precision │   recall │   f1-score │
╞══════════╪═════════════╪══════════╪════════════╡
│ neutral  │    0.601459 │ 0.665177 │   0.631585 │
├──────────┼─────────────┼──────────┼────────────┤
│ positive │    0.716753 │ 0.688228 │   0.702009 │
├──────────┼─────────────┼──────────┼────────────┤
│ negative │    0.673007 │ 0.599565 │   0.634062 │
├──────────┼─────────────┼──────────┼────────────┤
│ average  │    0.66374  │ 0.65099  │   0.655885 │
╘══════════╧═════════════╧══════════╧════════════╛
╒════════════╕
│   accuracy │
╞════════════╡
│   0.654021 │
╘════════════╛

Naive bayes with tf-idf features Classification Report:
╒══════════╤═════════════╤══════════╤════════════╕
│          │   precision │   recall │   f1-score │
╞══════════╪═════════════╪══════════╪════════════╡
│ neutral  │    0.537034 │ 0.824773 │   0.650333 │
├──────────┼─────────────┼──────────┼────

In [10]:
linear_svm_model = LinearSVC(max_iter=10000)

print('Linear SVM with bag of words features ', end='')
scores = cross_validate(linear_svm_model, bag_of_words, y_train, cv=k_fold, scoring=scoring)
custom_classif_report(scores)

print('\nLinear SVM with tf-idf features ', end='')
scores = cross_validate(linear_svm_model, tfidf_features, y_train, cv=k_fold, scoring=scoring)
custom_classif_report(scores)

Linear SVM with bag of words features Classification Report:
╒══════════╤═════════════╤══════════╤════════════╕
│          │   precision │   recall │   f1-score │
╞══════════╪═════════════╪══════════╪════════════╡
│ neutral  │    0.631889 │ 0.658896 │   0.644897 │
├──────────┼─────────────┼──────────┼────────────┤
│ positive │    0.723534 │ 0.71502  │   0.719116 │
├──────────┼─────────────┼──────────┼────────────┤
│ negative │    0.659781 │ 0.627325 │   0.642941 │
├──────────┼─────────────┼──────────┼────────────┤
│ average  │    0.671735 │ 0.66708  │   0.668985 │
╘══════════╧═════════════╧══════════╧════════════╛
╒════════════╕
│   accuracy │
╞════════════╡
│    0.66753 │
╘════════════╛

Linear SVM with tf-idf features Classification Report:
╒══════════╤═════════════╤══════════╤════════════╕
│          │   precision │   recall │   f1-score │
╞══════════╪═════════════╪══════════╪════════════╡
│ neutral  │    0.628901 │ 0.697245 │   0.661162 │
├──────────┼─────────────┼──────────┼──────

In [11]:
logreg_model = LogisticRegression(max_iter=1000)

print('Logistic regresssion with bag of words features ', end='')
scores = cross_validate(logreg_model, bag_of_words, y_train, cv=k_fold, scoring=scoring)
custom_classif_report(scores)

print('\nLogistic regresssion with tf-idf features ', end='')
scores = cross_validate(logreg_model, tfidf_features, y_train, cv=k_fold, scoring=scoring)
custom_classif_report(scores)

Logistic regresssion with bag of words features Classification Report:
╒══════════╤═════════════╤══════════╤════════════╕
│          │   precision │   recall │   f1-score │
╞══════════╪═════════════╪══════════╪════════════╡
│ neutral  │    0.643139 │ 0.726928 │   0.682219 │
├──────────┼─────────────┼──────────┼────────────┤
│ positive │    0.771864 │ 0.722245 │   0.746064 │
├──────────┼─────────────┼──────────┼────────────┤
│ negative │    0.706829 │ 0.624201 │   0.662644 │
├──────────┼─────────────┼──────────┼────────────┤
│ average  │    0.707277 │ 0.691125 │   0.696976 │
╘══════════╧═════════════╧══════════╧════════════╛
╒════════════╕
│   accuracy │
╞════════════╡
│   0.696507 │
╘════════════╛

Logistic regresssion with tf-idf features Classification Report:
╒══════════╤═════════════╤══════════╤════════════╕
│          │   precision │   recall │   f1-score │
╞══════════╪═════════════╪══════════╪════════════╡
│ neutral  │    0.621557 │ 0.773725 │   0.689164 │
├──────────┼───────────