In [8]:
# import the packages

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

In [6]:
nltk.download("wordnet")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /Users/sraaf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sraaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sraaf/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# import dataset

df = pd.read_csv('available_csv_file')

In [None]:
# Create 2 variables of the columns in the dataframe

inputs = df['text']
labels = df['labels']

In [None]:
# Check for imbalanced classes. Over or under-represented classes can be an issue when checking the models performance

labels.hist(figsize = (10, 5));

In [None]:
# Train test split 

inputs_train, inputs_test, Ytrain, Ytest = train_test_split(inputs, labels, random_state = 123)

In [None]:
# instintiate countvector object

vectorizer = CountVectorizer()

In [None]:
Xtrain = vectorizer.fit_transform(inputs_train)

In [None]:
Xtest = vectorizer.transform(inputs_test)     # We do not fit test data as it is meant to 
                                              # represent the data we want to apply the model to 

In [None]:
# what percentage of values are non-zero?

(Xtrain != 0).sum() / np.prod(Xtrain.shape)      # number of non-zero values divided by the shape of the dataframe

In [None]:
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))     # returns the accuracy
print("test score:", model.score(Xtest, Ytest))

In [None]:
# Including param stopwords

vectorizer = CountVectorizer(stop_words = 'english')
Xtrain = vectorizer.fit_transform(inputs_train)
Xtest = vectorizer.transform(inputs_test)  
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

In [None]:
# Create function for mapping POS tags in nltk

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return word_net.NOUN

In [None]:
# create class for tokenizing and lemmatizing

class LemmaTokenizer:     # create an object
    def __init__(self):       
        self.wnl = WordNetLemmatizer()             # instantiate a word net lemmatizer object
    def __call__(self, doc):                       # define the call function that takes the document as argument
        tokens = word_tokenize(doc)
        words_and_tags = nltk.pos_tags(tokens)     # Convert document into tokens
        return [self.wnl.lemmatize(word, pos = get_wordnet_pos(tag)) \
               for word, tag in words_and_tags]    # obtain parts of speech tags and return list of tuples containing each word and corresponding tag

In [None]:
# Run model with created object
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer)
Xtrain = vectorizer.fit_transform(inputs_train)
Xtest = vectorizer.transform(inputs_test)  
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

In [None]:
class StemTokenizer:     # create an object
    def __init__(self):       
        self.porter = PorterStemmer()             # instantiate a word net lemmatizer object
    def __call__(self, doc):                      # define the call function that takes the document as argument
        tokens = word_tokenize(doc)               
        return [self.porter.stem(t) for t in tokens]    # returns list of stemmed tokens

In [None]:
# Run model with created object
vectorizer = CountVectorizer(tokenizer = StemTokenizer)
Xtrain = vectorizer.fit_transform(inputs_train)
Xtest = vectorizer.transform(inputs_test)  
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

In [None]:
def simple_tokenizer(s):
    return s.split()

In [None]:
# Run model with created object
vectorizer = CountVectorizer(tokenizer = simple_tokenizer)
Xtrain = vectorizer.fit_transform(inputs_train)
Xtest = vectorizer.transform(inputs_test)  
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))