# First Kaggle Competition
This is my first Kaggle competition.  In lieu of a final examination for our Oregon State University machine learning course (CS-434), we have been given the opportunity to participate in this competition.  This is also my first Notebook, so please feel free to give me pointers.

For this competition I've tried to employ two methods utilizing scikit-learn (sklearn) Multinomial Naive Bayes and Support Vector Machine (SVM).  I will try to reproduce the code in this notebook.

# Acknowledgements
* OSU CS-434 Instructor and TAs, a lot of this code comes from sample code provided in class.
* scikit-learn, https://scikit-learn.org/stable/about.html#citing-scikit-learn
* Raenish David, https://www.kaggle.com/raenish/cheatsheet-text-helper-functions
* Article: https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34, written by Gunjit Bedi, entitled "A guide to Text Classification (NLP) using SVM and Naive Bayes with Python" on November 9, 2018

# Input

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Echo the full path of every file found beneath the read-only input directory.
inputPaths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for inputPath in inputPaths:
    print(inputPath)

# Up to 5GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created with "Save & Run All".
# Temporary files can also go to /kaggle/temp/, but they are not saved outside of the current session.

# Competition input files, followed by the two CSV files this notebook writes.
trainFile = '/kaggle/input/tweet-sentiment-extraction/train.csv'
testFile = '/kaggle/input/tweet-sentiment-extraction/test.csv'
sample = '/kaggle/input/tweet-sentiment-extraction/sample_submission.csv'
outTrain = 'train_submission.csv'
outFile = 'submission.csv'

# Every column is read as text.
df_train = pd.read_csv(trainFile, sep=',', dtype=str)
df_test = pd.read_csv(testFile, sep=',', dtype=str)
# Training rows (restricted to three columns) are stacked on top of the test
# rows, so the first len(df_train) rows of df_all are the training rows.
# NOTE(review): presumably test.csv carries exactly these three columns - confirm.
df_list = [df_train.loc[:, ['textID', 'text', 'sentiment']], df_test]
df_all = pd.concat(df_list, ignore_index=True)
# Distinct sentiment labels in order of first appearance in df_all.
sentList = pd.unique(df_all['sentiment'])

# Constants

In [None]:
# Run-mode switches.
tune = False    # True: only run the hyperparameter sweeps for the selected method
test = False    # True: also build and score output for the training data
method = False  # True selects MultinomialNB, False selects SVM

# Row of the combined frame where the training data ends and the testing data starts.
trainIdx = 27481

# Vectorizer / classifier settings for the selected method.
if method:
    # Scores recorded with these MultinomialNB settings:
    #   Training Accuracy: 0.7071
    #   Testing Accuracy: 0.3212
    #   Jaccard Score (on Training): 0.5207
    # (min_df: maybe try higher?)
    max_df, min_df, max_feat, alpha = 1.0, 9, 2000, 6.0
else:
    # Scores recorded with these SVM settings:
    #   Training Accuracy: 0.7459
    #   Testing Accuracy:  0.3356
    #   Jaccard Score (on Training): 0.5030
    # (values noted alongside the originals: max_df 0.1, min_df 14,
    #  max_feat 2000, c 0.7)
    max_df, min_df, max_feat, c = 1.0, 12, 3000, 0.7


# Utility Functions
This is just a collection of functions that are used by the main program and both methods.
## Text Cleanup
This function mainly uses regex to remove characters that are not always helpful for determining sentiment.  Unfortunately, it also removes some punctuation (e.g., \*\*\*\*\*) which may be helpful for determining sentiment, so there is a lot of room for improvement.

In [None]:
def cleanText(text):
    '''
    Normalize a piece of text before it is tokenized.

    The value is converted to a string, stripped of surrounding whitespace
    and lower-cased; then a fixed sequence of regex removals is applied and
    finally every ASCII punctuation character is deleted.

    Regex ideas taken from:
    https://www.kaggle.com/raenish/cheatsheet-text-helper-functions

    text:    any value; non-strings go through str() first
    returns: the cleaned string
    '''
    import re
    import string
    # Applied strictly in this order: later patterns see the output of the
    # earlier ones (e.g. quotes are gone before digit-bearing words are cut).
    removals = (
        r'http[s]?://\S+|www\.\S+',  # links
        r'<.*?>',                    # HTML tags
        r"\\",                       # the character [\]
        r"\'",                       # the character [']
        r"\"",                       # the character ["]
        r'\[.*?\]',                  # text in square brackets []
        r'\w*\d\w*',                 # words that have numbers
        r'\d+',                      # numbers
    )
    cleaned = str(text).strip().lower()
    for pattern in removals:
        cleaned = re.sub(pattern, '', cleaned)
    # drop whatever punctuation is still left
    noPunctuation = str.maketrans('', '', string.punctuation)
    return cleaned.translate(noPunctuation)

## Vectorization
This function converts the input DataFrame to a bag of words, removes nulls and stores the index of nulls, and vectorizes the text. 

In [None]:
def vectorizeIt(df, textCol, trainIdx, maxFeat=2000, maxDF=1.0, minDF=0.0):
    '''
    Turn one text column into word counts and split them into train/test.

    Null entries of df[textCol] are dropped before counting; their row
    positions are returned so callers can drop the matching labels.

    df:       DataFrame holding the training rows followed by the test rows
    textCol:  name of the text column
    trainIdx: row position in df where the test rows start
    maxFeat, maxDF, minDF: passed to CountVectorizer as max_features,
              max_df and min_df
    returns:  (index, vocabulary, xTrain, xTest)
              index      - row positions of the null entries
              vocabulary - list of terms ordered by feature column
              xTrain     - count matrix rows for the training data
              xTest      - count matrix rows for the test data
    '''
    from sklearn.feature_extraction.text import CountVectorizer
    import numpy as np
    # this vectorizer will skip stop words
    vectorizer = CountVectorizer(stop_words="english",
                                 preprocessor=cleanText,
                                 max_features=maxFeat,
                                 max_df=maxDF, min_df=minDF)
    textSeries = df[textCol]
    # remember where the nan's are, then drop them
    index = list(np.where(textSeries.isnull())[0])
    # A single fit_transform both learns the vocabulary and counts the words;
    # the original fitted the same data twice.
    wc = vectorizer.fit_transform(textSeries.dropna(axis=0))
    # terms sorted by the feature column they were assigned
    vocabulary = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    # every null that sat in the training rows moves the boundary up by one
    sz = trainIdx - sum(1 for i in index if i < trainIdx)
    return index, vocabulary, wc[:sz], wc[sz:]

## Generate the y values
This function creates an array of integer values corresponding to the sentiment values (0 neutral, 1 negative, 2 positive).

In [None]:
def sentArray(df, textCol):
    '''
    Encode the values of one column as integer codes.

    Codes are assigned by order of first appearance in df[textCol]: the
    first distinct value becomes 0, the next 1, and so on.  The mapping
    therefore depends on the row order of the DataFrame passed in.

    df:      DataFrame containing the column
    textCol: name of the column to encode
    returns: 1-D integer numpy array with one code per row
    '''
    import numpy as np
    column = df[textCol]
    # unique() keeps first-appearance order, which defines the codes
    codeOf = {label: code for code, label in enumerate(column.unique())}
    # dtype=int replaces np.int, an alias that newer NumPy releases removed
    return np.array([codeOf[label] for label in column], dtype=int)

## Calculate the Jaccard Score
I used the training data to try and tune the vectorization parameters to improve the Jaccard score.  I made some assumptions from the code posted (https://www.kaggle.com/c/tweet-sentiment-extraction/overview/evaluation) in that occasionally the intersection of the ground truth and predicted value was zero, so I added a check for that.  Also this strips away the quotations from the selected text DataFrame values.

In [None]:
def jaccard(str1, str2):
    '''
    Word-level Jaccard similarity of two values.

    Both arguments are converted with str() (so nan becomes the word "nan"),
    double quotes are removed, and the lower-cased whitespace-separated
    words are compared as sets.  When the sets share no word the score is
    0.0, which also covers the case of two empty strings.

    Based on:
    https://www.kaggle.com/c/tweet-sentiment-extraction/overview/evaluation

    returns: float in [0.0, 1.0]
    '''
    import re

    def wordSet(value):
        # stringify, remove the character ["], lower-case and split on whitespace
        unquoted = re.sub(r"\"", "", str(value))
        return set(unquoted.lower().split())

    first = wordSet(str1)
    second = wordSet(str2)
    shared = first & second
    if not shared:
        return 0.0
    return float(len(shared)) / (len(first) + len(second) - len(shared))

## Plots for Tuning
This is just a simple plotting function for tuning.

In [None]:
def plotIt(x, y, name, title, xLabel, yLabel, label1, y2=None, label2=None):
    '''
    Draw one or two line series and save the figure as <name>.png.

    x:       x values shared by both series
    y:       first series (blue circles)
    name:    output file name without the extension
    title, xLabel, yLabel: figure title and axis labels
    label1:  legend label of the first series (used when y2 is given)
    y2:      optional second series (red squares)
    label2:  legend label of the second series
    '''
    import matplotlib.pyplot as plt
    # y2 defaults to None, so it must be checked before len() is taken
    twoSeries = y2 is not None and len(y2) > 0
    if twoSeries:
        plt.plot(x, y, 'bo-', label=label1)
        plt.plot(x, y2, 'rs-', label=label2)
    else:
        plt.plot(x, y, 'bo-')
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.title(title)
    # only the two-series plot has labelled lines to put in a legend
    if twoSeries:
        plt.legend()
    plt.savefig(name + '.png', format='png')
    plt.close()

## Eliminate Null Indices from y Data
When we remove the null value (index 314) from the training data, it needs to be removed from the sentiment array too.  This also assumes that the testing data may contain nulls as well, so it tries to account for that.

In [None]:
def yPrep(index, trainIdx, train, test):
    '''
    Build the integer label arrays and remove the rows whose text was null.

    index:    row positions (in the combined train+test frame) of null text
    trainIdx: position in the combined frame where the test rows start
    train:    training DataFrame with a 'sentiment' column
    test:     testing DataFrame with a 'sentiment' column
    returns:  (trainIdx, yTrain, yTest, idxTr, idxTe)
              trainIdx - boundary reduced by the number of null training rows
              yTrain   - training labels without the null rows
              yTest    - testing labels without the null rows
              idxTr    - null positions inside the training data
              idxTe    - null positions inside the testing data (0-based)
    '''
    import numpy as np
    # Convert sentiment values to y array of integers
    idxTr = [i for i in index if i < trainIdx]
    yTr = sentArray(train, 'sentiment')
    yTrain = np.delete(yTr, idxTr)
    # Combined row trainIdx is the first test row, so the comparison must be
    # inclusive or a null in that row would never be removed.
    idxTe = [i - trainIdx for i in index if i >= trainIdx]
    # Encode the test labels with the codes of the training labels.  Encoding
    # them independently by first appearance can give the same sentiment a
    # different integer in the two arrays.  Labels never seen in training
    # get -1.
    codeOf = {label: code
              for code, label in enumerate(train['sentiment'].unique())}
    yTe = np.array([codeOf.get(label, -1) for label in test['sentiment']],
                   dtype=int)
    yTest = np.delete(yTe, idxTe)
    trainIdx = trainIdx - len(idxTr)
    return trainIdx, yTrain, yTest, idxTr, idxTe

# Multinomial Naive Bayes
## Helper Function
This function calls the sklearn multinomial Naive Bayes classifier.  It returns the accuracy of the training and testing along with the object (clf) itself for use of other functions and variables (e.g., feature_log_prob_).

In [None]:
def multiNB(alpha, xTrain, xTest, yTrain, yTest):
    '''
    Fit sklearn's multinomial Naive Bayes classifier and score it.

    https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB
    alpha:  float, default=1.0
            Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    xTrain, yTrain: features and labels the classifier is fitted on
    xTest, yTest:   features and labels used for the second score
    returns: (fitted classifier, accuracy on the training data,
              accuracy on the test data)
    '''
    from sklearn.naive_bayes import MultinomialNB
    # fit() hands back the estimator itself, so it can be chained
    model = MultinomialNB(alpha=alpha).fit(xTrain, yTrain)
    trainScore = model.score(xTrain, yTrain)
    testScore = model.score(xTest, yTest)
    return model, trainScore, testScore

## Tuning Function
This function was used to attempt to tune the hyperparameters, mainly alpha, but it was also used to get some of the vectorization numbers as well.  This function employs sklearn model selection to generate validation data.

In [None]:
def tuneNB(all, train, test, trainIdx, max_feat, max_df, min_df, alpha):
    '''
    Sweep the vectorizer settings and alpha for MultinomialNB, one at a time.

    Each sweep tries a range of values for one setting while the others stay
    fixed, averages the train/validation accuracy over cvSz random splits of
    the training data, saves a plot through plotIt, prints the value with
    the best mean validation accuracy and carries it into the next sweep.

    all:      combined train+test DataFrame handed to vectorizeIt
              (the name shadows the builtin; kept for existing callers)
    train, test: DataFrames handed to yPrep
    trainIdx: row in `all` where the test rows start
    max_feat, max_df, min_df, alpha: starting values
    returns:  (max_feat, max_df, min_df, alpha) after tuning
    '''
    import numpy as np
    from sklearn.model_selection import train_test_split
    print("Tune MultinomialNB...")
    # set the defaults
    cvSz = 5 # iterations of cross validation
    # flags for individual tuning
    tune_max_df = True
    tune_min_df = True
    tune_max_feat = True
    tune_alpha = True

    def _sweep(key, values, current, name, title):
        # Try every candidate for `key`; return the one with the best mean
        # validation accuracy.
        accTrainAll = np.zeros(shape=(cvSz, len(values)))
        accValidAll = np.zeros(shape=(cvSz, len(values)))
        prepared = None
        for j, value in enumerate(values):
            trial = dict(current)
            trial[key] = value
            # The bag of words and the labels do not depend on the split
            # seed, and not on alpha either, so build them only when a
            # vectorizer setting changed.
            if prepared is None or key != 'alpha':
                index, bow, xTrain, xTest = vectorizeIt(
                    all, 'text', trainIdx, maxFeat=trial['max_feat'],
                    maxDF=trial['max_df'], minDF=trial['min_df'])
                tIdx, yTrain, yTest, idxTr, idxTe = yPrep(
                    index, trainIdx, train, test)
                prepared = (xTrain, yTrain)
            xTrain, yTrain = prepared
            for i in range(cvSz):
                # create cross validation data
                xTr, xVld, yTr, yVld = train_test_split(
                    xTrain, yTrain, test_size=0.33, random_state=i,
                    shuffle=True)
                # Send through sklearn mulinomial Naive Bayes classifier
                clf, accTrainAll[i, j], accValidAll[i, j] = multiNB(
                    trial['alpha'], xTr, xVld, yTr, yVld)
        # get mean score
        accTrain = accTrainAll.mean(axis=0)
        accValid = accValidAll.mean(axis=0)
        # plot the results
        plotIt(values, accTrain, name, title, key, 'Accuracy', 'Train',
               accValid, 'Valid')
        return values[np.argmax(accValid)]

    best = {'max_feat': max_feat, 'max_df': max_df, 'min_df': min_df,
            'alpha': alpha}
    # max_df
    if tune_max_df:
        best['max_df'] = _sweep(
            'max_df', np.arange(start=1.0, stop=0.0, step=-0.1), best,
            'NB_max_df_v_acc', 'MultinomialNB: max_df vs. Accuracy')
        print("max_df: " + "{:.3f}".format(best['max_df']))
    # min_df
    if tune_min_df:
        best['min_df'] = _sweep(
            'min_df', np.arange(start=1, stop=20, step=1), best,
            'NB_min_df_v_acc', 'MultinomialNB: min_df vs. Accuracy')
        print("min_df: " + "{:.3f}".format(best['min_df']))
    # max_feat
    if tune_max_feat:
        best['max_feat'] = _sweep(
            'max_feat', np.arange(start=1000, stop=26000, step=1000), best,
            'NB_feat_v_acc', 'MultinomialNB: max_feat vs. Accuracy')
        print("max_feat: " + "{:.2f}".format(best['max_feat']))
    # alpha
    if tune_alpha:
        best['alpha'] = _sweep(
            'alpha', np.arange(start=1.0, stop=21.0, step=1.0), best,
            'NB_alpha_v_acc', 'MultinomialNB: alpha vs. Accuracy')
        print("alpha: " + "{:.2f}".format(best['alpha']))
    # the original only printed the tuned values; return them as well
    return best['max_feat'], best['max_df'], best['min_df'], best['alpha']

## Generate Output DataFrame
This function generates the output DataFrame used to write the submission.csv and the train_submission.csv.  This function also needs a lot of work, specifically in the logic that selects words for positive and negative sentiment.

In [None]:
def buildSelTextNB(df, test, idxList, pred, bow, featLogProb, sentList):
    '''
    Build the output DataFrame from the MultinomialNB predictions.

    Class codes: neutral == 0, negative == 1, positive == 2.

    For each row of df:
      * rows listed in idxList (null text, no prediction) get their text,
        or an empty string for nan, and count as neutral;
      * rows predicted neutral (0) get their whole text;
      * for the remaining rows a word is kept when its cleaned form is in
        bow and its log probability under the predicted class is greater
        than under the opposite class (negative vs. positive).
    Every selected text is wrapped in double quotes.

    df:          DataFrame with 'textID' and 'text' columns
    test:        when truthy a 'sentiment' column is added to the result
    idxList:     row positions of df that have no prediction
    pred:        predicted class codes for the remaining rows, in row order
    bow:         vocabulary terms ordered by feature column
    featLogProb: array indexed [class, feature] of log probabilities
    sentList:    sentiment names indexed by class code
    returns:     DataFrame with 'textID', 'selected_text' and, when test is
                 truthy, 'sentiment'
    '''
    import pandas as pd
    print("Build MultinomialNB output...")
    skipped = set(idxList)
    # term -> feature column; the first occurrence wins, as with list.index()
    column = {}
    for k, term in enumerate(bow):
        column.setdefault(term, k)
    texts = df['text']
    # Results are collected in plain lists and the frame is built once at
    # the end; writing through retDF[col].iloc[i] is chained assignment,
    # which pandas does not guarantee to reach the frame.
    selected = []
    sentiments = []
    j = 0  # prediction index; only rows outside idxList consume one
    for i in range(len(df)):
        # add nan's back in as neutral
        if i in skipped:
            aword = str(texts.iloc[i])
            selected.append('"' + '"' if aword == 'nan' else '"' + aword + '"')
            sentiments.append('neutral')
            continue
        idxA = pred[j]
        j = j + 1  # advance the prediction index
        if idxA == 0:
            selected.append('"' + texts.iloc[i] + '"')
            sentiments.append('neutral')
            continue
        # Else add selected_text back in based on probability of feature
        kept = []
        for word in texts.iloc[i].split():
            k = column.get(cleanText(word))
            if k is None:
                continue
            probNeg = featLogProb[1, k]
            probPos = featLogProb[2, k]
            # check the word probabilities
            if ((idxA == 1 and probNeg > probPos) or
                    (idxA == 2 and probPos > probNeg)):
                kept.append(word)
        selected.append('"' + " ".join(kept) + '"')
        sentiments.append(sentList[idxA])
    # create the return DataFrame
    retDF = pd.DataFrame({'textID': df['textID'].to_numpy(),
                          'selected_text': selected}, index=df.index)
    if test:
        retDF['sentiment'] = sentiments
    return retDF

# Support Vector Machine (SVM)
## Helper Function
This function calls the sklearn SVM classifier.  I initially tried to use the 'rbf' kernel, which had excellent training accuracy results, but I did not know how to pull the feature weights to support word selection.  So I switched to a linear model which provides (coef_ and decision_function) to calculate the distance of the point from the decision boundary.

In [None]:
def svmClass(xTrain, xTest, yTrain, yTest, c=1.0):
    '''
    Fit sklearn's SVC with a linear kernel and score it.

    https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    xTrain, yTrain: features and labels the classifier is fitted on
    xTest, yTest:   features and labels used for the second score
    c:      float, default=1.0
            Passed to SVC as C, the regularization parameter.  The strength
            of the regularization is inversely proportional to C.  Must be
            strictly positive.  The penalty is a squared l2 penalty.
    returns: (fitted classifier, accuracy on the training data,
              accuracy on the test data)

    An 'rbf' kernel with gamma='scale' was tried before; the linear kernel
    is used because it was not clear how to get feature likelihoods out of
    rbf.
    '''
    from sklearn import svm
    # fit() hands back the estimator itself, so it can be chained
    model = svm.SVC(C=c, kernel='linear').fit(xTrain, yTrain)
    trainScore = model.score(xTrain, yTrain)
    testScore = model.score(xTest, yTest)
    return model, trainScore, testScore

## Tuning Function
This function is similar to the Multinomial Naive Bayes tuning function, except that alpha is replaced by c.

In [None]:
def tuneSVM(all, train, test, trainIdx, max_feat, max_df, min_df, c):
    '''
    Sweep the vectorizer settings and c for the SVM classifier, one at a time.

    The parameter list matches the call in the main driver and the layout
    of tuneNB.  The previous four-parameter signature could not be called:
    the body needs the data frames and starting values listed here.

    Each sweep tries a range of values for one setting while the others stay
    fixed, averages the train/validation accuracy over cvSz random splits of
    the training data, saves a plot through plotIt, prints the value with
    the best mean validation accuracy and carries it into the next sweep.

    all:      combined train+test DataFrame handed to vectorizeIt
              (the name shadows the builtin; kept to mirror tuneNB)
    train, test: DataFrames handed to yPrep
    trainIdx: row in `all` where the test rows start
    max_feat, max_df, min_df, c: starting values
    returns:  (max_feat, max_df, min_df, c) after tuning
    '''
    import numpy as np
    from math import floor
    from sklearn.model_selection import train_test_split
    print('Tune SVM...')
    # set the defaults
    vldSz = 0.129 # percent of training data to make validation set
    cvSz = floor(1.0/vldSz) # iterations of cross validation
    # flags for individual tuning
    tune_max_df = True
    tune_min_df = True
    tune_max_feat = True
    tune_c = True

    def _sweep(key, values, current, name, title):
        # Try every candidate for `key`; return the one with the best mean
        # validation accuracy.
        accTrainAll = np.zeros(shape=(cvSz, len(values)))
        accValidAll = np.zeros(shape=(cvSz, len(values)))
        prepared = None
        for j, value in enumerate(values):
            trial = dict(current)
            trial[key] = value
            # The bag of words and the labels do not depend on the split
            # seed, and not on c either, so build them only when a
            # vectorizer setting changed.
            if prepared is None or key != 'c':
                index, bow, xTrain, xTest = vectorizeIt(
                    all, 'text', trainIdx, maxFeat=trial['max_feat'],
                    maxDF=trial['max_df'], minDF=trial['min_df'])
                tIdx, yTrain, yTest, idxTr, idxTe = yPrep(
                    index, trainIdx, train, test)
                prepared = (xTrain, yTrain)
            xTrain, yTrain = prepared
            for i in range(cvSz):
                # create cross validation data
                xTr, xVld, yTr, yVld = train_test_split(
                    xTrain, yTrain, test_size=vldSz, random_state=i,
                    shuffle=True)
                # Send through sklearn SVM classifier
                clf, accTrainAll[i, j], accValidAll[i, j] = svmClass(
                    xTr, xVld, yTr, yVld, trial['c'])
        # get mean score
        accTrain = accTrainAll.mean(axis=0)
        accValid = accValidAll.mean(axis=0)
        # plot the results
        plotIt(values, accTrain, name, title, key, 'Accuracy', 'Train',
               accValid, 'Valid')
        return values[np.argmax(accValid)]

    best = {'max_feat': max_feat, 'max_df': max_df, 'min_df': min_df, 'c': c}
    # max_df
    if tune_max_df:
        best['max_df'] = _sweep(
            'max_df', np.arange(start=1.0, stop=0.0, step=-0.1), best,
            'SVM_max_df_v_acc', 'SVM: max_df vs. Accuracy')
        print("max_df: " + "{:.3f}".format(best['max_df']))
    # min_df
    if tune_min_df:
        best['min_df'] = _sweep(
            'min_df', np.arange(start=1, stop=20, step=1), best,
            'SVM_min_df_v_acc', 'SVM: min_df vs. Accuracy')
        print("min_df: " + "{:.3f}".format(best['min_df']))
    # max_feat
    if tune_max_feat:
        best['max_feat'] = _sweep(
            'max_feat', np.arange(start=1000, stop=26000, step=1000), best,
            'SVM_feat_v_acc', 'SVM: max_feat vs. Accuracy')
        print("max_feat: " + "{:.2f}".format(best['max_feat']))
    # c
    if tune_c:
        best['c'] = _sweep(
            'c', np.arange(start=0.1, stop=1.1, step=0.1), best,
            'SVM_c_v_acc', 'SVM: c vs. Accuracy')
        print("c: " + "{:.2f}".format(best['c']))
    return best['max_feat'], best['max_df'], best['min_df'], best['c']

## Build Output Dataframe
This function builds the DataFrame which is used to write the submission.csv and train_submission.csv.  It also still needs a lot of work in the logic that selects the positive and negative sentiment words but it seems to be doing better than the Naive Bayes output code.

In [None]:
def buildSelTextSVM(df, test, idxList, pred, bow, sentList, y, w, b, x):
    '''
    Build the output DataFrame from the SVM predictions.

    Class codes: neutral == 0, negative == 1, positive == 2.

    For every predicted row a per-feature vector is computed from the
    decision value y[row, predicted], the weight row w[predicted], the
    intercept b[predicted] and the norm of that weight row.  Then, for each
    row of df:
      * rows listed in idxList (null text, no prediction) get their text,
        or an empty string for nan, and count as neutral;
      * rows predicted neutral (0) get their whole text;
      * for the remaining rows a word is kept when its cleaned form is in
        bow and its entry in the per-feature vector is >= 0.

    df:       DataFrame with 'textID' and 'text' columns
    test:     when truthy a 'sentiment' column is added to the result
    idxList:  row positions of df that have no prediction
    pred:     predicted class codes for the remaining rows, in row order
    bow:      vocabulary terms ordered by feature column
    sentList: sentiment names indexed by class code
    y:        2-D array of decision values, indexed [row, class code]
    w:        2-D array of weights, indexed [class code, feature]
    b:        intercepts, indexed by class code
    x:        dense 2-D feature array, one row per prediction
    returns:  DataFrame with 'textID', 'selected_text' and, when test is
              truthy, 'sentiment'
    '''
    import pandas as pd
    import numpy as np
    print('Build SVM output...')
    # ||w||
    w_norm = np.zeros(shape=(w.shape[0]))
    for k in range(w.shape[0]):
        w_norm[k] = np.linalg.norm(w[k])
    vect = np.zeros(shape=(x.shape), dtype=np.float64)
    for r in range(x.shape[0]):
        idx = pred[r]
        # Geometric Distance: d = w_t x + b / ||w||
        dist = (y[r, idx] * (np.dot(w[idx].T, x[r]) + b[idx])) / w_norm[idx]
        # vector for each feature?
        vect[r] = dist * (w[idx] / w_norm[idx])
    skipped = set(idxList)
    # term -> feature column; the first occurrence wins, as with list.index()
    column = {}
    for k, term in enumerate(bow):
        column.setdefault(term, k)
    texts = df['text']
    # Results are collected in plain lists and the frame is built once at
    # the end; writing through retDF[col].iloc[i] is chained assignment,
    # which pandas does not guarantee to reach the frame.
    selected = []
    sentiments = []
    j = 0  # prediction index; only rows outside idxList consume one
    for i in range(len(df)):
        # add nan's back in as neutral
        if i in skipped:
            aword = str(texts.iloc[i])
            selected.append("" if aword == 'nan' else aword)
            sentiments.append('neutral')
            continue
        idxA = pred[j]
        if idxA == 0:
            selected.append(texts.iloc[i])
            sentiments.append('neutral')
        else:
            # Else add selected_text back in based on distance from hyperplane
            kept = []
            for word in texts.iloc[i].split():
                iBow = column.get(cleanText(word))
                if iBow is not None and vect[j, iBow] >= 0.0:
                    kept.append(word)
            selected.append(" ".join(kept))
            sentiments.append(sentList[idxA])
        j = j + 1  # advance the prediction index
    # create the return DataFrame
    retDF = pd.DataFrame({'textID': df['textID'].to_numpy(),
                          'selected_text': selected}, index=df.index)
    if test:
        retDF['sentiment'] = sentiments
    return retDF

# Main Driver
This calls either the multinomialNB or the SVM classifier based on the method flag.  If the tuning flag is set it will only perform the tuning functions for the method selected.  The test flag is for trying to improve the selected text output.

In [None]:
# NOTE: the null-adjusted boundary returned by yPrep is stored in adjTrainIdx.
# Writing it back into the global trainIdx would move the train/test boundary
# every time this cell is re-run.

### Method 1: Multinomial Naive Bayes
if method:
    print('Multinomial Naive Bayes Classifier')
    # tune hyperparameters
    if tune:
        tuneNB(df_all, df_train, df_test, trainIdx, max_feat, max_df,
               min_df, alpha)
    # Perform calculation and output to file
    else:
        # Create bag of words and word count
        index, bow, xTrain, xTest = vectorizeIt(
            df_all, 'text', trainIdx, maxFeat=max_feat, maxDF=max_df,
            minDF=min_df)
        # Convert sentiment values to y array of integers
        adjTrainIdx, yTrain, yTest, idxTr, idxTe = yPrep(
            index, trainIdx, df_train, df_test)
        # Send through sklearn mulinomial Naive Bayes classifier
        clf, accTrain, accTest = multiNB(alpha, xTrain, xTest, yTrain, yTest)
        predTrain = clf.predict(xTrain)
        predTest = clf.predict(xTest)
        print("Training Accuracy: " + "{:.4f}".format(accTrain))
        print("Testing Accuracy: " + "{:.4f}".format(accTest))
        # Determine selected_text
        testRet_df = buildSelTextNB(df_test, test, idxTe, predTest, bow,
                                    clf.feature_log_prob_, sentList)
        if test:
            trainRet_df = buildSelTextNB(df_train, test, idxTr, predTrain,
                                         bow, clf.feature_log_prob_, sentList)
        # Write sample_submission
        testRet_df.to_csv(outFile, index=False)
        if test:
            trainRet_df.to_csv(outTrain, index=False)
            sz = len(df_train)
            scoreArr = np.zeros(shape=(sz))
            # calculate Jaccard Score of training data
            for i in range(len(df_train)):
                scoreArr[i] = jaccard(df_train['selected_text'].iloc[i],
                                      trainRet_df['selected_text'].iloc[i])
            print('Jaccard Score: ' + "{:.4f}".format(np.mean(scoreArr)))
### Method 2: SVM
else:
    print('Support Vector Machine Classifier')
    print("max_df:   " + "{:3.5f}".format(max_df))
    print("min_df:   " + "{:8d}".format(min_df))
    print("max_feat: " + "{:7.1f}".format(max_feat))
    print("c:        " + "{:3.5f}".format(c))
    if tune:
        tuneSVM(df_all, df_train, df_test, trainIdx, max_feat, max_df,
                min_df, c)
    else:
        # Create bag of words and word count
        index, bow, xTrain, xTest = vectorizeIt(
            df_all, 'text', trainIdx, maxFeat=max_feat, maxDF=max_df,
            minDF=min_df)
        # Convert sentiment values to y array of integers
        adjTrainIdx, yTrain, yTest, idxTr, idxTe = yPrep(
            index, trainIdx, df_train, df_test)
        # Send through sklearn SVM classifier
        clf, accTrain, accTest = svmClass(xTrain, xTest, yTrain, yTest, c)
        predTrain = clf.predict(xTrain)
        predTest = clf.predict(xTest)
        print("Training Accuracy: " + "{:.4f}".format(accTrain))
        print("Testing Accuracy: " + "{:.4f}".format(accTest))
        coef = clf.coef_.toarray()
        b = clf.intercept_
        # Determine selected_text
        y = clf.decision_function(xTest)
        testRet_df = buildSelTextSVM(df_test, test, idxTe, predTest, bow,
                                     sentList, y, coef, b, xTest.toarray())
        if test:
            y = clf.decision_function(xTrain)
            trainRet_df = buildSelTextSVM(df_train, test, idxTr, predTrain,
                                          bow, sentList, y, coef, b,
                                          xTrain.toarray())
        # Write sample_submission
        testRet_df.to_csv(outFile, index=False)
        if test:
            trainRet_df.to_csv(outTrain, index=False)
            sz = len(df_train)
            scoreArr = np.zeros(shape=(sz))
            # calculate Jaccard Score of training data
            for i in range(len(df_train)):
                scoreArr[i] = jaccard(df_train['selected_text'].iloc[i],
                                      trainRet_df['selected_text'].iloc[i])
            print('Jaccard Score: ' + "{:.4f}".format(np.mean(scoreArr)))