In [1]:
# Imports plus one-time setup of the shared NLP helpers (tokenizer, stop words, VADER).
import pandas as pd
import numpy as np
import nltk
import re
# NOTE(review): later cells use `model_selection`, `neighbors` and `feature_extraction`
# without importing them explicitly, so they presumably come from this star import.
from sklearn import *
import time
# NOTE(review): `re` is imported a second time here; only `string` is new.
import re, string
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
# Shared tweet tokenizer used by tokenize(); handle stripping and length reduction enabled.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
# NLTK resources are downloaded on every run of this cell.
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
# English stop-word list consulted by noiseRemoval().
stop_words = stopwords.words('english')
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import classify
from nltk import NaiveBayesClassifier

from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyzer used by vaderfeedback().
analyzer = SentimentIntensityAnalyzer()

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm, tree, ensemble, linear_model
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to C:\Users\Sai
[nltk_data]     Nadkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sai
[nltk_data]     Nadkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sai
[nltk_data]     Nadkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sai Nadkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Sai
[nltk_data]     Nadkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
#Constants
# Raw strings keep the regex escapes (\/, \w, \s) out of Python's string-escape
# processing; the non-raw form triggers invalid-escape warnings on current Python.
# Matches simple opening/closing tags such as <a> or </e>.
anchorTagsRegEx = re.compile(r"<\/*\w+>")
# Matches every run of characters that is neither whitespace nor an ASCII letter.
offKeyCharRegEx = re.compile(r"[^\sa-zA-Z]+")
trainingPercent = 100
classDividerPercent = 33.3

In [3]:
# Both sheets are read the same way: spreadsheet columns 1-4 are named Date/Time/Tweet/Class,
# the rows labelled 0 and 1 are discarded (presumably header rows - confirm against the
# workbook), and only the Tweet and Class columns are kept.
tweetColumns = ['Date', 'Time', 'Tweet', 'Class']
obamaData = (
    pd.read_excel("training-Obama-Romney-tweets.xlsx", sheet_name='Obama', header=None, names=tweetColumns, usecols=range(1, 5))
    .drop([0, 1])
    .drop(columns=['Date', 'Time'])
)
romneyData = (
    pd.read_excel("training-Obama-Romney-tweets.xlsx", sheet_name='Romney', header=None, names=tweetColumns, usecols=range(1, 5))
    .drop([0, 1])
    .drop(columns=['Date', 'Time'])
)

In [4]:
#drop mixed class
# Labels are coerced to numbers (anything non-numeric becomes NaN), rows with any
# missing value are dropped, and finally class 2 is filtered out.
obamaData = obamaData.assign(
    Class=pd.to_numeric(obamaData["Class"], downcast='integer', errors='coerce')
).dropna()
romneyData = romneyData.assign(
    Class=pd.to_numeric(romneyData["Class"], downcast='integer', errors='coerce')
).dropna()
obamaData = obamaData.loc[obamaData.Class != 2]
romneyData = romneyData.loc[romneyData.Class != 2]

In [5]:
# Per-candidate totals of positive (1), negative (-1) and neutral (0) labels.
ototalPositiveCount = sum(1 for label in obamaData['Class'] if label == 1)
ototalNegativeCount = sum(1 for label in obamaData['Class'] if label == -1)
ototalNeutralCount = sum(1 for label in obamaData['Class'] if label == 0)

rtotalPositiveCount = sum(1 for label in romneyData['Class'] if label == 1)
rtotalNegativeCount = sum(1 for label in romneyData['Class'] if label == -1)
rtotalNeutralCount = sum(1 for label in romneyData['Class'] if label == 0)

In [6]:
def tokenize(data):
    """Strip tags and non-letter characters from each tweet, then tokenize it.

    Args:
        data: iterable of raw tweets. Each item is converted with ``str`` before
            any regex runs, so non-string cells (numbers, NaN) are handled
            instead of raising ``TypeError`` in the substitution.

    Returns:
        A list holding one token list per input item, in input order.
    """
    tokenized = []
    for rawTweet in data:
        # Coerce first: the regex substitutions only accept str/bytes.
        text = str(rawTweet)
        text = anchorTagsRegEx.sub("", text)
        # NOTE(review): this removes every non-letter, including '@', '#' and digits,
        # before tknzr sees the text - confirm that is intended.
        text = offKeyCharRegEx.sub("", text)
        tokenized.append(tknzr.tokenize(text))
    return tokenized

In [7]:
def lemmatizeTweet(data):
    """Lemmatize every token of every tweet according to its part-of-speech tag.

    Tags beginning with 'NN' are lemmatized as nouns ('n'), tags beginning with
    'VB' as verbs ('v'), and every other tag as adjectives ('a').

    Args:
        data: iterable of token lists, one per tweet.

    Returns:
        A list of lemmatized token lists in the same order as ``data``.
    """
    wordnet = WordNetLemmatizer()

    def _wordnetPos(tag):
        # Map a tagger tag onto the POS code handed to the lemmatizer.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        [wordnet.lemmatize(word, _wordnetPos(tag)) for word, tag in pos_tag(tweetTokens)]
        for tweetTokens in data
    ]

In [8]:
def noiseRemoval(data):
    """Lower-case tokens and drop links, stop words, punctuation and 1-char tokens.

    Args:
        data: iterable of token lists, one per tweet.

    Returns:
        A list with one cleaned token list per input tweet (possibly empty).
    """
    # Sets give constant-time membership; stop_words itself is a list.
    stopSet = set(stop_words)
    punctuationChars = set(string.punctuation)

    cleanData = []
    for tweetTokens in data:
        kept = []
        for token in tweetTokens:
            token = token.lower()
            if token.startswith("http"):
                continue
            if len(token) <= 1 or token in stopSet:
                continue
            # `token in string.punctuation` is a substring test, so runs such as
            # "..." or "!!" used to slip through; check character by character.
            if all(ch in punctuationChars for ch in token):
                continue
            kept.append(token)
        cleanData.append(kept)
    return cleanData

In [9]:
def vaderfeedback(data):
    """Append sentiment marker words to each tweet based on its VADER compound score.

    A compound score of at least 0.56 gets the positive markers, a score of at
    most 0.03 gets the negative markers, and anything in between gets the
    neutral markers.

    Args:
        data: iterable of tweet strings.

    Returns:
        A list of the tweets, each with one marker phrase appended.
    """
    def _markerFor(compound):
        # The marker strings must stay in sync with the word lists checked by the
        # agreement report cell (including the 'absymal' spelling).
        if compound >= 0.56:
            return " brilliant excellent amazing love"
        if compound <= 0.03:
            return " horrible poor hate absymal"
        return " okay fine meh mediocre"

    return [tweet + _markerFor(analyzer.polarity_scores(tweet)['compound']) for tweet in data]

In [10]:
def preProcessTasks(data, vader):
    """Run the preprocessing pipeline over a collection of tweets.

    Args:
        data: iterable of raw tweets.
        vader: when truthy, vaderfeedback is applied before tokenization.

    Returns:
        The output of noiseRemoval: one cleaned token list per tweet.
    """
    stages = [tokenize, lemmatizeTweet, noiseRemoval]
    if vader:
        # Marker words are appended to the raw text, so this stage goes first.
        stages.insert(0, vaderfeedback)
    for stage in stages:
        data = stage(data)
    return data

In [11]:
# Cleaned token lists per candidate, with the VADER marker words appended.
obamaCleaned = preProcessTasks(obamaData['Tweet'], vader=True)
romneyCleaned = preProcessTasks(romneyData['Tweet'], vader=True)

In [12]:
# Marker words appended by vaderfeedback for each sentiment bucket.
positive = ["brilliant", "excellent", "amazing", "love"]
negative = ["horrible", "poor", "hate", "absymal"]
neutral = ["okay", "fine", "meh", "mediocre"]

# Class label -> marker words that must all appear in the cleaned tweet for a hit.
markerWords = {1: set(positive), -1: set(negative), 0: set(neutral)}

# Same report for each candidate: share of tweets per class whose VADER markers
# agree with the labelled class, then the absolute number of agreeing tweets.
for classColumn, cleanedTweets, (totalPos, totalNeg, totalNeu) in (
    (obamaData['Class'], obamaCleaned, (ototalPositiveCount, ototalNegativeCount, ototalNeutralCount)),
    (romneyData['Class'], romneyCleaned, (rtotalPositiveCount, rtotalNegativeCount, rtotalNeutralCount)),
):
    hits = {1: 0, -1: 0, 0: 0}
    for index, label in enumerate(classColumn):
        expected = markerWords.get(label)
        if expected is not None and expected.issubset(cleanedTweets[index]):
            hits[label] += 1
    posCount, negCount, neuCount = hits[1], hits[-1], hits[0]

    print("Pos:", np.round(posCount/totalPos, 2)*100)
    print("Neg:", np.round(negCount/totalNeg, 2)*100)
    print("Neu:", np.round(neuCount/totalNeu, 2)*100)
    print("Total:", posCount + negCount + neuCount)

Pos: 19.0
Neg: 72.0
Neu: 23.0
Total: 2193
Pos: 15.0
Neg: 70.0
Neu: 22.0
Total: 2636


In [13]:
# Pooled corpus: Obama tweets followed by Romney tweets, cleaned with VADER markers.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
bothPrezCleaned = preProcessTasks(pd.concat([obamaData['Tweet'], romneyData['Tweet']]), True)

In [18]:
def overSample(data, labels):
    """Balance the classes of a vectorized data set with SMOTE.

    Args:
        data: sparse feature matrix; it is converted to a dense array first.
        labels: class labels aligned with the rows of ``data``.

    Returns:
        ``(data, labels)`` after resampling.
    """
    data = data.toarray()
    # SMOTE's own n_jobs argument is deprecated (imbalanced-learn 0.10) and absent from
    # recent releases; parallelism is configured on the neighbour search instead.
    # n_neighbors=6 matches the default k_neighbors=5, because the estimator is
    # queried with each sample included as its own first neighbour.
    neighborSearch = neighbors.NearestNeighbors(n_neighbors=6, n_jobs=-1)
    smote = SMOTE(random_state=871, k_neighbors=neighborSearch)
    data, labels = smote.fit_resample(data, labels)
    return data, labels

In [19]:
def vectorize(train, test):
    """Build TF-IDF matrices for a training and a test collection of token lists.

    The vectorizer is fitted on ``train`` only and then applied to ``test``.

    Args:
        train: iterable of token lists used to fit the vocabulary and IDF weights.
        test: iterable of token lists projected into the fitted feature space.

    Returns:
        ``(train_vector, test_vector)`` as produced by the vectorizer.
    """
    # The vectorizer expects strings, so each token list is joined back with spaces.
    trainDocs = [' '.join(tokens) for tokens in train]
    testDocs = [' '.join(tokens) for tokens in test]
    tfidf = feature_extraction.text.TfidfVectorizer(
        min_df=0.00125, max_df=0.7, sublinear_tf=True, use_idf=True, ngram_range=(1, 5)
    )
    return tfidf.fit_transform(trainDocs), tfidf.transform(testDocs)

In [20]:
# vectorize() expects token lists. Passing the raw tweet strings from the split made
# ' '.join() run over the characters of each tweet, so the cleaned tokens are split
# together with the raw tweets and the labels, and the held-out tokens are vectorized.
# NOTE(review): the vectorizer and SMOTE are still fitted on the full candidate corpus,
# so the held-out tweets are not unseen data for anything trained on these vectors.
obamaX_train, obamaX_test, obamaTokens_train, obamaTokens_test, obamay_train, obamay_test = model_selection.train_test_split(
    obamaData['Tweet'], obamaCleaned, obamaData['Class'], test_size=0.4)
obamaVector, obamaTestVector = vectorize(obamaCleaned, obamaTokens_test)
obamaOverSampled, obamaOverSampledLabels = overSample(obamaVector, obamaData['Class'])

romneyX_train, romneyX_test, romneyTokens_train, romneyTokens_test, romneyy_train, romneyy_test = model_selection.train_test_split(
    romneyData['Tweet'], romneyCleaned, romneyData['Class'], test_size=0.4)
romneyVector, romneyTestVector = vectorize(romneyCleaned, romneyTokens_test)
romneyOverSampled, romneyOverSampledLabels = overSample(romneyVector, romneyData['Class'])

In [21]:
# NOTE(review): bothPrez2Cleaned is not defined in any visible cell - confirm where it
# comes from, otherwise this cell fails on a fresh kernel.
bothVector, bothTest = vectorize(bothPrezCleaned, bothPrez2Cleaned)
# Labels in the same order as bothPrezCleaned (Obama first, then Romney).
# pd.concat replaces Series.append, which was removed in pandas 2.0.
bothOverSampled, bothOverSampledLabels = overSample(
    bothVector,
    pd.concat([obamaData['Class'], romneyData['Class']], ignore_index=True),
)

### Collection of classifiers

In [23]:
# Display name -> unfitted classifier evaluated in the cross-validation loop.
# The wrapped estimator of the Bagging/AdaBoost ensembles is passed positionally:
# the keyword was renamed from `base_estimator` to `estimator` (old name removed in
# scikit-learn 1.4), while the first positional slot works on both old and new releases.
models = {
    'Multinomial NB': MultinomialNB(),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=0),
    'Random Forest': ensemble.RandomForestClassifier(criterion='entropy', n_jobs=-1),
    'Logistic Regression': linear_model.LogisticRegression(max_iter=200),
    'SVM': svm.SVC(probability=True),
    'Ridge': linear_model.RidgeClassifier(),
    'Stochastic Gradient Descent': linear_model.SGDClassifier(),
    'Bagging with forest': ensemble.BaggingClassifier(ensemble.RandomForestClassifier(n_jobs=-1), n_jobs=-1),
    'K-Neighbors': neighbors.KNeighborsClassifier(n_jobs=-1),
    'AdaBoost forest': ensemble.AdaBoostClassifier(ensemble.RandomForestClassifier(n_jobs=-1))
}

In [23]:
def classifyMods(classifier, train_vectors, train_class):
    """Score a classifier with 5-fold cross-validated predictions.

    Args:
        classifier: estimator handed to ``cross_val_predict``.
        train_vectors: feature matrix.
        train_class: labels aligned with ``train_vectors``.

    Returns:
        ``(accuracy, precision, recall, f1)`` where the last three are per-class
        results ordered as [positive (1), negative (-1)].
    """
    preds = model_selection.cross_val_predict(classifier, train_vectors, train_class, cv=5, n_jobs=-1)
    # Only the positive and negative classes are reported, in that order.
    perClass = dict(average=None, labels=[1, -1])
    return (
        metrics.accuracy_score(train_class, preds),
        metrics.precision_score(train_class, preds, **perClass),
        metrics.recall_score(train_class, preds, **perClass),
        metrics.f1_score(train_class, preds, **perClass),
    )

In [47]:
def _scoreRecord(name, classifier, vectors, labels):
    """Cross-validate one classifier on one data set and return its metrics as a dict."""
    accScore, precision, recall, f1score = classifyMods(classifier, vectors, labels)
    # Index 0 is the positive class and index 1 the negative class (see classifyMods).
    return {
        'Classifier': name,
        'Accuracy': accScore,
        'Positive Precision': precision[0],
        'Negative Precision': precision[1],
        'Positive Recall': recall[0],
        'Negative Recall': recall[1],
        'Positive F1score': f1score[0],
        'Negative F1score': f1score[1],
    }


# One metrics record per classifier for the Obama, Romney and pooled data sets.
calcsO = []
calcsR = []
calcsBoth = []
for name, classifier in models.items():
    calcsO.append(_scoreRecord(name, classifier, obamaOverSampled, obamaOverSampledLabels))
    calcsR.append(_scoreRecord(name, classifier, romneyOverSampled, romneyOverSampledLabels))
    calcsBoth.append(_scoreRecord(name, classifier, bothOverSampled, bothOverSampledLabels))

For Obama, accuracy with Multinomial NB is 0.5801719777440566
For Romney, accuracy with Multinomial NB is 0.6201175250604909
For both, accuracy with Multinomial NB is 0.5810189947198793
For Obama, accuracy with Decision Tree is 0.49418310571573093
For Romney, accuracy with Decision Tree is 0.5915428044705612
For both, accuracy with Decision Tree is 0.5314407186449976
For Obama, accuracy with Random Forest is 0.548811330298432
For Romney, accuracy with Random Forest is 0.7050351422974998
For both, accuracy with Random Forest is 0.6015223205101831
For Obama, accuracy with Logistic Regression is 0.5835440903726185
For Romney, accuracy with Logistic Regression is 0.6559511464454431
For both, accuracy with Logistic Regression is 0.5932935609956799
For Obama, accuracy with SVM is 0.6032709492497049
For Romney, accuracy with SVM is 0.7378730268464109
For both, accuracy with SVM is 0.6604950970307892
For Obama, accuracy with Ridge is 0.5764626538526387
For Romney, accuracy with Ridge is 0.6596

In [48]:
# One result table per data set; each row is one classifier's metrics record.
obamaCrossValResult, romneyCrossValResult, bothCrossValResult = (
    pd.DataFrame.from_records(list(records)) for records in (calcsO, calcsR, calcsBoth)
)

In [49]:
# Display the cross-validation metrics per classifier for the Obama data set.
obamaCrossValResult

Unnamed: 0,Classifier,Accuracy,Positive Precision,Negative Precision,Positive Recall,Negative Recall,Positive F1score,Negative F1score
0,Multinomial NB,0.580172,0.618494,0.576795,0.673242,0.613556,0.644708,0.594608
1,Decision Tree,0.494183,0.556061,0.489775,0.556904,0.472433,0.556482,0.480947
2,Random Forest,0.548811,0.623447,0.53832,0.583713,0.593323,0.602926,0.564485
3,Logistic Regression,0.583544,0.634051,0.583088,0.655539,0.599899,0.644616,0.591374
4,SVM,0.603271,0.686869,0.592731,0.653515,0.643399,0.669777,0.617026
5,Ridge,0.576463,0.623776,0.57712,0.676783,0.571573,0.649199,0.574333
6,Stochastic Gradient Descent,0.575957,0.626999,0.58595,0.674254,0.556904,0.649768,0.571058
7,Bagging with forest,0.553532,0.619273,0.54923,0.594841,0.595346,0.606811,0.571359
8,K-Neighbors,0.523352,0.527506,0.549907,0.708144,0.44866,0.604621,0.49415
9,AdaBoost forest,0.552352,0.606061,0.549368,0.627213,0.593829,0.616455,0.570734


In [50]:
# Display the cross-validation metrics per classifier for the Romney data set.
romneyCrossValResult

Unnamed: 0,Classifier,Accuracy,Positive Precision,Negative Precision,Positive Recall,Negative Recall,Positive F1score,Negative F1score
0,Multinomial NB,0.620118,0.66479,0.612636,0.77532,0.563083,0.715813,0.586816
1,Decision Tree,0.591543,0.692602,0.540889,0.68925,0.580712,0.690922,0.560093
2,Random Forest,0.705035,0.797672,0.626961,0.805392,0.745938,0.801514,0.681294
3,Logistic Regression,0.655951,0.716441,0.638522,0.787764,0.585551,0.750412,0.610891
4,SVM,0.737873,0.859358,0.628611,0.823712,0.782233,0.841158,0.697058
5,Ridge,0.659638,0.709967,0.642143,0.81991,0.555133,0.760988,0.595476
6,Stochastic Gradient Descent,0.660214,0.718494,0.643055,0.817836,0.561701,0.764953,0.599631
7,Bagging with forest,0.677843,0.756393,0.608326,0.797442,0.717249,0.776376,0.658312
8,K-Neighbors,0.617698,0.641434,0.694698,0.890425,0.262703,0.745694,0.381239
9,AdaBoost forest,0.743404,0.838854,0.670581,0.840304,0.730384,0.839579,0.699206


In [51]:
# Display the cross-validation metrics per classifier for the pooled data set.
bothCrossValResult

Unnamed: 0,Classifier,Accuracy,Positive Precision,Negative Precision,Positive Recall,Negative Recall,Positive F1score,Negative F1score
0,Multinomial NB,0.581019,0.623872,0.56064,0.682781,0.569636,0.651999,0.565102
1,Decision Tree,0.531441,0.584858,0.521281,0.618186,0.519029,0.60106,0.520153
2,Random Forest,0.601522,0.679651,0.56112,0.689159,0.6223,0.684372,0.590129
3,Logistic Regression,0.593294,0.633535,0.575124,0.69101,0.548858,0.661025,0.561684
4,SVM,0.660495,0.752623,0.584962,0.737914,0.683398,0.745196,0.630361
5,Ridge,0.592128,0.630606,0.570995,0.708702,0.541864,0.667377,0.556048
6,Stochastic Gradient Descent,0.588973,0.608808,0.57596,0.725159,0.549064,0.66191,0.562191
7,Bagging with forest,0.589796,0.661344,0.554616,0.690187,0.606871,0.675458,0.579568
8,K-Neighbors,0.552218,0.559902,0.611642,0.800864,0.276692,0.659049,0.38102
9,AdaBoost forest,0.635466,0.714572,0.581504,0.73236,0.642872,0.723357,0.61065


In [32]:
def plotter():
    """Draw a grouped bar chart of cross-validation accuracy per classifier.

    Reads the module-level frames obamaCrossValResult, romneyCrossValResult and
    bothCrossValResult. The number of bar groups and the tick labels are taken
    from the 'Classifier' column; the previous hard-coded N = 9 and nine tick
    labels did not match the ten classifiers in the result frames, which makes
    plt.bar fail on mismatched lengths. All three frames are assumed to list the
    same classifiers in the same order (they are filled by the same loop).
    """
    shortLabels = {
        'Multinomial NB': 'MNB',
        'Decision Tree': 'Tree',
        'Random Forest': 'Forest',
        'Logistic Regression': 'LR',
        'SVM': 'SVM',
        'Ridge': 'Ridge',
        'Stochastic Gradient Descent': 'SGD',
        'Bagging with forest': 'Bagging',
        'K-Neighbors': 'KNN',
        'AdaBoost forest': 'AdaBoost',
    }
    classifierNames = obamaCrossValResult['Classifier'].tolist()
    ind = np.arange(len(classifierNames))
    width = 0.3
    plt.figure(figsize=(12,8))
    plt.grid(zorder=0)
    xvals = obamaCrossValResult['Accuracy'].tolist()
    bar1 = plt.bar(ind, xvals, width, color = '#4c72b0',zorder=3)

    yvals = romneyCrossValResult['Accuracy'].tolist()
    bar2 = plt.bar(ind+width, yvals, width, color='#c44e52',zorder=3)

    zvals = bothCrossValResult['Accuracy'].tolist()
    bar3 = plt.bar(ind+width*2, zvals, width, color = '#55a868',zorder=3)

    plt.ylabel('Accuracy')
    plt.title("Model accuracies")
    plt.ylim((0.0, 0.8))
    plt.yticks(np.arange(0.0,0.8,0.05))
    # Ticks sit under the middle bar of each group; unknown names fall back to the full name.
    plt.xticks(ind+width, [shortLabels.get(name, name) for name in classifierNames])
    plt.legend((bar1, bar2, bar3), ('Obama', 'Romney', 'Both'), loc='upper left', bbox_to_anchor=(1.01, 1))
    plt.show()

In [33]:
# Requires obamaCrossValResult, romneyCrossValResult and bothCrossValResult to exist in the kernel.
plotter()

NameError: name 'obamaCrossValResult' is not defined

In [26]:
%%time
eclf = ensemble.VotingClassifier(estimators=[
            ('nbm', models['Multinomial NB']),
            ('rf', models['Random Forest']),
            ('lr', models['Logistic Regression']),
            ('svc', models['SVM']),
            ('rc', models['Ridge']),
            ('sgd', models['Stochastic Gradient Descent']),
            ('bgf', models['Bagging with forest']),
            ('adf', models['AdaBoost forest'])
        ], voting='hard')

eclfR = ensemble.VotingClassifier(estimators=[
            ('nbm', models['Multinomial NB']),
            ('rf', models['Random Forest']),
            ('lr', models['Logistic Regression']),
            ('svc', models['SVM']),
            ('rc', models['Ridge']),
            ('sgd', models['Stochastic Gradient Descent']),
            ('bgf', models['Bagging with forest']),
            ('adf', models['AdaBoost forest'])
        ], voting='hard', n_jobs = -1)

eclf.fit(bothOverSampled, bothOverSampledLabels)
eclfR.fit(romneyOverSampled, romneyOverSampledLabels)

Done first
Wall time: 32min 19s


VotingClassifier(estimators=[('nbm', MultinomialNB()),
                             ('rf',
                              RandomForestClassifier(criterion='entropy',
                                                     n_jobs=-1)),
                             ('lr', LogisticRegression(max_iter=200)),
                             ('svc', SVC(probability=True)),
                             ('rc', RidgeClassifier()),
                             ('sgd', SGDClassifier()),
                             ('bgf',
                              BaggingClassifier(base_estimator=RandomForestClassifier(n_jobs=-1),
                                                n_jobs=-1)),
                             ('adf',
                              AdaBoostClassifier(base_estimator=RandomForestClassifier(n_jobs=-1)))],
                 n_jobs=-1)

## Demo

In [32]:
#Read data
# Each test workbook is read as a single unnamed column that is labelled 'Tweet'.
testFileObama = pd.read_excel("final-testData-no-label-Obama-tweets.xlsx", sheet_name='Obama', header=None, names=['Tweet'])
testFileRomney = pd.read_excel("final-testData-no-label-Romney-tweets.xlsx", sheet_name='Romney', header=None, names=['Tweet'])

#Clean data
obamaTestCleaned = preProcessTasks(testFileObama['Tweet'], True)
romneyTestCleaned = preProcessTasks(testFileRomney['Tweet'], True)

#Vectorize data
# eclf was fitted on vectors built from bothPrezCleaned and eclfR on vectors built from
# romneyCleaned, so the test tweets are projected with vectorizers fitted on those same
# corpora. The previous names (trainBothCleaned / trainRomneyCleaned) are not defined in
# any visible cell and fail on a fresh kernel.
obamaVector, obamaTestVector = vectorize(bothPrezCleaned, obamaTestCleaned)
romneyVector, romneyTestVector = vectorize(romneyCleaned, romneyTestCleaned)

#Predict class
# The ensembles were trained on dense arrays, hence the toarray() conversion.
testPredsO = eclf.predict(obamaTestVector.toarray())
testPredsR = eclfR.predict(romneyTestVector.toarray())

# Sanity check: one prediction per input row.
print("Input length Obama:", len(testFileObama))
print("Input length Romney:", len(testFileRomney))
print("Prediction length Obama:", len(testPredsO))
print("Prediction length Romney:", len(testPredsR))

Input length Obama: 1951
Input length Romney: 1900
Prediction length Obama: 1951
Prediction length Romney: 1900


TypeError: only integer scalar arrays can be converted to a scalar index

In [38]:
#Predict class
# NOTE(review): these repeat the eclf/eclfR predict calls under the names testO/testR;
# the file-writing cell reads testPredsO/testPredsR, not these variables.
testO = eclf.predict(obamaTestVector.toarray())
testR = eclfR.predict(romneyTestVector.toarray())

In [37]:
def _writePredictions(path, predictions):
    """Write one ``<1-based row number>;;<integer label>`` line per prediction to ``path``."""
    # The with-block closes the file; the old trailing `out.close` (no parentheses)
    # never called anything and only echoed a function repr as cell output.
    with open(path, "w") as out:
        for lineNo, label in enumerate(predictions, start=1):
            out.write("{};;{}\n".format(lineNo, int(label)))


_writePredictions("obama.txt", testPredsO)
_writePredictions("romney.txt", testPredsR)

<function TextIOWrapper.close()>