In [14]:
#importing libraries
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import numpy as np
import string
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

import warnings
warnings.filterwarnings("ignore")

print("Setup complete")

Setup complete


In [15]:
#Dataset preparation
tweets = pd.read_csv("Final dataset 2.csv")
tweets = tweets[['content', 'label']]

df = pd.DataFrame()
df['tweet'] = tweets['content']
df['label'] = tweets['label']
df.head()

Unnamed: 0,tweet,label
0,"['giuliani', 'tax', 'report', 'proves', 'trump...",0
1,"['trump', 'temp', 'crab', 'orchard', 'ky', 'f'...",0
2,"['iconic', 'charcoaler', 'hamburger', 'brand']",1
3,"['new', 'audio', 'clinton', 'refers', 'sanders...",0
4,"['make', 'money', 'sleeping', 'please', 'syeck...",1


In [16]:
#Splitting the dataset into training and testing datasets
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['tweet'], df['label'])

print("Size of the training set: ", len(train_x))
print("Size of the testing set: ", len(test_x))

#Label encoding the target variable
enc = preprocessing.LabelEncoder()
train_y = enc.fit_transform(train_y)
test_y = enc.fit_transform(test_y)

Size of the training set:  13184
Size of the testing set:  4395


In [17]:
#Feature Engineering

In [18]:
#Binary Count Vector as features
binary_count_vect = CountVectorizer(analyzer = 'word', token_pattern = r'\w{1,}', binary = True)
binary_count_vect.fit(df['tweet'])

#Creating a count vectorizer object
count_vect = CountVectorizer(analyzer = 'word', token_pattern = r'\w{1,}')
count_vect.fit(df['tweet'])

#Transforming the training and validation data using count vectorizer object
xtrain_binary_count = binary_count_vect.transform(train_x)
xtest_binary_count = binary_count_vect.transform(test_x)

In [19]:
#Count Vector as features


#Creating a count vectorizer object
count_vect = CountVectorizer(analyzer = 'word', token_pattern = r'\w{1,}')
count_vect.fit(df['tweet'])

#Transforming the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xtest_count = count_vect.transform(test_x)

In [20]:
#TF-IDF as features


#Word-level TF-IDF
tfidf_vect_word = TfidfVectorizer(analyzer = 'word', token_pattern = r'\w{1,}', max_features = 5000)
tfidf_vect_word.fit(df['tweet'])
xtrain_tfidf_word = tfidf_vect_word.transform(train_x)
xtest_tfidf_word = tfidf_vect_word.transform(test_x)

#Character-level TF-IDF
tfidf_vect_char = TfidfVectorizer(analyzer = 'char', token_pattern = r'\w{1,}', ngram_range = (2,3), max_features = 5000)
tfidf_vect_char.fit(df['tweet'])
xtrain_tfidf_char = tfidf_vect_char.transform(train_x)
xtest_tfidf_char = tfidf_vect_char.transform(test_x)

#n-gram-level TF-IDF
tfidf_vect_ngram = TfidfVectorizer(analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (2,3), max_features = 5000)
tfidf_vect_ngram.fit(df['tweet'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)

In [21]:
#Model Building

In [22]:
def train_model(classifier, feature_vector_train, label, feature_vector_test):
    #Fitting the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    #Predicting the labels on testing dataset
    train_pred = classifier.predict(feature_vector_train)
    test_pred = classifier.predict(feature_vector_test)
    
    train_acc = metrics.accuracy_score(train_pred, train_y)
    test_acc = metrics.accuracy_score(test_pred, test_y)
    cm = metrics.confusion_matrix(test_y, test_pred)
    """
    print("Training accuracy: ", train_acc)
    print("Testing accuracy: ", test_acc)
    print("Confusion matrix: ", cm)
    """
    return test_acc

#Naive Bayes on Character-level TF-IDF
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_char, train_y, xtest_tfidf_char)
print("NB, Character-level TF-IDF: ", accuracy)

NB, Character-level TF-IDF:  0.8714448236632537


In [23]:
#Naive Bayes on Binary Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_binary_count, train_y, xtest_binary_count)
print("NB, Binary Count Vectors: ", accuracy)

#Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)
print("NB, Count Vectors: ", accuracy)

#Naive Bayes on Word-level TF-IDF
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_word, train_y, xtest_tfidf_word)
print("NB, Word-level TF-IDF: ", accuracy)

#Naive Bayes on Character-level TF-IDF
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_char, train_y, xtest_tfidf_char)
print("NB, Character-level TF-IDF: ", accuracy)

#Naive Bayes on n-gram TF-IDF
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("NB, n-gram TF-IDF: ", accuracy)

NB, Binary Count Vectors:  0.8687144482366326
NB, Count Vectors:  0.8696245733788396
NB, Word-level TF-IDF:  0.855745164960182
NB, Character-level TF-IDF:  0.8714448236632537
NB, n-gram TF-IDF:  0.7754266211604095


In [24]:
#Logistic Regression on Binary Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_binary_count, train_y, xtest_binary_count)
print("LR, Binary Count Vectors: ", accuracy)

#Logistic Regression on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)
print("LR, Count Vectors: ", accuracy)

#Logistic Regression on Word-level TF-IDF
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_word, train_y, xtest_tfidf_word)
print("LR, Word-level TF-IDF: ", accuracy)

#Logistic Regression on Character-level TF-IDF
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_char, train_y, xtest_tfidf_char)
print("LR, Character-level TF-IDF: ", accuracy)

#Logistic Regression on n-gram TF-IDF
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("LR, n-gram TF-IDF: ", accuracy)

LR, Binary Count Vectors:  0.8994311717861206
LR, Count Vectors:  0.8987485779294653
LR, Word-level TF-IDF:  0.8937428896473265
LR, Character-level TF-IDF:  0.8939704209328783
LR, n-gram TF-IDF:  0.8061433447098976


In [25]:
#Support Vector Machine

from sklearn.svm import SVC

#Support Vector Machine on Binary Count Vectors
accuracy = train_model(svm.SVC(kernel = 'linear', C = 10), xtrain_binary_count, train_y, xtest_binary_count)
print("SVM, Binary Count Vectors: ", accuracy)

#Support Vector Machine on Count Vectors
accuracy = train_model(svm.SVC(kernel = 'linear', C = 10), xtrain_count, train_y, xtest_count)
print("SVM, Count Vectors: ", accuracy)

#Support Vector Machine on Word-level TF-IDF
accuracy = train_model(svm.SVC(kernel = 'linear', C = 10), xtrain_tfidf_word, train_y, xtest_tfidf_word)
print("SVM, Word-level TF-IDF: ", accuracy)

#Support Vector Machine on Character-level TF-IDF
accuracy = train_model(svm.SVC(kernel = 'linear', C = 10), xtrain_tfidf_char, train_y, xtest_tfidf_char)
print("SVM, Character-level TF-IDF: ", accuracy)

#Support Vector Machine on n-gram TF-IDF
accuracy = train_model(svm.SVC(kernel = 'linear', C = 10), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("SVM, n-gram TF-IDF: ", accuracy)

SVM, Binary Count Vectors:  0.8234357224118316
SVM, Count Vectors:  0.825938566552901
SVM, Word-level TF-IDF:  0.8573378839590444


KeyboardInterrupt: 

In [None]:
#Random Forest

#Random Forest on Binary Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_binary_count, train_y, xtest_binary_count)
print("RF, Binary Count Vectors: ", accuracy)

#Support Vector Machine on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xtest_count)
print("RF, Count Vectors: ", accuracy)

#Support Vector Machine on Word-level TF-IDF
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_word, train_y, xtest_tfidf_word)
print("RF, Word-level TF-IDF: ", accuracy)

#Support Vector Machine on Character-level TF-IDF
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_char, train_y, xtest_tfidf_char)
print("RF, Character-level TF-IDF: ", accuracy)

#Support Vector Machine on n-gram TF-IDF
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("RF, n-gram TF-IDF: ", accuracy)