# Structure Based Hate Speech Detection

In [118]:
import csv
import nltk
import re
import numpy as np
from tqdm import tqdm

In [119]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from nltk.corpus import stopwords

In [120]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pandas as pd

In [121]:
DATA_PATH = "../data/"

In [122]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter=',')]
    return data

def getTweets(raw):
    #pass
    data = [x[6] for x in raw]
    return np.array(data)

def getClass(raw):
    #pass
    classes = [x[5] for x in raw]
    return np.array(classes)

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        tweet = tokenize(tweet)
#         print(tweet)
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
#     print(text)
    return text.split()
    #return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
#     print("F1 score:   ", f1)
#     print("Avg Recall: ", rec)    
#     print("Accuracy:   ", acc*100)
    return f1,acc*100,rec

In [123]:
DATA = DATA_PATH + "labeled_data.csv"

In [124]:
en_stopwords = set(stopwords.words("english")) 

raw = readData(DATA) 
r_tweets = getTweets(raw)
r_tweets = r_tweets[1:len(r_tweets)]
classes = getClass(raw)
classes = classes[1:len(classes)]
tweets = preprocess(r_tweets)

In [8]:
##for pos part
fd = pd.read_csv("../data/labeled_data.csv")
all_tweets = fd.tweet


In [8]:
# X = [x for x in r_tweets]
# X = np.delete(np.array(X), [0])
# y = np.delete(classes, [0])
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# X_train = np.array(X_train)

In [125]:
data = []
for t in tweets:
    x = ' '.join(t)
    data.append(x)
Y = classes
X_train, X_test, y_train, y_test = train_test_split(data, Y, test_size=0.2)

In [81]:
import warnings
warnings.filterwarnings("ignore")


### Logistic Regression

In [12]:
def LR(train_features,test_features,y_train,y_test,string):
    classifier = LogisticRegression(random_state=0, solver='lbfgs')   
    classifier.fit(train_features, y_train)
    y_predict = classifier.predict(test_features)
    f1,acc,rec=evaluate(y_test, y_predict)
    ans['model'].append(string)
    ans['F1-score'].append(f1)
    ans['Recall'].append(rec)
    ans['Accuracy'].append(acc)

### SVM

In [13]:
def SVM(train_features,test_features,y_train,y_test,string):
    classifier = SVC(C = 1,kernel='rbf')
    classifier.fit(train_features, y_train)
    y_predict = classifier.predict(test_features)
    f1,acc,rec=evaluate(y_test, y_predict)
    ans['model'].append(string)
    ans['F1-score'].append(f1)
    ans['Recall'].append(rec)
    ans['Accuracy'].append(acc)

## Word-level Tokens with CountVectorizer

In [24]:
params = [(1,1),(1,2),(1,3)]
keys = ['unigram','bigram','trigram']
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
for p in range(len(params)):
    vectorizer = CountVectorizer(
        analyzer = 'word',
        lowercase = True,
        tokenizer = tokenize,
        ngram_range=params[p],
        stop_words = en_stopwords)
    vectorizer.fit(X_train)
    train_features = vectorizer.transform(X_train)
    test_features = vectorizer.transform(X_test)
    string = keys[p]+' '+'using LR'
    LR(train_features,test_features,y_train,y_test,string)
    string = keys[p]+' '+'using SVM'
    SVM(train_features,test_features,y_train,y_test,string)

In [26]:
import operator
df = pd.DataFrame(ans)
df

Unnamed: 0,Accuracy,F1-score,Recall,model
0,88.380069,0.871655,0.650787,unigram using LR
1,76.538229,0.663664,0.333333,unigram using SVM
2,88.117813,0.866663,0.63488,bigram using LR
3,76.538229,0.663664,0.333333,bigram using SVM
4,87.936252,0.86342,0.62425,trigram using LR
5,76.538229,0.663664,0.333333,trigram using SVM


## Char-level Tokens with CountVectorizer

In [126]:
vectorizer = CountVectorizer(
    analyzer = 'char',
    lowercase = True,
    tokenizer = tokenize,
    ngram_range=(2, 6),
    stop_words = en_stopwords)
vectorizer.fit(X_train)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

In [130]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
string = 'Char level using LR'
LR(train_features,test_features,y_train,y_test,string)


In [131]:
string = 'Char level using SVM'
SVM(train_features,test_features,y_train,y_test,string)

In [132]:
df = pd.DataFrame(ans)
df

Unnamed: 0,Accuracy,F1-score,Recall,model
0,89.792213,0.890695,0.70755,Char level using LR
1,76.861005,0.668052,0.333333,Char level using SVM


## DL Models

## Using Torchtext

In [83]:
from torchtext.data import Field, TabularDataset
def cust_preprocess(tweet):
    tweet = removePattern(tweet, "@[\w]*")
    tweet = tweet.replace("#", "") # Removing '#' from hashtags
    tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
    tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
    tweet = re.sub(" +", " ", tweet)
    tweet = tweet.lower()
    tweet = tokenize(tweet)
    return tweet

In [84]:
TEXT = Field(sequential = True, tokenize = cust_preprocess, lower=True)
LABEL = Field(sequential = False, use_vocab=False)

In [85]:
tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("ct", None),
                 ("count", None),
                 ("hate_speech", LABEL),
                 ("offensive", LABEL),
                 ("neither", LABEL),
                 ("label", None),
                 ("tweet", TEXT)]

dt = TabularDataset(
               path=DATA_PATH + "labeled_data-mod.csv",
               format='csv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)
trn, dev,tst = dt.split([0.8,0.1,0.1])

In [86]:
TEXT.build_vocab(trn)

In [87]:
from torchtext.data import Iterator, BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, dev_iter = BucketIterator.splits(
     (trn, dev), # we pass in the datasets we want the iterator to draw data from
     batch_sizes=(64, 64),
     device=device, # if you want to use the GPU, specify the GPU number here
     sort_key=lambda x: len(x.tweet), # the BucketIterator needs to be told what function it should use to group the data.
     sort_within_batch=False,
     repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(tst, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False)

In [88]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y

    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [89]:
train_dl = BatchWrapper(train_iter, "tweet", ["hate_speech","offensive","neither"])
test_dl = BatchWrapper(test_iter, "tweet", ["hate_speech","offensive","neither"])
dev_dl = BatchWrapper(dev_iter, "tweet", ["hate_speech","offensive","neither"])

In [90]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [91]:
class LSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 3)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

## LSTM 128-dim embedding

In [92]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []

In [93]:
em_sz = 128
nh = 250
nl = 3
model = LSTMBaseline(nh, emb_dim=em_sz)
model.to(device)

LSTMBaseline(
  (embedding): Embedding(13794, 128)
  (encoder): LSTM(128, 250, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=250, out_features=3, bias=True)
)

In [94]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs = 10
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()

        running_loss += loss.data.item() * x.size(0)

    epoch_loss = running_loss / len(trn)

    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x,y in tqdm(dev_dl):
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(dev)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 310/310 [00:50<00:00,  6.13it/s]
100%|██████████| 39/39 [00:00<00:00, 69.26it/s] 
  0%|          | 1/310 [00:00<00:54,  5.69it/s]

Epoch: 1, Training Loss: 0.1853, Validation Loss: 0.0927


100%|██████████| 310/310 [00:41<00:00,  7.43it/s]
100%|██████████| 39/39 [00:00<00:00, 69.63it/s] 
  0%|          | 1/310 [00:00<00:52,  5.87it/s]

Epoch: 2, Training Loss: 0.1839, Validation Loss: 0.0951


100%|██████████| 310/310 [00:47<00:00,  6.48it/s]
100%|██████████| 39/39 [00:00<00:00, 68.11it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.1555, Validation Loss: 0.0532


100%|██████████| 310/310 [01:01<00:00,  5.01it/s]
100%|██████████| 39/39 [00:00<00:00, 69.13it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.0945, Validation Loss: 0.0487


100%|██████████| 310/310 [01:04<00:00,  4.78it/s]
100%|██████████| 39/39 [00:00<00:00, 68.26it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.0816, Validation Loss: 0.0503


100%|██████████| 310/310 [01:18<00:00,  3.95it/s]
100%|██████████| 39/39 [00:00<00:00, 53.90it/s]
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 6, Training Loss: 0.0734, Validation Loss: 0.0562


100%|██████████| 310/310 [01:44<00:00,  2.97it/s]
100%|██████████| 39/39 [00:00<00:00, 51.15it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 7, Training Loss: 0.0674, Validation Loss: 0.0558


100%|██████████| 310/310 [01:51<00:00,  2.78it/s]
100%|██████████| 39/39 [00:01<00:00, 37.41it/s]
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 8, Training Loss: 0.0622, Validation Loss: 0.0547


100%|██████████| 310/310 [01:52<00:00,  2.76it/s]
100%|██████████| 39/39 [00:00<00:00, 81.92it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 9, Training Loss: 0.0589, Validation Loss: 0.0551


100%|██████████| 310/310 [01:22<00:00,  3.75it/s]
100%|██████████| 39/39 [00:00<00:00, 54.14it/s] 

Epoch: 10, Training Loss: 0.0549, Validation Loss: 0.0594





In [95]:
# test_iter.
with torch.no_grad():
    test_preds = []
    ground_truth = []
    for x,y in tqdm(test_dl):
        preds = model(x)
        preds = F.softmax(preds)
        preds = preds.cpu()
        preds = preds.data.numpy()
        for result in preds:
            if np.argmax(result) == 0:
                test_preds.append([1, 0, 0])
            elif np.argmax(result) == 1:
                test_preds.append([0, 1, 0])
            elif np.argmax(result) == 2:
                test_preds.append([0, 0, 1])
        for val in y:
            ground_truth.append(val.cpu().data.numpy())

100%|██████████| 39/39 [00:01<00:00, 26.28it/s]


In [97]:
test_preds = np.array(test_preds)
ground_truth = np.array(ground_truth)
f1,acc,re = evaluate(ground_truth, test_preds)
string = 'LSTM 128-dim embedding'
ans['model'].append(string)
ans['F1-score'].append(f1)
ans['Recall'].append(re)
ans['Accuracy'].append(acc)

## LSTM 256-dim embedding

In [98]:
em_sz = 256
nh = 250
nl = 3
model = LSTMBaseline(nh, emb_dim=em_sz)
model.to(device)

LSTMBaseline(
  (embedding): Embedding(13794, 256)
  (encoder): LSTM(256, 250, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=250, out_features=3, bias=True)
)

In [99]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()
epochs = 10
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()

        running_loss += loss.data.item() * x.size(0)

    epoch_loss = running_loss / len(trn)

    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x,y in tqdm(dev_dl):
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)

    val_loss /= len(dev)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 310/310 [03:16<00:00,  1.57it/s]
100%|██████████| 39/39 [00:00<00:00, 74.20it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.1856, Validation Loss: 0.0941


100%|██████████| 310/310 [03:22<00:00,  1.53it/s]
100%|██████████| 39/39 [00:00<00:00, 72.46it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.1862, Validation Loss: 0.0996


100%|██████████| 310/310 [02:51<00:00,  1.80it/s]
100%|██████████| 39/39 [00:00<00:00, 74.74it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.1667, Validation Loss: 0.0648


100%|██████████| 310/310 [02:11<00:00,  2.36it/s]
100%|██████████| 39/39 [00:00<00:00, 62.41it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.1029, Validation Loss: 0.0577


100%|██████████| 310/310 [02:32<00:00,  2.03it/s]
100%|██████████| 39/39 [00:00<00:00, 51.12it/s]
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.0835, Validation Loss: 0.0548


100%|██████████| 310/310 [02:42<00:00,  1.91it/s]
100%|██████████| 39/39 [00:00<00:00, 58.30it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 6, Training Loss: 0.0756, Validation Loss: 0.0549


100%|██████████| 310/310 [02:59<00:00,  1.72it/s]
100%|██████████| 39/39 [00:00<00:00, 69.20it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 7, Training Loss: 0.0704, Validation Loss: 0.0610


100%|██████████| 310/310 [02:56<00:00,  1.76it/s]
100%|██████████| 39/39 [00:00<00:00, 72.95it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 8, Training Loss: 0.0648, Validation Loss: 0.0600


100%|██████████| 310/310 [03:00<00:00,  1.72it/s]
100%|██████████| 39/39 [00:00<00:00, 54.58it/s] 
  0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 9, Training Loss: 0.0609, Validation Loss: 0.0607


100%|██████████| 310/310 [03:00<00:00,  1.72it/s]
100%|██████████| 39/39 [00:00<00:00, 64.12it/s] 

Epoch: 10, Training Loss: 0.0561, Validation Loss: 0.0655





In [100]:
# test_iter.
with torch.no_grad():
    test_preds = []
    ground_truth = []
    for x,y in tqdm(test_dl):
        preds = model(x)
        preds = F.softmax(preds)
        preds = preds.cpu()
        preds = preds.data.numpy()
        for result in preds:
            if np.argmax(result) == 0:
                test_preds.append([1, 0, 0])
            elif np.argmax(result) == 1:
                test_preds.append([0, 1, 0])
            elif np.argmax(result) == 2:
                test_preds.append([0, 0, 1])
        for val in y:
            ground_truth.append(val.cpu().data.numpy())

100%|██████████| 39/39 [00:01<00:00, 23.15it/s]


In [102]:
test_preds = np.array(test_preds)
ground_truth = np.array(ground_truth)
f1,acc,rec = evaluate(ground_truth, test_preds)
ans['model'].append(string)
ans['F1-score'].append(f1)
ans['Recall'].append(rec)
ans['Accuracy'].append(acc)

In [108]:
ans['model'] = []
string = 'LSTM with 256-dim embedding'
ans['model'].append('LSTM with 128-dim embedding')
ans['model'].append(string)
ans['F1-score'] = ans['F1-score'][0:2]
df = pd.DataFrame(ans)
df

Unnamed: 0,Accuracy,F1-score,Recall,model
0,88.902341,0.882461,0.683782,LSTM with 128-dim embedding
1,87.691687,0.882461,0.641605,LSTM with 256-dim embedding


### Using surface features,linguistic features and sentiment analysis

In [34]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *
from nltk.stem.porter import *
from sklearn.linear_model import LogisticRegression

In [35]:
other_exclusions = ["#ff", "ff", "rt"]
stopwords = nltk.corpus.stopwords.words("english")
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()
def preprocessing(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()



In [36]:
y = fd['class'].astype(int)
ans = {}

In [37]:
def LR_l2(X_train,X_test,y_train,y_test,string):
    LR2 = LogisticRegression(random_state = 0,class_weight='balanced',penalty='l2',solver='lbfgs')
    LR2.fit(X_train, y_train)
    y_predict = LR2.predict(X_test)
    f1,acc,rec=evaluate(y_test, y_predict)
    ans['model'].append(string)
    ans['F1-score'].append(f1)
    ans['Recall'].append(rec)
    ans['Accuracy'].append(acc)



In [38]:
def LR_l1(X_train,X_test,y_train,y_test,string):
    LR1 = LogisticRegression(random_state = 0,class_weight='balanced',penalty="l1", C=0.1)
    LR1.fit(X_train, y_train)
    y_predict = LR1.predict(X_test)
    f1,acc,rec = evaluate(y_test, y_predict)
    ans['model'].append(string)
    ans['F1-score'].append(f1)
    ans['Recall'].append(rec)
    ans['Accuracy'].append(acc)


### n-grams each weighted by Tf-Idf

In [39]:
def tf_idf_gram(param,string):
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        preprocessor=preprocessing,
        ngram_range=(param),
        stop_words=stopwords,
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        min_df=5,
        max_df=0.75
        )
    #Construct tfidf matrix and get relevant scores
    tfidf = vectorizer.fit_transform(all_tweets).toarray()
    vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF score
    X = pd.DataFrame(tfidf)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    string1 = 'LR with L1 norm on ' + string
    LR_l1(X_train,X_test,y_train,y_test,string1)
    string2  = 'LR with L2 norm on ' + string
    LR_l2(X_train,X_test,y_train,y_test,string2)
    return tfidf

In [40]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
tfidf1=tf_idf_gram((1,1),'unigram')
tfidf2=tf_idf_gram((1,2),'digram')
tfidf3=tf_idf_gram((1,3),'trigram')

In [41]:
df = pd.DataFrame(ans)
df

Unnamed: 0,Accuracy,F1-score,Recall,model
0,89.570305,0.901749,0.805396,LR with L1 norm on unigram
1,84.526932,0.856532,0.708818,LR with L2 norm on unigram
2,89.711519,0.90208,0.803824,LR with L1 norm on digram
3,85.676821,0.864366,0.70808,LR with L2 norm on digram
4,89.02562,0.894869,0.773262,LR with L1 norm on trigram
5,84.728667,0.854472,0.691336,LR with L2 norm on trigram


### POS tags weighted by Tf-Idf

In [42]:
tweet_tags = []
for t in all_tweets:
    tokens = basic_tokenize(preprocessing(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)

In [43]:
def tf_idf_pos_tags(param,string):
    pos_vectorizer = TfidfVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=param,
    stop_words=None,
    use_idf=False,
    smooth_idf=False,
    norm=None,
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.75,
    )
    pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
    X = pd.DataFrame(pos)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    string1 = 'LR with L1 norm on ' + string
    LR_l1(X_train,X_test,y_train,y_test,string1)
    string2  = 'LR with L2 norm on ' + string
    LR_l2(X_train,X_test,y_train,y_test,string2)
    return pos

In [44]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
tfidf_pos1=tf_idf_pos_tags((1,1),'unigram')
tfidf_pos2=tf_idf_pos_tags((1,2),'digram')
tfidf_pos3=tf_idf_pos_tags((1,3),'trigram')


In [45]:
df = pd.DataFrame(ans)
df

Unnamed: 0,Accuracy,F1-score,Recall,model
0,75.186605,0.688983,0.357679,LR with L1 norm on unigram
1,47.791003,0.536009,0.398268,LR with L2 norm on unigram
2,75.892677,0.712052,0.376555,LR with L1 norm on digram
3,50.877547,0.569286,0.435831,LR with L2 norm on digram
4,75.912851,0.717261,0.382188,LR with L1 norm on trigram
5,55.275368,0.602912,0.412881,LR with L2 norm on trigram


### Sentiment Analysis

In [46]:
#Now get other features
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocessing(tweet) #Get text only
    
    syllables = textstat.syllable_count(words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet)
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [47]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", \
                        "vader compound", "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

In [48]:
feats = get_feature_array(all_tweets)

In [52]:
X = pd.DataFrame(feats)
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
string1 = 'LR with L1 norm'
LR_l1(X_train,X_test,y_train,y_test,string1)
string2  = 'LR with L2 norm'
LR_l2(X_train,X_test,y_train,y_test,string2)


In [53]:
df = pd.DataFrame(ans)
df

Unnamed: 0,Accuracy,F1-score,Recall,model
0,75.912851,0.748859,0.488027,LR with L1 norm
1,63.445632,0.669407,0.542719,LR with L2 norm


### Using all the three features together

In [57]:
M = np.concatenate([tfidf2,tfidf_pos2,feats],axis=1)

In [75]:
ans = {}
ans['model'] = []
ans['F1-score'] = []
ans['Recall'] = []
ans['Accuracy'] = []

In [76]:
X = pd.DataFrame(M)
y = fd['class'].astype(int)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [78]:
string = "LR_l1 using basic features"
LR_l1(X_train,X_test,y_train,y_test,string)

In [79]:
string = "LR_l2 using basic features"
LR_l2(X_train,X_test,y_train,y_test,string)

In [80]:
df = pd.DataFrame(ans)
df

Unnamed: 0,Accuracy,F1-score,Recall,model
0,89.913254,0.902852,0.790027,LR_l1 using basic features
1,88.319548,0.891135,0.782296,LR_l2 using basic features
