# Logistic Regression 

Word Embedding Method 1 : BOW

Word Embedding Method 2 : TF-IDF

In [None]:
# libraries

import pandas as pd
import numpy as np
import re
import string
import itertools as it
import pickle
import os
from  pathlib import Path

import nltk
from nltk.corpus import stopwords                  
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# English stopwords: download the NLTK 'stopwords' corpus, then keep the
# English word list in a module-level name.
nltk.download('stopwords')
stopwords_english = stopwords.words('english')

### Data Manipulation

In [None]:
# data

from google.colab import drive
drive.mount('/content/gdrive')

# Single place to change if the CSV files live somewhere else.
DATA_DIR = Path('/content/gdrive/My Drive/Colab Notebooks/Data')

train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')
test_label_data = pd.read_csv(DATA_DIR / 'test_labels.csv')

# use only rows that were used for scoring: rows whose 'toxic' label is -1 are dropped
test_label_data = test_label_data.loc[test_label_data['toxic']!=-1]
# attach the comment text to the remaining label rows via the shared 'id' column
test = test_label_data.merge(test_data, on='id', how="inner")

In [77]:
# preprocess the comments

# Patterns are compiled once here instead of being looked up for every comment.
# Raw strings avoid the invalid-escape-sequence warning that '\w' / '\d' trigger
# in a normal string literal.
_LINK_RE = re.compile(r'https?://[^\s\n\r]+')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')

def preprocess(corpus, stop_words=None):
    """Lazily clean each text in ``corpus``.

    For every text: lowercase it, strip http(s) links, strip punctuation,
    strip words containing a digit, then drop stop words.

    Parameters
    ----------
    corpus : iterable of str
        The raw texts to clean.
    stop_words : iterable of str, optional
        Words to drop. Defaults to the module-level ``stopwords_english``.

    Yields
    ------
    str
        One cleaned string per input text, with the remaining words joined by
        single spaces. Removed links/words can leave empty tokens behind, so
        the result may contain runs of several spaces.
    """
    if stop_words is None:
        stop_words = stopwords_english
    # Set membership is O(1); the original scanned a list for every word.
    stop_set = frozenset(stop_words)

    for text in corpus:
        text = text.lower()                        # Lowercase
        text = _LINK_RE.sub('', text)              # Remove links
        text = _PUNCT_RE.sub('', text)             # Remove punctuation
        text = _WORD_WITH_DIGIT_RE.sub('', text)   # Remove words containing numbers

        yield ' '.join(word for word in text.split(' ') if word not in stop_set)

# Cleaned comment text for the training set
train_text = train_data['comment_text']
clean_comments = list(preprocess(train_text))
# Cleaned comment text for the scored test rows
test_text = test['comment_text']
test_clean_comments = list(preprocess(test_text))

# classification labels: one column per toxicity category
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
label = train_data[label_columns]

### Model

In [78]:
# Naive Bayes term probabilities with add-one smoothing

def probNB(bow,label,cat):
    """Smoothed per-term probability for the documents whose label equals ``cat``.

    Each term gets (count in class + 1) / (total count in class + vocabulary size),
    where the vocabulary size is ``bow.shape[1]``.

    The result is transposed before returning, so a 2-D row of sums (what the
    notebook's sparse matrices are expected to produce - TODO confirm) comes
    back as a column; a 1-D sum from a dense array keeps its shape.
    """
    rows_in_class = bow[label == cat]
    term_counts = np.array(rows_in_class.sum(axis=0))
    vocab_size = bow.shape[1]

    smoothed = (term_counts + 1) / (term_counts.sum() + vocab_size)
    return smoothed.T

In [79]:
# fit a logistic regression on the Naive Bayes log-ratio score of each document

def get_model(bow,label):
    """Train a logistic regression on one Naive Bayes feature per document.

    The feature is ``bow.dot(r)``, where ``r`` is the element-wise log of the
    ratio between the smoothed term probabilities of class 1 and class 0
    (both from ``probNB``).

    Returns
    -------
    (model, log_ratio)
        The fitted ``LogisticRegression`` and the log-ratio vector ``r``; the
        same vector is needed to project new documents before predicting.
    """
    prob_positive = probNB(bow, label, 1)
    prob_negative = probNB(bow, label, 0)
    log_ratio = np.log(prob_positive / prob_negative)

    doc_scores = bow.dot(log_ratio)
    classifier = LogisticRegression()
    classifier.fit(doc_scores, label)
    return classifier, log_ratio

### 1. BOW + NB + Logistic Regression

In [80]:
# word embedding method 1 : Bag of Words

# Keep terms that occur in at least 3 documents (min_df=3, an absolute count)
# and in at most 90% of the documents (max_df=0.9, a proportion)
vectorizer = CountVectorizer(min_df=3,max_df=0.9) 

# BOW for train dataset (this call also fits the vocabulary)
bow = vectorizer.fit_transform(clean_comments)
# BOW for test dataset (reuses the vocabulary fitted on the train dataset)
bow_test = vectorizer.transform(test_clean_comments) 

In [96]:
# Create model and calculate accuracy

# One row per scored test comment; a prediction column is added per category.
df_classification = pd.DataFrame({'Comments': test['comment_text']})

for category in label.columns:
    model, log = get_model(bow, label[category].values)

    # Project the test documents once and reuse the result for both the
    # stored predictions and the accuracy (the original projected twice and
    # predicted a second time inside model.score).
    test_scores = bow_test.dot(log)
    predictions = model.predict(test_scores)
    df_classification[category] = predictions

    # calculate accuracy
    score = metrics.accuracy_score(test[category], predictions)
    print("Accuracy for class {} is {}".format(category, score))

Accuracy for class toxic is 0.9278970896245584
Accuracy for class severe_toxic is 0.9937791115696021
Accuracy for class obscene is 0.9506549126262153
Accuracy for class threat is 0.9960767763918847
Accuracy for class insult is 0.9502641533026978
Accuracy for class identity_hate is 0.9884647847697646


In [93]:
# calculate F1-score

for category in label.columns:

    model, log = get_model(bow, label[category].values)
    predictions = model.predict(bow_test.dot(log))
    df_classification[category] = predictions

    # Stored as `f1` so the `f1_score` function imported from sklearn.metrics
    # is not overwritten by a float; arguments follow the (y_true, y_pred) order.
    f1 = metrics.f1_score(test[category], predictions)
    print("F1 score for class {} is {}".format(category, f1))

F1 score for class toxic is 0.5539978729575559
F1 score for class severe_toxic is 0.16736401673640167
F1 score for class obscene is 0.3443406022845275
F1 score for class threat is 0.03088803088803089
F1 score for class insult is 0.22841901066925313
F1 score for class identity_hate is 0.044041450777202076


### 2. TF-IDF + NB + Logistic Regression

In [None]:
# word embedding method 2 : TF-IDF 

# min_df=1 keeps every term that occurs in at least one document; max_df=0.9
# drops terms that occur in more than 90% of the documents
tfidf_vec = TfidfVectorizer(min_df=1,max_df=0.9)

# TF-IDF for train (this call also fits the vocabulary and IDF weights)
tfidf = tfidf_vec.fit_transform(clean_comments)
# TF-IDF for test (reuses what was fitted on the train dataset)
tfidf_test = tfidf_vec.transform(test_clean_comments)

In [97]:
# create model and calculate accuracy

# One row per scored test comment; a prediction column is added per category.
df_classification = pd.DataFrame({'Comments': test['comment_text']})

for category in label.columns:
    model, log = get_model(tfidf, label[category].values)

    # Project the test documents once and reuse the result for both the
    # stored predictions and the accuracy (the original projected twice and
    # predicted a second time inside model.score).
    test_scores = tfidf_test.dot(log)
    predictions = model.predict(test_scores)
    df_classification[category] = predictions

    # Accuracy
    score = metrics.accuracy_score(test[category], predictions)
    print("Accuracy for class {} is {}".format(category, score))

Accuracy for class toxic is 0.9161743099190347
Accuracy for class severe_toxic is 0.994248022757823
Accuracy for class obscene is 0.9481853137015849
Accuracy for class threat is 0.9967019913095126
Accuracy for class insult is 0.9470755572227954
Accuracy for class identity_hate is 0.9888711744662227


In [98]:
# calculate F1-score

for category in label.columns:

    model, log = get_model(tfidf, label[category].values)
    predictions = model.predict(tfidf_test.dot(log))
    df_classification[category] = predictions

    # Stored as `f1` so the `f1_score` function imported from sklearn.metrics
    # is not overwritten by a float; arguments follow the (y_true, y_pred) order.
    f1 = metrics.f1_score(test[category], predictions)
    print("F1 score for class {} is {}".format(category, f1))

F1 score for class toxic is 0.5187112985730952
F1 score for class severe_toxic is 0.0
F1 score for class obscene is 0.4075067024128687
F1 score for class threat is 0.0
F1 score for class insult is 0.2835378755818874
F1 score for class identity_hate is 0.0
