# Logistic Regression

Text representation method: TF-IDF (sparse term-weighting features)

In [1]:
# libraries

import numpy as np 
import pandas as pd 

import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import nltk
from nltk.corpus import stopwords                  # module for stop words that come with NLTK
from nltk.stem.wordnet import WordNetLemmatizer    # module for lemmatization
from nltk import word_tokenize, pos_tag            # tokenization and Part of Speech tagging

# Download the NLTK 'stopwords' corpus that is read on the next line.
nltk.download('stopwords')
# English stop words; preprocess() drops every word found in this collection.
stopwords_english = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Data Manipulation

In [2]:
# Load the train / test CSV files from the mounted Google Drive

from google.colab import drive
drive.mount('/content/gdrive')

train_data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/train.csv')
test_data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/test.csv')
test_label_data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/test_labels.csv')

# Rows whose 'toxic' label is -1 were not used for scoring: discard them,
# then attach the comment text to the remaining labelled rows via their shared 'id'.
was_scored = test_label_data['toxic'] != -1
test_label_data = test_label_data[was_scored]
test = pd.merge(test_label_data, test_data, on='id', how="inner")

Mounted at /content/gdrive


In [3]:
# preprocess the comments

def preprocess(corpus, stop_words=None):
    """Lazily clean every text of ``corpus`` for vectorization.

    Each text is lowercased, then http(s) links, punctuation characters and
    words containing a digit are removed, and finally stop words are dropped.

    Parameters
    ----------
    corpus : iterable of str
        Raw texts to clean.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopwords_english``
        is used.

    Yields
    ------
    str
        The cleaned text: the surviving words joined by single spaces
        (one string per input text, in input order).
    """
    if stop_words is None:
        stop_words = stopwords_english
    # A set gives constant-time membership tests instead of scanning a list per word.
    stop_words = frozenset(stop_words)

    # Compile the patterns once instead of once per text.
    link_pattern = re.compile(r'https?://\S+')
    # Raw string: '\w' and '\d' in a plain literal are invalid escape sequences.
    digit_word_pattern = re.compile(r'\w*\d\w*')
    strip_punctuation = str.maketrans('', '', string.punctuation)

    for text in corpus:

        text = text.lower()                       # Lowercase
        text = link_pattern.sub('', text)         # Remove links (before punctuation, so '://' is still intact)
        text = text.translate(strip_punctuation)  # Remove punctuation
        text = digit_word_pattern.sub('', text)   # Remove words containing numbers

        # NOTE(review): punctuation is stripped before the stop-word check, so a
        # stop word that itself contains punctuation can never match here -
        # confirm whether that is intended.
        # split() with no argument splits on any whitespace run, so words next to
        # a newline or tab are checked too and no empty tokens are produced.
        yield ' '.join(word for word in text.split() if word not in stop_words)

# Names of the target columns
labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

# preprocess() is a generator: materialise the cleaned train comments into a list
clean_comments = [cleaned for cleaned in preprocess(train_data['comment_text'])]
# Same cleaning for the test comments
test_clean_comments = [cleaned for cleaned in preprocess(test['comment_text'])]

In [4]:
# TF-IDF features: the vectorizer is fitted on the training comments only,
# then the fitted vectorizer is reused to transform the test comments.
tfidf_vec = TfidfVectorizer(max_df=0.9, min_df=1)

X_train = tfidf_vec.fit_transform(clean_comments)
X_test = tfidf_vec.transform(test_clean_comments)
Y_train = train_data[labels]

# Result frame: starts with the test ids and raw comments; label columns are added later.
df_classification = pd.DataFrame({
    'id': test['id'],
    'comment_text': test['comment_text'],
})

### Model

In [22]:
# Train one logistic-regression classifier per label, report its cross-validated
# accuracy, and store its predicted test-set probabilities in df_classification.
scores = []

for label in labels:

    # Model
    LR = LogisticRegression(solver='saga', n_jobs=-1, C=0.5)

    # Mean accuracy over a 3-fold cross-validation on the training set
    score = np.mean(cross_val_score(LR, X_train, Y_train[label], cv=3, n_jobs=-1, scoring='accuracy'))
    scores.append(score)
    print("Accuracy for class {} is {}".format(label, score))

    # Refit on the whole training set before predicting on the test set
    LR.fit(X_train, Y_train[label])
    # predict_proba returns one column per class, ordered as LR.classes_; a single
    # DataFrame column needs a 1-D array, so keep only the second column.
    # assumes each label column is binary 0/1, making column 1 = P(label == 1) - TODO confirm
    df_classification[label] = LR.predict_proba(X_test)[:, 1]

print("Average accuracy: {}".format(np.mean(scores)))

Accuracy for class toxic is 0.9488628905011796
Accuracy for class severe_toxic is 0.9904368592197064
Accuracy for class obscene is 0.9733096874590889
Accuracy for class threat is 0.9970796704144117
Accuracy for class insult is 0.9670679556928617
Accuracy for class identity_hate is 0.9916212849613188
Average accuracy: 0.9780630580414278


In [None]:
# Cross-validated F1 score of the per-label logistic-regression classifiers

f1_scores = []

for label in labels:

    # Model
    LR = LogisticRegression(solver='saga', n_jobs=-1, C=0.5)

    # Mean F1 over a 3-fold cross-validation on the training set.
    # The result is stored in `label_f1` rather than `f1_score` so that the
    # imported sklearn.metrics.f1_score function is not overwritten.
    label_f1 = np.mean(cross_val_score(LR, X_train, Y_train[label], cv=3, n_jobs=-1, scoring='f1'))
    f1_scores.append(label_f1)
    print("F1 score for class {} is {}".format(label, label_f1))

print("Average F1 score: {}".format(np.mean(f1_scores)))

F1 score for class toxic is 0.6499655983851421
F1 score for class severe_toxic is 0.2670401910132863
F1 score for class obscene is 0.6779626665264592
F1 score for class threat is 0.07482626248557024
F1 score for class insult is 0.5520726719300045
F1 score for class identity_hate is 0.15913335650561927
Average F1 score: 0.39683345780768037


In [24]:
# Write the result frame to a CSV file on Google Drive (the row index is not written)

output_csv = "/content/gdrive/My Drive/Colab Notebooks/LR_precisions.csv"
df_classification.to_csv(path_or_buf=output_csv, index=False)