# Logistic Regression

Text Vectorization Method : TF-IDF

In [None]:
# libraries

import numpy as np 
import pandas as pd 

import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import nltk
from nltk.corpus import stopwords                  # module for stop words that come with NLTK
from nltk.stem.wordnet import WordNetLemmatizer    # module for lemmatization
from nltk import word_tokenize, pos_tag            # tokenization and Part of Speech tagging

# Download the NLTK stop-word corpus, then load its English word list.
nltk.download('stopwords')
stopwords_english = stopwords.words('english') # English stopwords, filtered out of the comments by preprocess()

import seaborn as sns
import matplotlib.pyplot as plt
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Manipulation

In [None]:
# data

from google.colab import drive
drive.mount('/content/gdrive')

# Raw CSVs read from the mounted Drive: training set, test comments, test labels.
train_data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/train.csv')
test_data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/test.csv')
test_label_data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/test_labels.csv')

# Rows whose 'toxic' label is -1 were not used for scoring: drop them, then
# attach the comment text to the remaining labels through the shared 'id'.
test_label_data = test_label_data[test_label_data['toxic'].ne(-1)]
test = pd.merge(test_label_data, test_data, on='id', how="inner")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# preprocess the comments

def preprocess(corpus, stop_words=None):
    """Lazily clean every comment in ``corpus``.

    Each string is lowercased, then hyperlinks, ASCII punctuation, tokens
    containing a digit, and stop words are removed.

    Parameters
    ----------
    corpus : iterable of str
        Raw comments.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopwords_english``
        list is used.

    Yields
    ------
    str
        One cleaned comment per input string: the surviving tokens joined by
        single spaces (an empty string when nothing survives).
    """
    if stop_words is None:
        stop_words = stopwords_english

    # Compile once instead of on every comment. Raw strings avoid the invalid
    # escape sequences ('\w', '\d') that a plain string literal would contain.
    link_re = re.compile(r'https?://[^\s\n\r]+')
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    numeric_word_re = re.compile(r'\w*\d\w*')

    # Punctuation is stripped from the text before stop words are filtered, so
    # a contraction such as "don't" reaches the filter as "dont". Keep both the
    # original and the punctuation-free spelling so those are removed as well.
    # A set also makes each membership test constant time.
    stop_set = set()
    for stop_word in stop_words:
        stop_set.add(stop_word)
        stop_set.add(punctuation_re.sub('', stop_word))

    for text in corpus:

        text = text.lower()                      # Lowercase
        text = link_re.sub('', text)             # Remove links
        text = punctuation_re.sub('', text)      # Remove punctuation
        text = numeric_word_re.sub('', text)     # Remove words containing numbers

        # split() without an argument breaks on any whitespace run, so stop
        # words next to newlines or tabs are recognised too.
        yield ' '.join(word for word in text.split() if word not in stop_set)

# Target columns; the model cells loop over these labels one at a time.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

# Run the preprocess generator to completion for the train and the test comments.
clean_comments = [comment for comment in preprocess(train_data['comment_text'])]
test_clean_comments = [comment for comment in preprocess(test['comment_text'])]

In [None]:
# TF-IDF features: the vectorizer is fitted on the training comments only and
# then applied unchanged to the test comments.
tfidf_vec = TfidfVectorizer(max_df=0.9, min_df=1)

Y_train = train_data[labels]
X_train = tfidf_vec.fit_transform(clean_comments)
X_test = tfidf_vec.transform(test_clean_comments)

# Frame that holds each test comment's id and text; prediction columns are
# added to it later, one per label.
df_classification = test[['id', 'comment_text']].copy()

### Model

In [None]:
# Cross-validated accuracy of one logistic-regression model per label.
scores = []

for label in labels:

    # Model. random_state pins the data shuffling done by the stochastic
    # 'saga' solver so the reported scores are reproducible between runs.
    LR = LogisticRegression(solver='saga', n_jobs=-1, C=0.5, random_state=42)

    # Mean accuracy over 3 folds. cross_val_score fits its own clones of LR,
    # so no additional fit on the full training set is needed for the score;
    # the model itself is not used for prediction in this cell.
    score = np.mean(cross_val_score(LR, X_train, Y_train[label], cv=3, n_jobs=-1, scoring='accuracy'))
    scores.append(score)
    print("Accuracy for class {} is {}".format(label, score))

print("Average accuracy: {}".format(np.mean(scores)))

KeyboardInterrupt: ignored

In [None]:
# Sanity check: shape of the test feature matrix returned by tfidf_vec.transform.
print(X_test.shape)

(63978, 214192)


In [None]:
# Cross-validated F1 score per label, followed by a fit on the full training
# set whose predicted probabilities for the test comments are stored in
# df_classification.

f1_scores = []

for label in labels:

    # Model. random_state pins the data shuffling done by the stochastic
    # 'saga' solver so the scores and predictions are reproducible.
    LR = LogisticRegression(solver='saga', n_jobs=-1, C=0.5, random_state=42)

    # Mean F1 over 3 folds. Stored as label_f1 (not f1_score) so that the
    # f1_score function imported from sklearn.metrics is not overwritten.
    label_f1 = np.mean(cross_val_score(LR, X_train, Y_train[label], cv=3, n_jobs=-1, scoring='f1'))
    f1_scores.append(label_f1)
    print("F1 score for class {} is {}".format(label, label_f1))

    # Fit on all training rows and keep the second predict_proba column
    # (presumably the probability that the label applies, assuming the
    # training labels are 0/1 - confirm against the data).
    LR.fit(X_train, Y_train[label])
    df_classification[label] = LR.predict_proba(X_test)[:,1]

print("Average F1 score: {}".format(np.mean(f1_scores)))

F1 score for class toxic is 0.6500233714915494
F1 score for class severe_toxic is 0.2670401910132863
F1 score for class obscene is 0.6779626665264592
F1 score for class threat is 0.07482626248557024
F1 score for class insult is 0.5519481447854053
F1 score for class identity_hate is 0.15913335650561927
Average F1 score: 0.3968223321346483


In [None]:
# Show the whole prediction table, then only the row(s) for one comment id.
print(df_classification)
print(df_classification[df_classification['id'].eq('005d4e5881163749')])

                     id                                       comment_text  \
0      0001ea8717f6de06  Thank you for understanding. I think very high...   
1      000247e83dcc1211                   :Dear god this site is horrible.   
2      0002f87b16116a7f  "::: Somebody will invariably try to add Relig...   
3      0003e1cccfd5a40a  " \n\n It says it right there that it IS a typ...   
4      00059ace3e3e9a53  " \n\n == Before adding a new product to the l...   
...                 ...                                                ...   
63973  fff8f64043129fa2  :Jerome, I see you never got around to this…! ...   
63974  fff9d70fe0722906  ==Lucky bastard== \n http://wikimediafoundatio...   
63975  fffa8a11c4378854  ==shame on you all!!!== \n\n You want to speak...   
63976  fffac2a094c8e0e2  MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...   
63977  fffb5451268fb5ba  " \n\n == Unicorn lair discovery == \n\n Suppo...   

          toxic  severe_toxic   obscene    threat    insult  id

In [None]:
# create CSV file
# Writes df_classification (id, comment text and the per-label predicted
# probabilities) to Drive; index=False leaves the DataFrame index out.
# NOTE(review): the file stores predictions, so "LR_precisions" looks like a
# typo for "LR_predictions" - confirm before renaming, other code may read it.

df_classification.to_csv("/content/gdrive/My Drive/Colab Notebooks/LR_precisions.csv", index=False)