In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

import numpy as np 
import pandas as pd 

import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords                  # module for stop words that come with NLTK
from nltk.stem.wordnet import WordNetLemmatizer    # module for lemmatization
from nltk import word_tokenize, pos_tag            # tokenization and Part of Speech tagging

nltk.download('stopwords')
stopwords_english = stopwords.words('english') # English stopwords

import seaborn as sns
import matplotlib.pyplot as plt
import os

Mounted at /content/gdrive
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# data
train = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/train.csv')
test = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/test.csv')
test_label = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/test_labels.csv')

# data info
train.info()
train.head(10)

# the % of toxic comments
test_label = test_label.loc[test_label['toxic']!=-1]
test_label.iloc[:,1:-1].sum(axis=0) / test_label.shape[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


toxic           0.095189
severe_toxic    0.005736
obscene         0.057692
threat          0.003298
insult          0.053565
dtype: float64

In [3]:
# preprocess the comments

# From a string, make text lowercase, remove hyperlinks, punctuation, word containing numbers, stopwords.
# Input : a list of string
# Output : a list of tokens stored in a generator (yield)

def preprocess(corpus):

    for text in corpus:

        text = text.lower()                                               # Lowercase
        text = re.sub(r'https?://[^\s\n\r]+', '', text)                   # Remove links
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)   # Remove punctuation
        text = re.sub('\w*\d\w*', '', text)                               # Remove words containing numbers
    
        yield ' '.join([word for word in text.split(' ') if word not in stopwords_english]) # Return a generator 

# clean comments to be fed
clean_comments = list(preprocess(train['comment_text']))

In [4]:
# logistic regression model

labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

vectorizer = TfidfVectorizer(analyzer='word',
                            stop_words='english',
                            ngram_range=(1, 3),
                            max_features=30000,
                            sublinear_tf=True)
X_train = vectorizer.fit_transform(train.comment_text)
X_test = vectorizer.transform(test.comment_text)
Y_train = train[labels]

submission = pd.DataFrame.from_dict({'id': test['id']})

# calculate cross validated f1 scores

scores = []

for label in labels:
    #build classifier
    LR = LogisticRegression(solver='saga', n_jobs=-1, C=0.5)
    
    #compute cv score
    cv_score = np.mean(cross_val_score(LR, X_train, Y_train[label], cv=3, n_jobs=-1, scoring='f1'))
    scores.append(cv_score)
    print("F1 score for class {} is {}".format(label, cv_score))
    
    #re-learn & predict
    LR.fit(X_train, Y_train[label])  
    submission[label] = LR.predict_proba(X_test)[:, 1] #predict
    
print("Average Cross Validated F1 scores: {}".format(np.mean(scores)))

F1 score for class toxic is 0.6598846803476125
F1 score for class severe_toxic is 0.23718403484316655
F1 score for class obscene is 0.6928714547173908
F1 score for class threat is 0.0401199152872513
F1 score for class insult is 0.5693808193729556
F1 score for class identity_hate is 0.15608408432807697
Average Cross Validated F1 scores: 0.3925874981494089
