## Step 1. EDA

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/train.csv', nrows=1000,
                 dtype={'toxic':'int8',
                        'severe_toxic':'int8',
                        'obscene':'int8', 
                        'threat':'int8',
                        'insult':'int8',
                        'identity_hate':'int8'}).sample(frac=1, random_state=66136).reset_index(drop=True)

In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,003a19c04c079bf7,"LACK OF BALANCE\r\n\r\nThis article is seriously out of balance. It would benefit greatly from the introduction of additional information about the negative aspects of the T-34. Just for the record, this sort of balance problem is not uncommon when there is an over dependence on the works of a particular author. For example, testing and evaluation of a T-34 by the US Army Ordnance Dept. exposed some very serious problems with the tank. This type of information should be included in the article to bring it back into balance.",0,0,0,0,0,0
1,001ffdcc3e7fb49c,Awesome! Then I'll simply disregard your notice. Thanks!,0,0,0,0,0,0
2,0294749a6add04c9,"Feather \r\n\r\nThere is no citation as to how a quill looked in the past. Therefore, it is opinion only.",0,0,0,0,0,0
3,009a3333aa4ac011,|listas = Sabina of Bavaria,0,0,0,0,0,0
4,00a216c00b90ce88,"Gore response \r\n\r\nCan anybody find Gore's response to Bush's malaprop? Why didn't Gore, as the inventor of the darn thing, club him silly after that one?",0,0,0,0,0,0


In [5]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             1000 non-null   object
 1   comment_text   1000 non-null   object
 2   toxic          1000 non-null   int8  
 3   severe_toxic   1000 non-null   int8  
 4   obscene        1000 non-null   int8  
 5   threat         1000 non-null   int8  
 6   insult         1000 non-null   int8  
 7   identity_hate  1000 non-null   int8  
dtypes: int8(6), object(2)
memory usage: 582.3 KB


In [6]:
import spacy

In [7]:
nlp = spacy.load('en_core_web_sm') # Model needs to downloaded: python -m spacy download en_core_web_sm 

In [8]:
def lower_replace(series):
    res = series.str.lower()
    res = res.str.replace(r'[^\w\s]', '', regex=True)
    res = res.str.replace(r'\d','', regex=True)
    res = res.str.replace('\n', ' ', regex=False)
    res = res.str.replace('\t', ' ', regex=False)
    res = res.str.replace('\r', ' ', regex=False)
    res = res.str.replace(r'\s+', ' ', regex=True)
    return res
    
def tokenize_lemmatize(text, remove_stop=True):
    doc = nlp(text)
    norm = []
    if remove_stop:
        norm = [token.lemma_ for token in doc if not token.is_stop and (len(token.text) > 1 or token.text=='i')]
    else:
        norm = [token.lemma_ for token in doc if len(token.text) > 1 or token.text=='i']
    return ' '.join(norm)

In [9]:
def normalize_text(series, remove_stop_words=True):
    res = lower_replace(series)
    res = res.apply(tokenize_lemmatize, remove_stop=remove_stop_words)
    return res

In [10]:
df.insert(loc=2,column='text_clean_nostop',value=normalize_text(df.comment_text))
df.insert(loc=3,column='text_clean_withstop',value=normalize_text(df.comment_text, remove_stop_words=False))
#df.insert(loc=2,column='text_clean_nostop',value=pd.read_pickle('text_clean_nostop.pkl'))
#df.insert(loc=3,column='text_clean_withstop',value=pd.read_pickle('text_clean_withstop.pkl'))
pd.to_pickle(df.text_clean_nostop, 'text_clean_nostop.pkl')
pd.to_pickle(df.text_clean_withstop, 'text_clean_withstop.pkl')

In [11]:
df.head(5)

Unnamed: 0,id,comment_text,text_clean_nostop,text_clean_withstop,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,003a19c04c079bf7,"LACK OF BALANCE\r\n\r\nThis article is seriously out of balance. It would benefit greatly from the introduction of additional information about the negative aspects of the T-34. Just for the record, this sort of balance problem is not uncommon when there is an over dependence on the works of a particular author. For example, testing and evaluation of a T-34 by the US Army Ordnance Dept. exposed some very serious problems with the tank. This type of information should be included in the article to bring it back into balance.",lack balance article seriously balance benefit greatly introduction additional information negative aspect record sort balance problem uncommon dependence work particular author example testing evaluation army ordnance dept expose problem tank type information include article bring balance,lack of balance this article be seriously out of balance it would benefit greatly from the introduction of additional information about the negative aspect of the just for the record this sort of balance problem be not uncommon when there be an over dependence on the work of particular author for example testing and evaluation of by the us army ordnance dept expose some very serious problem with the tank this type of information should be include in the article to bring it back into balance,0,0,0,0,0,0
1,001ffdcc3e7fb49c,Awesome! Then I'll simply disregard your notice. Thanks!,awesome ill simply disregard notice thank,awesome then ill simply disregard your notice thank,0,0,0,0,0,0
2,0294749a6add04c9,"Feather \r\n\r\nThere is no citation as to how a quill looked in the past. Therefore, it is opinion only.",feather citation quill look past opinion,feather there be no citation as to how quill look in the past therefore it be opinion only,0,0,0,0,0,0
3,009a3333aa4ac011,|listas = Sabina of Bavaria,listas sabina bavaria,listas sabina of bavaria,0,0,0,0,0,0
4,00a216c00b90ce88,"Gore response \r\n\r\nCan anybody find Gore's response to Bush's malaprop? Why didn't Gore, as the inventor of the darn thing, club him silly after that one?",gore response anybody find gore response bushs malaprop not gore inventor darn thing club silly,gore response can anybody find gore response to bushs malaprop why do not gore as the inventor of the darn thing club he silly after that one,0,0,0,0,0,0


## Step 2. Vectorization

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
def create_tfidf_df(series, ngram_range=(1,2)):
    tv = TfidfVectorizer(ngram_range=ngram_range)
    tfidf = tv.fit_transform(series)
    tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf, columns=tv.get_feature_names_out())
    return tfidf_df

In [14]:
def create_countv_df(series, ngram_range=(1,2)):
    cv = CountVectorizer(ngram_range=ngram_range)
    countv = cv.fit_transform(series)
    countv_df = pd.DataFrame.sparse.from_spmatrix(countv, columns=cv.get_feature_names_out())
    return countv_df

In [15]:
pd.to_pickle(create_tfidf_df(df.text_clean_nostop,ngram_range=(1,2)),'tfidf_nostop_df.pkl')
pd.to_pickle(create_tfidf_df(df.text_clean_withstop, ngram_range=(1,2)),'tfidf_withstop_df.pkl')

tfidf_nostop_df = pd.read_pickle('tfidf_nostop_df.pkl')
tfidf_withstop_df = pd.read_pickle('tfidf_withstop_df.pkl')

In [16]:
pd.to_pickle(create_countv_df(df.text_clean_nostop,ngram_range=(1,2)), 'countv_nostop_df.pkl')
pd.to_pickle(create_countv_df(df.text_clean_withstop, ngram_range=(1,2)), 'countv_withstop_df.pkl')
 
countv_nostop_df = pd.read_pickle('countv_nostop_df.pkl') 
countv_withstop_df = pd.read_pickle('countv_withstop_df.pkl')

### Step 3.Train

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [18]:
X_df = pd.read_pickle('countv_nostop_df.pkl')

In [19]:
y = df[['toxic','severe_toxic','obscene','threat','insult', 'identity_hate']]
#y

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

model=OneVsRestClassifier(MultinomialNB())
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.715
              precision    recall  f1-score   support

           0       0.21      0.43      0.29        23
           1       0.00      0.00      0.00         3
           2       0.18      0.70      0.29        10
           3       0.00      0.00      0.00         1
           4       0.17      0.50      0.25        12
           5       0.00      0.00      0.00         0

   micro avg       0.13      0.47      0.20        49
   macro avg       0.09      0.27      0.14        49
weighted avg       0.18      0.47      0.25        49
 samples avg       0.04      0.04      0.04        49



In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

model = OneVsRestClassifier(LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500))
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))



Accuracy: 0.89
              precision    recall  f1-score   support

           0       1.00      0.22      0.36        23
           1       0.00      0.00      0.00         3
           2       1.00      0.20      0.33        10
           3       0.00      0.00      0.00         1
           4       0.50      0.08      0.14        12
           5       0.00      0.00      0.00         0

   micro avg       0.89      0.16      0.28        49
   macro avg       0.42      0.08      0.14        49
weighted avg       0.80      0.16      0.27        49
 samples avg       0.02      0.01      0.02        49

