In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df[(df['identity_hate'] == 1) & (df['toxic'] == 1)].count()

id               1302
comment_text     1302
toxic            1302
severe_toxic     1302
obscene          1302
threat           1302
insult           1302
identity_hate    1302
dtype: int64

In [4]:
df.drop('id', axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
def conditions(row):
    if row['severe_toxic'] == 1 or row['toxic'] == 1 or row['obscene'] == 1 or row['threat'] == 1 or row['insult'] == 1 or row['identity_hate'] == 1:
        val = 1
    else:
        val = 0
    return val

#Apply the function to each data point in the data frame
df['OverAll_toxic']= df.apply(conditions, axis=1)

In [7]:
df.head(10)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,OverAll_toxic
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0
5,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,1
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,0
8,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,0
9,alignment on this subject and which are contra...,0,0,0,0,0,0,0


In [8]:
df.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], inplace=True, axis = 1)

In [9]:
df.head()

Unnamed: 0,comment_text,OverAll_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [10]:
df.isna().sum()

comment_text     0
OverAll_toxic    0
dtype: int64

In [11]:
df['OverAll_toxic'].value_counts()

0    143346
1     16225
Name: OverAll_toxic, dtype: int64

In [12]:
df[df['OverAll_toxic'] == 0]

Unnamed: 0,comment_text,OverAll_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X = df.drop('OverAll_toxic', axis=1)

In [15]:
y = df['OverAll_toxic']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df['comment_text'], df['OverAll_toxic'], test_size=0.15)

In [17]:
y_test.value_counts()

0    21518
1     2418
Name: OverAll_toxic, dtype: int64

In [18]:
y_train.value_counts()

0    121828
1     13807
Name: OverAll_toxic, dtype: int64

In [19]:
# Clean text
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
import string 
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
snowball = SnowballStemmer(language="english")
stop_words = stopwords.words("english")
def split_sentence_to_tokens(sentence: str):
    tokens = word_tokenize(sentence)
    tokens = [i for i in tokens if i not in string.punctuation]
    tokens = [i for i in tokens if i not in stop_words]
    tokens = [snowball.stem(i) for i in tokens]
    return tokens

In [21]:
vectorizer = TfidfVectorizer(tokenizer=lambda x : split_sentence_to_tokens(x))

In [22]:
X_train = vectorizer.fit_transform(X_train)

In [23]:
model = LogisticRegression(max_iter=150000)

In [24]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=150000)

In [25]:
X_test = vectorizer.transform(X_test)

In [26]:
model.score(X_test, y_test)

0.956759692513369

In [27]:
from sklearn.pipeline import Pipeline

In [38]:
model_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: split_sentence_to_tokens(x))),
    ('model', LogisticRegression(max_iter=150000))
])

In [39]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(df['comment_text'], df['OverAll_toxic'], test_size=0.15)

In [40]:
model_pipeline.fit(X_train2,y_train2)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x00000223D7B4C700>)),
                ('model', LogisticRegression(max_iter=150000))])

In [31]:
model_pipeline.score(X_test2, y_test2)

0.9596841577540107

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
grid_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: split_sentence_to_tokens(x))),
    ('model',
     GridSearchCV(
     LogisticRegression(max_iter=150000),
     param_grid={'C': [0.1,1,10]},
     cv=3,
     verbose=4
     )
    )
    
])

In [34]:
grid_pipeline.fit(X_train2, y_train2)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV 1/3] END .............................C=0.1;, score=0.929 total time=   5.1s
[CV 2/3] END .............................C=0.1;, score=0.929 total time=   4.4s
[CV 3/3] END .............................C=0.1;, score=0.929 total time=   4.4s
[CV 1/3] END ...............................C=1;, score=0.953 total time=  10.6s
[CV 2/3] END ...............................C=1;, score=0.953 total time=   9.2s
[CV 3/3] END ...............................C=1;, score=0.953 total time=  11.3s
[CV 1/3] END ..............................C=10;, score=0.958 total time=  18.9s
[CV 2/3] END ..............................C=10;, score=0.957 total time=  23.5s
[CV 3/3] END ..............................C=10;, score=0.957 total time=  19.9s


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x00000223E0B9B0D0>)),
                ('model',
                 GridSearchCV(cv=3,
                              estimator=LogisticRegression(max_iter=150000),
                              param_grid={'C': [0.1, 1, 10]}, verbose=4))])

In [35]:
from sklearn.naive_bayes import MultinomialNB
model_multinomialNB_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=lambda x: split_sentence_to_tokens(x))),
    ('model', MultinomialNB())
])

In [36]:
model_multinomialNB_pipeline.fit(X_train2, y_train2)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x00000223E0A96C10>)),
                ('model', MultinomialNB())])

In [37]:
model_multinomialNB_pipeline.score(X_test2, y_test2)

0.9192847593582888

In [46]:
def recognize_toxic_comment(comment):
    str = [comment]
    return model_pipeline.predict(str)[0]

In [47]:
recognize_toxic_comment("Hey, how are you?")

0