In [None]:
import pandas as pd
import spacy
from pandarallel import pandarallel
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load spaCy's small English pipeline; the lemmatizer and stop-word helpers below call it.
nlp = spacy.load("en_core_web_sm")
# Start pandarallel so DataFrame.parallel_apply can be used (progress bars switched off).
pandarallel.initialize(progress_bar=False)
# Shared whitespace tokenizer used by removePunctuationsAndNumbersPandas.
wst = WhitespaceTokenizer()

In [None]:
# Training data. Later cells read its 'offensiveness_score' and 'processed' columns;
# no cell in this notebook creates 'processed', so it presumably ships in the CSV -- TODO confirm.
df_train = pd.read_csv('../input/ruddit-jigsaw-dataset-combined-cleaned/toxic_train.csv')

In [None]:
# Preview the first 10 rows of the training frame as loaded.
df_train.head(10)

In [None]:
# Validation data: the cleaning cells read its 'less_toxic' and 'more_toxic' text columns.
df_validate = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [None]:
# Preview the first 10 rows of the validation frame as loaded.
df_validate.head(10)

In [None]:
# Comments to score: the cells below read 'text' and write 'comment_id' into the submission.
df_test = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Preview the first 10 rows of the frame of comments to score.
df_test.head(10)

### Data Cleaning Functions

In [None]:
def dropNA(df):
    """Return a new frame holding only the rows of ``df`` that have no missing values.

    Args:
        df: DataFrame to filter; it is not modified.

    Returns:
        The result of ``df.dropna()``.
    """
    complete_rows = df.dropna()
    return complete_rows

In [None]:
def removeDeletedPandas(data, column):
    """Tell whether one row's comment is the ``[deleted]`` placeholder.

    Args:
        data: A row-like mapping (e.g. the Series passed by ``apply(axis=1)``).
        column: Key of the text field inside ``data``.

    Returns:
        bool: ``True`` when the text, ignoring surrounding whitespace, is exactly
        ``'[deleted]'``; ``False`` for any other text and for non-string values.
    """
    text = data[column]
    # Missing cells arrive as float NaN, which has no ``strip``; they are not "deleted".
    if not isinstance(text, str):
        return False
    # Always return a real bool rather than True / implicit None.
    return text.strip() == '[deleted]'

def dropDeletedComments(df, text_column):
    """Return the rows of ``df`` whose comment is not the ``[deleted]`` placeholder.

    The filter is computed as a standalone boolean mask, so no temporary
    ``isDeleted`` column is ever written to the caller's frame.

    Args:
        df: DataFrame holding the comments; it is left unmodified.
        text_column: Name of the column that stores the comment text.

    Returns:
        A DataFrame with the same columns as ``df``, minus the rows whose text,
        ignoring surrounding whitespace, equals ``'[deleted]'``. Rows with a
        non-string (e.g. missing) value are kept.
    """
    # The predicate is a single string comparison, so an element-wise map is enough.
    # astype(bool) keeps the mask usable even when the column is empty.
    is_deleted = df[text_column].map(
        lambda text: isinstance(text, str) and text.strip() == '[deleted]'
    ).astype(bool)
    return df[~is_deleted]

In [None]:
def removeNewLinesPandas(data, column):
    """Collapse every whitespace run in one row's text into a single space.

    Args:
        data: A row-like mapping (e.g. the Series passed by ``apply(axis=1)``).
        column: Key of the text field inside ``data``.

    Returns:
        The text with newlines, tabs and repeated spaces replaced by single
        spaces and with no leading or trailing whitespace.
    """
    pieces = data[column].split()
    return " ".join(pieces)

def removeNewLines(df, text_column, out_column):
    """Write a whitespace-flattened copy of ``text_column`` into ``out_column``.

    Args:
        df: DataFrame to process; the output column is assigned on this object.
        text_column: Column holding the source text.
        out_column: Column that receives the flattened text (may equal ``text_column``).

    Returns:
        The same ``df`` object, with ``out_column`` set.
    """
    flattened = df.parallel_apply(removeNewLinesPandas, axis=1, args=(text_column,))
    df[out_column] = flattened
    return df

In [None]:
def lemmatizeSpacyPandas(data, column):
    """Replace every token of one row's text with its spaCy lemma.

    Args:
        data: A row-like mapping (e.g. the Series passed by ``apply(axis=1)``).
        column: Key of the text field inside ``data``.

    Returns:
        The lemmas in document order, each followed by one space, so a
        non-empty result ends with a trailing space.
    """
    parsed = nlp(data[column])
    return ''.join(f"{token.lemma_} " for token in parsed)

def lemmatizeComments(df, text_column, out_column):
    """Write the lemmatized form of ``text_column`` into ``out_column``.

    Args:
        df: DataFrame to process; the output column is assigned on this object.
        text_column: Column holding the source text.
        out_column: Column that receives the lemmatized text (may equal ``text_column``).

    Returns:
        The same ``df`` object, with ``out_column`` set.
    """
    lemmatized = df.parallel_apply(lemmatizeSpacyPandas, axis=1, args=(text_column,))
    df[out_column] = lemmatized
    return df

In [None]:
def removePunctuationsAndNumbersPandas(data, column):
    """Keep only the letters of one row's text, lower-cased and re-tokenized.

    Args:
        data: A row-like mapping (e.g. the Series passed by ``apply(axis=1)``).
        column: Key of the text field inside ``data``.

    Returns:
        The NLTK word tokens of the cleaned text joined by single spaces.
    """
    letters_only = []
    for chunk in wst.tokenize(data[column]):
        # Drop every non-alphabetic character, then lower-case what remains.
        kept = ''.join(filter(str.isalpha, chunk))
        letters_only.append(kept.lower())
    # Re-tokenize the cleaned string with NLTK before joining it back up.
    retokenized = word_tokenize(' '.join(letters_only))
    return ' '.join(retokenized)

def removePunctuationsAndNumbers(df, text_column, out_column):
    """Write a letters-only, lower-cased copy of ``text_column`` into ``out_column``.

    Args:
        df: DataFrame to process; the output column is assigned on this object.
        text_column: Column holding the source text.
        out_column: Column that receives the cleaned text (may equal ``text_column``).

    Returns:
        The same ``df`` object, with ``out_column`` set.
    """
    cleaned = df.parallel_apply(removePunctuationsAndNumbersPandas, axis=1, args=(text_column,))
    df[out_column] = cleaned
    return df

In [None]:
def removeNonAlphaPandas(data, column):
    """Strip every non-alphabetic character from one row's text.

    The text is split on whitespace and each word keeps only its alphabetic
    characters. Words left empty (e.g. ``"123"`` or ``"..."``) are dropped, so
    the result never contains doubled, leading or trailing spaces.

    Args:
        data: A row-like mapping (e.g. the Series passed by ``apply(axis=1)``).
        column: Key of the text field inside ``data``.

    Returns:
        The surviving words joined by single spaces; ``''`` when nothing survives.
    """
    letters_only = (
        ''.join(ch for ch in word if ch.isalpha())
        for word in data[column].split()
    )
    # Skip words that had no letters at all, rather than joining empty strings.
    return " ".join(word for word in letters_only if word)

def removeNonAlpha(df, text_column, out_column):
    """Write a copy of ``text_column`` without non-alphabetic characters into ``out_column``.

    Args:
        df: DataFrame to process; the output column is assigned on this object.
        text_column: Column holding the source text.
        out_column: Column that receives the cleaned text (may equal ``text_column``).

    Returns:
        The same ``df`` object, with ``out_column`` set.
    """
    alpha_only = df.parallel_apply(removeNonAlphaPandas, axis=1, args=(text_column,))
    df[out_column] = alpha_only
    return df

In [None]:
def cleanStopWordsPandas(data, column):
    """Remove spaCy stop words from one row's text.

    Args:
        data: A row-like mapping (e.g. the Series passed by ``apply(axis=1)``).
        column: Key of the text field inside ``data``.

    Returns:
        The texts of the non-stop-word tokens, in order, joined by single spaces.
    """
    # Only tokenization is needed here. make_doc runs the tokenizer alone, while
    # calling nlp(...) would run every pipeline component on each comment for no benefit.
    tokens = nlp.make_doc(data[column])
    # is_stop is a lexical flag, so reading it from the token matches looking the
    # token text up in nlp.vocab.
    return " ".join(token.text for token in tokens if not token.is_stop)

def cleanStopWords(df, text_column, out_column):
    """Write a stop-word-free copy of ``text_column`` into ``out_column``.

    Args:
        df: DataFrame to process; the output column is assigned on this object.
        text_column: Column holding the source text.
        out_column: Column that receives the filtered text (may equal ``text_column``).

    Returns:
        The same ``df`` object, with ``out_column`` set.
    """
    without_stop_words = df.parallel_apply(cleanStopWordsPandas, axis=1, args=(text_column,))
    df[out_column] = without_stop_words
    return df

#### Clean Validation Data

In [None]:
# Validation frame before any cleaning step has run.
df_validate.head(10)

In [None]:
# Drop rows with missing values, then flatten whitespace and lemmatize both sides of each pair.
df_validate = dropNA(df_validate)
for raw_col in ('less_toxic', 'more_toxic'):
    df_validate = removeNewLines(df_validate, raw_col, f'{raw_col}_processed')
for clean_col in ('less_toxic_processed', 'more_toxic_processed'):
    df_validate = lemmatizeComments(df_validate, clean_col, clean_col)

In [None]:
# Apply the remaining cleaning steps in place on both processed columns:
# each step runs on 'less_toxic_processed' first, then on 'more_toxic_processed'.
for clean_step in (removePunctuationsAndNumbers, removeNonAlpha, cleanStopWords):
    for clean_col in ('less_toxic_processed', 'more_toxic_processed'):
        df_validate = clean_step(df_validate, clean_col, clean_col)

In [None]:
# Validation frame after cleaning: the '*_processed' columns now sit next to the raw text.
df_validate.head(10)

#### Clean Test Data

In [None]:
# Test frame before any cleaning step has run.
df_test.head(10)

In [None]:
# Drop rows with missing values, flatten whitespace into 'processed', then lemmatize it in place.
df_test = (
    df_test
    .pipe(dropNA)
    .pipe(removeNewLines, 'text', 'processed')
    .pipe(lemmatizeComments, 'processed', 'processed')
)

In [None]:
# Apply the remaining cleaning steps to 'processed', in the same order as for the validation data.
for clean_step in (removePunctuationsAndNumbers, removeNonAlpha, cleanStopWords):
    df_test = clean_step(df_test, 'processed', 'processed')

In [None]:
# Test frame after cleaning: 'processed' holds the cleaned text.
df_test.head(10)

#### Create a Binary Field For Classification

In [None]:
# Binary target: 1 when offensiveness_score is strictly positive, otherwise 0.
# A vectorised comparison gives the same 0/1 integers as a row-wise apply
# (missing scores compare False and so become 0) without per-row Python calls.
df_train['isOffensive'] = (df_train['offensiveness_score'] > 0).astype(int)

#### Model 2: TF-IDF, Binary Y, Naive Bayes

In [None]:
# Training frame with the new 'isOffensive' column added.
df_train.head(10)

In [None]:
# TF-IDF vectorizer with library defaults; it is fitted on the training text in a later cell.
tfidfVectorizer = TfidfVectorizer()

In [None]:
# Multinomial Naive Bayes classifier with smoothing alpha=0.139.
# NOTE(review): how 0.139 was chosen is not shown in this notebook -- TODO document the tuning.
nb_tfidf_model = MultinomialNB(alpha=0.139)

In [None]:
# Learn the TF-IDF vocabulary from the training text and encode it as the feature matrix.
# NOTE(review): df_train is never passed through dropNA here; presumably 'processed' has no
# missing values -- verify.
X = tfidfVectorizer.fit_transform(df_train['processed'])
# Binary target taken from the 'isOffensive' column.
Y = df_train['isOffensive']

In [None]:
# Train the Naive Bayes classifier on the TF-IDF features and the binary target.
nb_tfidf_model.fit(X, Y)

In [None]:
# Encode both sides of each validation pair with the vocabulary fitted on the training text
# (transform only, so nothing is re-fitted on validation data).
X_validate_less_toxic = tfidfVectorizer.transform(df_validate['less_toxic_processed'])
X_validate_more_toxic = tfidfVectorizer.transform(df_validate['more_toxic_processed'])

In [None]:
# Class-probability arrays for both sides of each pair; column 1 is compared in the next cell.
Y_validate_less_toxic = nb_tfidf_model.predict_proba(X_validate_less_toxic)
Y_validate_more_toxic = nb_tfidf_model.predict_proba(X_validate_more_toxic)

In [None]:
# Fraction of validation pairs where the 'more_toxic' comment gets the strictly higher
# column-1 probability (presumably the isOffensive == 1 class -- confirm via nb_tfidf_model.classes_).
# Ties count as misses.
(Y_validate_less_toxic[:, 1] < Y_validate_more_toxic[:, 1]).mean()

In [None]:
# Encode the cleaned test comments with the vocabulary fitted on the training text.
X_test = tfidfVectorizer.transform(df_test['processed'])

In [None]:
# Class-probability array for every test comment.
Y_test = nb_tfidf_model.predict_proba(X_test)

In [None]:
# Use the column-1 probability as the score; values are assigned by position.
df_test['score'] = Y_test[:, 1]

In [None]:
# Test frame with the new 'score' column added.
df_test.head(10)

In [None]:
# Write only 'comment_id' and 'score' to submission.csv, without the DataFrame index.
df_test[['comment_id', 'score']].to_csv("submission.csv", index=False)