In [167]:
import numpy as np
import pandas as pd

In [168]:
data = pd.read_csv('toxic_comment.csv')

In [169]:
data.head()

Unnamed: 0,CommentId,VideoId,Text,IsToxic
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,True
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,True
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",False


In [170]:
data.duplicated(subset=['CommentId', 'VideoId'], keep=False).sum()

np.int64(0)

In [171]:
def load_data(filename):
    data = pd.read_csv(filename)
    print('original data shape:', data.shape)

    dropped_duplicated_data = data.drop_duplicates(subset=['CommentId', 'VideoId'], keep=False)
    print('data shape after dropping duplicates:', dropped_duplicated_data.shape)

    dropped_columns_data = dropped_duplicated_data.drop(columns=['CommentId', 'VideoId'], axis=1)
    print('data shape after dropping columns:', dropped_columns_data.shape)

    return dropped_columns_data

In [172]:
data = load_data('toxic_comment.csv')

original data shape: (1000, 4)
data shape after dropping duplicates: (1000, 4)
data shape after dropping columns: (1000, 2)


In [173]:
# Data Preparation

In [174]:
from sklearn.model_selection import train_test_split

x = data.drop(columns=['IsToxic'], axis=1)
y = data['IsToxic']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [175]:
print(f'Training shape : X {x_train.shape}, y {y_train.shape}')
print(f'Test shape : X {x_test.shape}, y {y_test.shape}')

Training shape : X (750, 1), y (750,)
Test shape : X (250, 1), y (250,)


In [176]:
# Removing Digits
import re

def remove_digits(text):
    return re.sub(r'\d+', '', text)

x_train['Text'] = x_train['Text'].apply(remove_digits)

In [177]:
# Removing punctuations
import re
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [178]:
# Tokenize text
def preprocess_text(data, column):
    data[column] = data[column].str.lower()
    data[column] = data[column].apply(remove_digits)
    data[column] = data[column].apply(remove_punctuation)

    return data

In [179]:
x_train_preprocessed = preprocess_text(x_train, 'Text')

In [180]:
# Vectorizing

In [181]:
# One Hot Encoding
def create_vocabs(sentences):
    vocabs = []
    for text in sentences:
        for word in text.split():
            vocabs.append(word)

    return set(vocabs)

In [182]:
corpus = [
    'Hello World',
    'Welcomee to Disney World',
    'The flight is on delay'
]

In [183]:
# corpus = list(x_train_preprocessed['Text'])

In [184]:
vocabs = create_vocabs(corpus)

In [185]:
word_to_idx = {word: idx for idx, word in enumerate(vocabs)}
idx_to_word = {idx: word for idx, word in enumerate(vocabs)}

In [186]:
word_to_idx

{'flight': 0,
 'World': 1,
 'delay': 2,
 'The': 3,
 'to': 4,
 'Disney': 5,
 'on': 6,
 'Welcomee': 7,
 'Hello': 8,
 'is': 9}

In [187]:
idx_to_word

{0: 'flight',
 1: 'World',
 2: 'delay',
 3: 'The',
 4: 'to',
 5: 'Disney',
 6: 'on',
 7: 'Welcomee',
 8: 'Hello',
 9: 'is'}

In [188]:
number_of_vocabs = len(vocabs)
number_of_sentences = len(corpus)

word_presence_matrix = np.zeros(shape=(number_of_sentences, number_of_vocabs))
word_presence_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [189]:
for idx, sentence in enumerate(corpus):
    for word in sentence.split():
        word_index = word_to_idx[word]
        word_presence_matrix[idx, word_index] = 1

word_presence_matrix

array([[0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 1., 0., 1., 0., 0.],
       [1., 0., 1., 1., 0., 0., 1., 0., 0., 1.]])

In [190]:
onehot_vector = pd.DataFrame(word_presence_matrix)
onehot_vector.columns = list(word_to_idx.keys())
onehot_vector.index = corpus
onehot_vector

Unnamed: 0,flight,World,delay,The,to,Disney,on,Welcomee,Hello,is
Hello World,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Welcomee to Disney World,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
The flight is on delay,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [191]:
class OneHotVectorizer:

    corpus = []
    vocabularies = set()
    word_to_idx = {}
    idx_to_word = {}

    def fit(self, corpus):
        self.corpus = corpus
        self.generate_vocabulary()
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocabularies)}
        self.idx_to_word = {idx: word for idx, word in enumerate(self.vocabularies)}

    def get_vocab(self) :
        return self.vocabularies

    def get_mapping(self) :
        return self.word_to_idx, self.idx_to_word

    def generate_vocabulary(self):
        self.vocabularies = set()
        for sentence in corpus:
            word_split = sentence.split()
            for word in word_split:
                self.vocabularies.add(word)

    def transform(self, text):
        text_features = np.zeros(shape=(number_of_sentences, number_of_vocabs))
        for idx, sentence in enumerate(text):
            for word in sentence.split():
                if word not in self.word_to_idx.keys():
                    continue

                word_index = self.word_to_idx[word]
                text_features[idx, word_index] = 1

        return text_features

In [192]:
onehot_vectorizer = OneHotVectorizer()
onehot_vectorizer.fit(corpus)
onehot_text_features = onehot_vectorizer.transform(corpus)

In [193]:
onehot_df = pd.DataFrame(onehot_text_features)
onehot_df.columns = list(onehot_vectorizer.get_mapping()[0].keys())
onehot_df.index = corpus
onehot_df

Unnamed: 0,flight,World,delay,The,to,Disney,on,Welcomee,Hello,is
Hello World,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Welcomee to Disney World,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
The flight is on delay,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
