In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('toxic_comment.csv')

In [3]:
data.head()

Unnamed: 0,CommentId,VideoId,Text,IsToxic
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,True
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,True
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",False


In [4]:
data.duplicated(subset=['CommentId', 'VideoId'], keep=False).sum()

np.int64(0)

In [5]:
def load_data(filename):
    data = pd.read_csv(filename)
    print('original data shape:', data.shape)

    dropped_duplicated_data = data.drop_duplicates(subset=['CommentId', 'VideoId'], keep=False)
    print('data shape after dropping duplicates:', dropped_duplicated_data.shape)

    dropped_columns_data = dropped_duplicated_data.drop(columns=['CommentId', 'VideoId'], axis=1)
    print('data shape after dropping columns:', dropped_columns_data.shape)

    return dropped_columns_data

In [6]:
data = load_data('toxic_comment.csv')

original data shape: (1000, 4)
data shape after dropping duplicates: (1000, 4)
data shape after dropping columns: (1000, 2)


In [7]:
# Data Preparation

In [8]:
from sklearn.model_selection import train_test_split

x = data.drop(columns=['IsToxic'], axis=1)
y = data['IsToxic']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [9]:
print(f'Training shape : X {x_train.shape}, y {y_train.shape}')
print(f'Test shape : X {x_test.shape}, y {y_test.shape}')

Training shape : X (750, 1), y (750,)
Test shape : X (250, 1), y (250,)


In [10]:
# Removing Digits
import re

def remove_digits(text):
    return re.sub(r'\d+', '', text)

x_train['Text'] = x_train['Text'].apply(remove_digits)

In [11]:
# Removing punctuations
import re
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [12]:
# Tokenize text
def preprocess_text(data, column):
    data[column] = data[column].str.lower()
    data[column] = data[column].apply(remove_digits)
    data[column] = data[column].apply(remove_punctuation)

    return data

In [13]:
x_train_preprocessed = preprocess_text(x_train, 'Text')

In [14]:
# Vectorizing

In [15]:
# One Hot Encoding
def create_vocabs(sentences):
    vocabs = []
    for text in sentences:
        for word in text.split():
            vocabs.append(word)

    return set(vocabs)

In [16]:
corpus = [
    'Hello World',
    'Welcomee to Disney World',
    'The flight is on delay'
]

In [17]:
# corpus = list(x_train_preprocessed['Text'])

In [18]:
vocabs = create_vocabs(corpus)

In [19]:
word_to_idx = {word: idx for idx, word in enumerate(vocabs)}
idx_to_word = {idx: word for idx, word in enumerate(vocabs)}

In [20]:
word_to_idx

{'is': 0,
 'The': 1,
 'Hello': 2,
 'Welcomee': 3,
 'on': 4,
 'to': 5,
 'World': 6,
 'Disney': 7,
 'flight': 8,
 'delay': 9}

In [21]:
idx_to_word

{0: 'is',
 1: 'The',
 2: 'Hello',
 3: 'Welcomee',
 4: 'on',
 5: 'to',
 6: 'World',
 7: 'Disney',
 8: 'flight',
 9: 'delay'}

In [22]:
number_of_vocabs = len(vocabs)
number_of_sentences = len(corpus)

word_presence_matrix = np.zeros(shape=(number_of_sentences, number_of_vocabs))
word_presence_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [23]:
for idx, sentence in enumerate(corpus):
    for word in sentence.split():
        word_index = word_to_idx[word]
        word_presence_matrix[idx, word_index] = 1

word_presence_matrix

array([[0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 1., 1., 0., 0.],
       [1., 1., 0., 0., 1., 0., 0., 0., 1., 1.]])

In [24]:
onehot_vector = pd.DataFrame(word_presence_matrix)
onehot_vector.columns = list(word_to_idx.keys())
onehot_vector.index = corpus
onehot_vector

Unnamed: 0,is,The,Hello,Welcomee,on,to,World,Disney,flight,delay
Hello World,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Welcomee to Disney World,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
The flight is on delay,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [30]:
class OneHotVectorizer:

    corpus = []
    vocabularies = set()
    word_to_idx = {}
    idx_to_word = {}

    def fit(self, corpus):
        self.corpus = corpus
        self.generate_vocabulary()
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocabularies)}
        self.idx_to_word = {idx: word for idx, word in enumerate(self.vocabularies)}

    def get_vocab(self) :
        return self.vocabularies

    def get_mapping(self) :
        return self.word_to_idx, self.idx_to_word

    def generate_vocabulary(self):
        self.vocabularies = set()
        for sentence in self.corpus:
            word_split = sentence.split()
            for word in word_split:
                self.vocabularies.add(word)

    def transform(self, text):
        number_of_sentences = number_of_sentences
        number_of_vocabs = len(self.vocabularies)
        text_features = np.zeros(shape=(number_of_sentences, number_of_vocabs))
        for idx, sentence in enumerate(text):
            for word in sentence.split():
                if word not in self.word_to_idx.keys():
                    continue

                word_index = self.word_to_idx[word]
                text_features[idx, word_index] = 1

        return text_features

In [31]:
onehot_vectorizer = OneHotVectorizer()
onehot_vectorizer.fit(corpus)
onehot_text_features = onehot_vectorizer.transform(corpus)

In [32]:
onehot_df = pd.DataFrame(onehot_text_features)
onehot_df.columns = list(onehot_vectorizer.get_mapping()[0].keys())
onehot_df.index = corpus
onehot_df

Unnamed: 0,is,The,Hello,Welcomee,on,to,World,Disney,flight,delay
Hello World,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Welcomee to Disney World,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
The flight is on delay,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [38]:
class CountVectorizer:

    corpus = []
    vocabularies = set()

    def fit(self, corpus):
        self.corpus = corpus
        self.generate_vocabularies()
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocabularies)}
        self.idx_to_word = {idx: word for idx, word in enumerate(self.vocabularies)}

    def get_vocab(self) :
        return self.vocabularies

    def get_mapping(self) :
        return self.word_to_idx, self.idx_to_word

    def generate_vocabularies(self):
        self.vocabularies = set()
        for sentence in self.corpus:
            word_split = sentence.split()
            for word in word_split:
                self.vocabularies.add(word)

    def transform(self, text):
        number_of_sentences = len(text)
        number_of_vocabs = len(self.vocabularies)
        text_features = np.zeros(shape=(number_of_sentences, number_of_vocabs))

        for idx, sentence in enumerate(text):
            for word in sentence.split():
                if word not in self.word_to_idx.keys():
                    continue

                word_index = self.word_to_idx[word]
                text_features[idx, word_index] += 1

        return text_features

In [39]:
corpus_bow = [
    'Hello World',
    'Welcomee to Disney World',
    'The flight is on delay',
    'The flight because the runaway is being used for another flight departure'
]

In [40]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit(corpus_bow)
count_vectorizer_features = count_vectorizer.transform(corpus_bow)

count_vectorizer_features

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0.,
        0., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 1.],
       [1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 2., 1.,
        1., 0.]])

In [42]:
onehot_df = pd.DataFrame(count_vectorizer_features)
onehot_df.columns = list(count_vectorizer.get_mapping()[0].keys())
onehot_df.index = corpus_bow
onehot_df

Unnamed: 0,is,because,The,departure,Hello,Welcomee,on,to,runaway,being,World,another,Disney,the,flight,for,used,delay
Hello World,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Welcomee to Disney World,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
The flight is on delay,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
The flight because the runaway is being used for another flight departure,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,1.0,1.0,0.0
