In [None]:
import os
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import load_model

In [None]:
# score comments on test dataset
def removeEmojis(text):
    """Return ``text`` with emoji characters stripped out.

    The input is passed through ``str()`` first, so non-string values are
    stringified rather than rejected. Only characters inside the four
    code-point ranges listed below are removed; everything else is kept.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # one character class matching any run of the characters above
    emoji_run = "[" + emoji_ranges + "]+"
    return re.sub(emoji_run, '', str(text), flags=re.UNICODE)

def hasOnlyLatinCharsOrArabicNumerals(text):
    """Return ``text`` unchanged if it is pure ASCII, otherwise ``''``.

    The check round-trips the string through UTF-8 bytes and an ASCII decode,
    so every ASCII character (letters, digits, punctuation, whitespace) is
    accepted, and a single non-ASCII character blanks the whole string.
    """
    as_utf8 = text.encode(encoding='utf-8')
    try:
        as_utf8.decode('ascii')
    except UnicodeDecodeError:
        # at least one byte lies outside the ASCII range
        return ''
    return text

filepath = 'datasets/YT API/mrkAmmMakMg_15323.csv'
# base name of the CSV without its extension; the dot is escaped and the match
# is anchored to the end of the path so only a literal trailing ".csv" counts
filename = re.findall(r'[^/]+(?=\.csv$)', filepath)[0]
testdf = pd.read_csv(filepath)
# remove emojis; na_action='ignore' leaves missing comments as NaN instead of
# letting removeEmojis() stringify them into the literal text 'nan'
testdf['comment'] = testdf['comment'].map(removeEmojis, na_action='ignore')
# blank out comments that contain any non-ASCII character
testdf['comment'] = testdf['comment'].map(hasOnlyLatinCharsOrArabicNumerals, na_action='ignore')
# drop rows with an empty or missing value in any column
# (duplicate comments are not removed here)
testdf = testdf.replace('', float('NaN')).dropna()
testdf

In [None]:
# NLTK lemmatizer instance (not referenced elsewhere in the visible cells)
wordnet_lemmatizer = WordNetLemmatizer()

# state for the progress line printed by pre_process:
# total number of rows to score, and how many have been processed so far
length = testdf.shape[0]
progress = 0

# Built once so the stopword corpus is not re-read and linearly searched for
# every single token.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def pre_process(text):
    '''
    Clean one comment for the spam model and print a progress line.

    Steps, in order:
    - replace @mentions with a space
    - replace links (http..., watch?v=..., www...) with a space
    - collapse every run of characters other than A-Z / a-z into one space
    - word tokenization
    - drop English stopwords
    - join the remaining tokens with spaces and lowercase the result

    No lemmatization is applied.

    Side effects: increments the module-level `progress` counter and prints
    the percentage done relative to the module-level `length`.

    Returns the cleaned, lowercased string.
    '''
    global progress
    progress += 1

    removed_mentions = re.sub(r'\s?@(\s)*(\S*)\s?', ' ', text)
    removed_links = re.sub(r'((http|watch\?v=|[wW]{3})\S+)', ' ', removed_mentions)
    normalized = re.sub(r'[^A-Za-z]+', ' ', removed_links)
    # NOTE(review): stopwords are filtered before lowercasing, so capitalised
    # forms (e.g. "The") are presumably kept if the NLTK list is all lowercase.
    # Left as-is to stay consistent with however the model was trained; confirm.
    tokens = [word for word in word_tokenize(normalized) if word not in _ENGLISH_STOPWORDS]

    # '\r' rewinds the cursor so each update overwrites the previous one
    print(f'{progress / length * 100:.2f}%\t{progress}/{length}', end='\r')
    return ' '.join(tokens).lower()

# load the trained classifier saved under 'spam_detector'
model = load_model('spam_detector')
# Clean every comment first, then score them all with one batched predict()
# call; calling predict() once per row repeats its per-call overhead for
# every comment.
cleaned_comments = testdf['comment'].apply(pre_process).tolist()
# predict() is skipped when there is nothing to score
predictions = model.predict(cleaned_comments, verbose=0) if cleaned_comments else []
# first output value of each prediction, as a percentage rounded to 2 decimals
testdf['score'] = pd.Series(
    [round(row[0] * 100, 2) for row in predictions],
    index=testdf.index,
    dtype='float64',
)
testdf.head()

In [None]:
# comments scoring at or above this threshold (in percent) are treated as spam
SPAM_THRESHOLD = 90
# filter and sort once, then reuse the result for both the export and the display
spam_comments = testdf[testdf['score'] >= SPAM_THRESHOLD].sort_values(by='score', ascending=False)
# to_csv does not create missing directories, so make sure the target exists
os.makedirs('spam', exist_ok=True)
spam_comments.to_csv(f'spam/{filename}.csv', index=False)
spam_comments