In [36]:
import pathlib

BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"
EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
SPAM_DATASET_PATH = EXPORT_DIR / "spam-dataset.csv"

ZIPS_DIR = DATASET_DIR / 'zips'
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

SPAM_SMS_ZIP_PATH = ZIPS_DIR / "sms-spam-dataset.zip"
SPAM_YOUTUBE_ZIP_PATH = ZIPS_DIR / "youtube-spam-dataset.zip"
METADATA_EXPORT_PATH = EXPORT_DIR/"spam-metadata.pkl"
TOEKNIZER_EXPORT_PATH = EXPORT_DIR/"spam-tokenizer.json"

In [2]:
SMS_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
YOUTUBE_SPAM_ZIP = "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip"

In [3]:
!curl $SMS_SPAM_ZIP -o $SPAM_SMS_ZIP_PATH

!curl $YOUTUBE_SPAM_ZIP -o $SPAM_YOUTUBE_ZIP_PATH   

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
 46  198k   46 95232    0     0  95232      0  0:00:02  0:00:01  0:00:01 49548
100  198k  100  198k    0     0    99k      0  0:00:02  0:00:02 --:--:-- 97141
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 67  159k   67  108k    0     0   108k      0  0:00:01  0:00:01 --:--:-- 63488
100  159k  100  159k    0     0   159k      0  0:00:01  0:00:01 --:--:-- 93466


In [4]:
SPAM_CLASSIFIER_DIR = DATASET_DIR / "spam-classifier"
SMS_SPAM_DIR = SPAM_CLASSIFIER_DIR / "spam-sms"
YOUTUBE_SPAM_DIR = SPAM_CLASSIFIER_DIR / "youtube-spam"


SMS_SPAM_DIR.mkdir(exist_ok=True, parents=True)
YOUTUBE_SPAM_DIR.mkdir(exist_ok=True, parents=True)

In [5]:
!unzip -o $SPAM_SMS_ZIP_PATH -d $SMS_SPAM_DIR
!unzip -o $SPAM_YOUTUBE_ZIP_PATH -d $YOUTUBE_SPAM_DIR

Archive:  D:\data\Documents\ai_api\datasets\zips\sms-spam-dataset.zip
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\spam-sms/SMSSpamCollection  
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\spam-sms/readme  
Archive:  D:\data\Documents\ai_api\datasets\zips\youtube-spam-dataset.zip
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\youtube-spam/Youtube01-Psy.csv  
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\youtube-spam/__MACOSX/._Youtube01-Psy.csv  
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\youtube-spam/Youtube02-KatyPerry.csv  
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\youtube-spam/__MACOSX/._Youtube02-KatyPerry.csv  
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\youtube-spam/Youtube03-LMFAO.csv  
  inflating: D:\data\Documents\ai_api\datasets\spam-classifier\youtube-spam/__MACOSX/._Youtube03-LMFAO.csv  
  inflating: D:\data\Documents\ai_api\datasets\spam-classi

In [6]:
sms_spam_input_path = SMS_SPAM_DIR / "SMSSpamCollection" # tsv


In [7]:
import pandas as pd
sms_spam_input_path = SMS_SPAM_DIR / "SMSSpamCollection"
sms_df = pd.read_csv(sms_spam_input_path, sep='\t', header=None)
sms_df.columns = ['label', 'text']
sms_df['source'] = 'sms-spam'
sms_df.tail()

Unnamed: 0,label,text,source
5567,spam,This is the 2nd time we have tried 2 contact u...,sms-spam
5568,ham,Will ü b going to esplanade fr home?,sms-spam
5569,ham,"Pity, * was in mood for that. So...any other s...",sms-spam
5570,ham,The guy did some bitching but I acted like i'd...,sms-spam
5571,ham,Rofl. Its true to its name,sms-spam


In [8]:
my_dfs = []
for path in YOUTUBE_SPAM_DIR.glob("*.csv"):
    raw_df = pd.read_csv(path)
    raw_df.rename(columns={"CLASS": 'raw_label', "CONTENT": "text"}, inplace=True)
    raw_df['label'] = raw_df['raw_label'].apply(lambda x: "spam" if str(x) == "1" else "ham")
    raw_df['raw_source'] = str(path.name)
    raw_df['source'] = 'youtube-spam'
    df = raw_df.copy()[['label', 'text', 'source']]
    my_dfs.append(df)
    # print(df.head())

yt_df = pd.concat(my_dfs)

In [9]:
yt_df.head()


Unnamed: 0,label,text,source
0,spam,"Huh, anyway check out this you[tube] channel: ...",youtube-spam
1,spam,Hey guys check out my new channel and our firs...,youtube-spam
2,spam,just for test I have to say murdev.com,youtube-spam
3,spam,me shaking my sexy ass on my channel enjoy ^_^ ﻿,youtube-spam
4,spam,watch?v=vtaRGgvGtWQ Check this out .﻿,youtube-spam


In [10]:
df = pd.concat([sms_df, yt_df])
df.head()


Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [11]:
df.to_csv(SPAM_DATASET_PATH, index=False)


In [12]:
labels = df['label'].tolist()
texts = df['text'].tolist()

In [13]:
labels[120], texts[120]


('spam',
 'PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires')

In [14]:
label_legend = {"ham": 0, "spam": 1}
label_legend_inverted = {f"{v}": k for k,v in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [15]:
labels_as_int = [label_legend[x] for x in labels]


In [16]:
# label_legend_inverted[str(labels_as_int[120])]


In [17]:
import random 
random_idx = random.randint(0, len(labels))

assert texts[random_idx] == df.iloc[random_idx].text

assert labels[random_idx] == df.iloc[random_idx].label

assert label_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [19]:
MAX_NUM_WORDS = 280


In [20]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [21]:
word_index = tokenizer.word_index

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [23]:
MAX_SEQ_LENGTH = 300


In [24]:
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)


In [25]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [26]:
labels_as_int_array = np.asarray(labels_as_int)
labels_as_int_array

array([0, 0, 1, ..., 0, 0, 0])

In [27]:
y = to_categorical(labels_as_int_array)


In [29]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.1.1-cp38-cp38-win_amd64.whl (7.3 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.1.1 threadpoolctl-3.1.0


In [31]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 12)

In [32]:
import pickle

In [37]:
trainning_data = {
    "X_train":X_train, 
    "X_test":X_test, 
    "y_train":y_train, 
    "y_test":y_test,
    "max_words":MAX_NUM_WORDS,
    "max_seq_length":MAX_SEQ_LENGTH,
    "legend":label_legend,
    "legend_inverted":label_legend_inverted
}

tokenizer_json = tokenizer.to_json()
TOEKNIZER_EXPORT_PATH.write_text(tokenizer_json)

1090335

In [38]:
with open(METADATA_EXPORT_PATH, "wb") as f:
    pickle.dump(trainning_data, f)