In [32]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from collections import Counter

# Basa

The first dataframe is raw data with annotations and messages mixed together, separated by the tab character '\t'.

In [2]:
# Load the file with the default separator so each line ends up as one string
# in column 0; label and message are still joined by '\t' inside that string.
df_data_raw = pd.read_csv("data/dataset.tsv", header=None)

In [3]:
df_data_raw.head()

Unnamed: 0,0
0,ham\tHave your lunch and come quickly and open...
1,spam\tJoin the UK's horniest Dogging service a...
2,spam\tYOU ARE CHOSEN TO RECEIVE A £350 AWARD! ...
3,ham\talright tyler's got a minor crisis and ha...
4,ham\tSad story of a Man - Last week was my b'd...


In [4]:
# Number of rows in the raw, still unsplit frame.
print("Dataset size:", len(df_data_raw))

Dataset size: 3715


Let's check whether all the lines of the dataset have this formatting:

In [5]:
# True only if every raw line begins with one of the two labels followed by a tab.
df_data_raw[0].map(lambda line: line.startswith(('ham\t', 'spam\t'))).all()

np.True_

Let us now parse the dataset automatically in order to separate labels from the messages:

In [6]:
# Re-read the same file, this time splitting each line on '\t' into the two
# named columns. on_bad_lines='skip' silently drops any line that yields more
# fields than expected, so the row count must be compared with the raw frame.
df_data = pd.read_csv("data/dataset.tsv", header=None, sep='\t', names=['label', 'message_raw'], on_bad_lines='skip')

In [7]:
df_data.head()

Unnamed: 0,label,message_raw
0,ham,Have your lunch and come quickly and open the ...
1,spam,Join the UK's horniest Dogging service and u c...
2,spam,YOU ARE CHOSEN TO RECEIVE A £350 AWARD! Pls ca...
3,ham,alright tyler's got a minor crisis and has to ...
4,ham,Sad story of a Man - Last week was my b'day. M...


As one can notice, we dropped the 'bad lines' while parsing the data automatically, and it turns out that there's only one bad line:

In [8]:
# Row counts before and after the tab-separated parse; a difference means lines were skipped.
print(len(df_data_raw), len(df_data))

3715 3714


The reason it was dropped is that it contains a second tab character, which makes it impossible for pandas' read_csv to split the line into exactly two columns. However, it's just one line, which is negligible compared to our dataset size, so we'll proceed without it:

In [9]:
# Show the raw line at position 801; its value contains two '\t' characters.
# NOTE(review): 801 is hard-coded; the lookup that located this row is not shown in the notebook.
df_data_raw.iloc[801, 0]

'ham\tHI BABE UAWAKE?FEELLIKW SHIT.JUSTFOUND OUT VIA ALETTER THATMUM GOTMARRIED 4thNOV.BEHIND OURBACKS \x96 FUCKINNICE!SELFISH\tDEVIOUSBITCH.ANYWAYI\x92L CALL U"""'

In [10]:
df_data

Unnamed: 0,label,message_raw
0,ham,Have your lunch and come quickly and open the ...
1,spam,Join the UK's horniest Dogging service and u c...
2,spam,YOU ARE CHOSEN TO RECEIVE A £350 AWARD! Pls ca...
3,ham,alright tyler's got a minor crisis and has to ...
4,ham,Sad story of a Man - Last week was my b'day. M...
...,...,...
3709,ham,Hello which the site to download songs its urg...
3710,ham,"My planning usually stops at ""find hella weed ..."
3711,ham,Are u awake? Is there snow there?
3712,ham,Sorry I'll call later


Let us first clean the messages and convert them to lists of words.

In [11]:
def basic_clean(text):
    """Lowercase ``text`` and keep only the letters a-z and the space character.

    Every other character (digits, punctuation, tabs, newlines, non-ASCII
    letters, ...) is deleted, not replaced, so words separated only by such
    a character end up joined together.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z ]', '', lowered)

In [12]:
# Clean every raw message, then split the cleaned string on whitespace into a list of words.
lists_of_words = df_data['message_raw'].map(basic_clean).str.split()

In [13]:
lists_of_words

0       [have, your, lunch, and, come, quickly, and, o...
1       [join, the, uks, horniest, dogging, service, a...
2       [you, are, chosen, to, receive, a, award, pls,...
3       [alright, tylers, got, a, minor, crisis, and, ...
4       [sad, story, of, a, man, last, week, was, my, ...
                              ...                        
3709    [hello, which, the, site, to, download, songs,...
3710    [my, planning, usually, stops, at, find, hella...
3711              [are, u, awake, is, there, snow, there]
3712                            [sorry, ill, call, later]
3713                       [you, can, never, do, nothing]
Name: message_raw, Length: 3714, dtype: object

Next, let's lemmatize those words and remove stopwords.

In [14]:
# Read the stop-word file (no header row) and turn it into a plain Python list.
# presumably the file has a single column, so squeeze() yields a Series — verify
stop_words = pd.read_csv("data/stop_words.csv", header=None).squeeze().to_list()

In [15]:
lemmatizer = WordNetLemmatizer()
# Build the lookup once: set membership is O(1), whereas testing against the
# stop-word list would scan it linearly for every single token.
stop_word_set = set(stop_words)
# Stop words are filtered on the cleaned token first; only the remaining
# tokens are lemmatized.
df_data['words'] = lists_of_words.map(
    lambda words: [lemmatizer.lemmatize(word) for word in words if word not in stop_word_set]
)

In [16]:
df_data['words']

0                      [lunch, come, quickly, open, door]
1       [join, uk, horniest, dogging, service, u, sex,...
2       [chosen, receive, award, pls, call, claim, num...
3       [alright, tyler, got, minor, crisis, home, soo...
4       [sad, story, man, last, week, bday, wife, didn...
                              ...                        
3709           [hello, site, download, song, urgent, pls]
3710    [planning, usually, stop, find, hella, weed, s...
3711                                     [u, awake, snow]
3712                            [sorry, ill, call, later]
3713                                     [never, nothing]
Name: words, Length: 3714, dtype: object

Let's output the top 10 most frequent words for ham and spam messages, considering only messages that still contain at least three words after preprocessing.

In [17]:
# Keep only messages whose preprocessed word list has at least 3 entries.
df_data_long = df_data[df_data['words'].map(len) >= 3]

In [18]:
# Per-label lists of word lists, taken from the length-filtered messages.
ham_words = df_data_long.loc[df_data_long['label'].eq('ham'), 'words'].tolist()
spam_words = df_data_long.loc[df_data_long['label'].eq('spam'), 'words'].tolist()

# Flatten each list of word lists into one long list of words per label.
ham_words_flattened = [token for word_list in ham_words for token in word_list]
spam_words_flattened = [token for word_list in spam_words for token in word_list]

In [19]:
# Word frequency distributions, one per label, over the flattened word lists.
fdist_ham = FreqDist(ham_words_flattened)
fdist_spam = FreqDist(spam_words_flattened)

In [27]:
fdist_ham.most_common(10)

[('u', 681),
 ('im', 303),
 ('get', 195),
 ('dont', 190),
 ('go', 170),
 ('come', 165),
 ('like', 165),
 ('know', 163),
 ('ill', 159),
 ('ltgt', 157)]

In [21]:
fdist_spam.most_common(10)

[('call', 243),
 ('free', 147),
 ('u', 106),
 ('text', 94),
 ('mobile', 92),
 ('ur', 92),
 ('txt', 90),
 ('claim', 75),
 ('stop', 73),
 ('reply', 69)]

# xHardcorex

In [38]:
# Vocabulary thresholds: a word must occur at least MIN_WORD_COUNT times across
# all messages and be at least MIN_WORD_LENGTH characters long.
MIN_WORD_COUNT = 10
MIN_WORD_LENGTH = 3

all_words = df_data['words'].to_list()
all_words_flattened = [word for message in all_words for word in message]
fdist_all = FreqDist(all_words_flattened)
# Iterate over items() directly: most_common() would first sort by count, and
# that order is discarded anyway by the alphabetical sort applied here.
frequent_long_words = sorted(
    word
    for word, count in fdist_all.items()
    if count >= MIN_WORD_COUNT and len(word) >= MIN_WORD_LENGTH
)

In [51]:
def frequent_words_counter(words, vocabulary):
    """Count, for each word of a message, how often it occurs in that message.

    Only words contained in ``vocabulary`` are counted; any other word gets 0.

    Parameters
    ----------
    words : list of str
        Tokens of a single message.
    vocabulary : collection of str
        Words that are allowed to be counted.

    Returns
    -------
    list of int
        One entry per element of ``words``, in the same order. The result is
        aligned with ``words``, not with ``vocabulary``; a word that occurs
        several times contributes its full count at each of its positions.
    """
    lookup = vocabulary
    if isinstance(vocabulary, (list, tuple)):
        # Membership in a list or tuple is a linear scan per token; a frozenset
        # makes it O(1). Fall back to the original container if it holds
        # unhashable items.
        try:
            lookup = frozenset(vocabulary)
        except TypeError:
            lookup = vocabulary
    counter = Counter(w for w in words if w in lookup)
    return [counter[word] for word in words]

In [52]:
# For every message, store one count per word of that message (0 for words
# outside the vocabulary); each list is aligned with 'words', not with the vocabulary.
df_data['popular_word_frequency'] = df_data['words'].map(lambda words: frequent_words_counter(words, vocabulary=frequent_long_words))

In [53]:
df_data[['words', 'popular_word_frequency']]

Unnamed: 0,words,popular_word_frequency
0,"[lunch, come, quickly, open, door]","[1, 1, 0, 1, 1]"
1,"[join, uk, horniest, dogging, service, u, sex,...","[1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0]"
2,"[chosen, receive, award, pls, call, claim, num...","[0, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 0, 1, 1]"
3,"[alright, tyler, got, minor, crisis, home, soo...","[1, 0, 1, 0, 0, 1, 0, 1, 0]"
4,"[sad, story, man, last, week, bday, wife, didn...","[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 1, 1, ..."
...,...,...
3709,"[hello, site, download, song, urgent, pls]","[1, 0, 1, 1, 1, 1]"
3710,"[planning, usually, stop, find, hella, weed, s...","[0, 0, 1, 1, 0, 0, 1, 0, 0]"
3711,"[u, awake, snow]","[0, 0, 1]"
3712,"[sorry, ill, call, later]","[1, 1, 1, 1]"


In [54]:
def word_frequency_vector(tokens, vocab):
    """Return the relative frequency of every vocabulary word within ``tokens``.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single message.
    vocab : iterable of str
        Vocabulary words; the output follows this order.

    Returns
    -------
    list of float
        One entry per word of ``vocab``: occurrences of that word in
        ``tokens`` divided by ``len(tokens)``. All entries are 0.0 when
        ``tokens`` is empty.
    """
    total = len(tokens)
    if total == 0:
        # Decide once, not per vocabulary word: with no tokens every
        # relative frequency is 0.0 and no division takes place.
        return [0.0 for _ in vocab]
    # The standard-library Counter returns 0 for unseen words without inserting them.
    counts = Counter(tokens)
    return [counts[word] / total for word in vocab]

# For every message, build a vector with one relative frequency per vocabulary
# word, in the order of frequent_long_words.
df_data['frequency_vector'] = df_data['words'].map(lambda words: word_frequency_vector(words, vocab=frequent_long_words))