In [110]:
from typing import Dict
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [111]:
d = {'labels': [], 'text': [], 'line': []}
with open('spamAssassin.data') as file:
    for line in file:
        tokens = line.split(" ")
        d['labels'].append(int(tokens[0]))
        d['line'].append(" ".join(tokens[1:]))
        d['text'].append(np.array(tokens[1:]))

In [112]:
d['labels'][:5]

[1, 1, 0, 0, 0]

In [113]:
d['text'][0][:5]


array(['public', 'announc', 'the', 'new', 'domain'], dtype='<U24')

In [114]:
df = pd.DataFrame(d)

In [115]:
df


Unnamed: 0,labels,text,line
0,1,"[public, announc, the, new, domain, name, ar, ...",public announc the new domain name ar final av...
1,1,"[have, tax, problem, do, you, ow, the, ir, mon...",have tax problem do you ow the ir monei if you...
2,0,"[r, robert, harlei, write, r, scuse, me, for, ...",r robert harlei write r scuse me for post in g...
3,0,"[on, wed, number, oct, number, brian, fahrland...",on wed number oct number brian fahrland wrote ...
4,0,"[quot, ronan, waid, sure, but, soft, link, wou...",quot ronan waid sure but soft link would do th...
...,...,...,...
5995,0,"[on, fri, aug, number, number, at, number, num...",on fri aug number number at number number numb...
5996,0,"[i, am, try, to, secur, three, of, four, virtu...",i am try to secur three of four virtual hostna...
5997,0,"[on, number, septemb, number, tim, peter, said...",on number septemb number tim peter said would ...
5998,1,"[govern, grant, e, book, number, edit, katfish...",govern grant e book number edit katfishnumb yo...


In [116]:
def build_vocab_map(dataframe: pd.DataFrame) -> Dict:
    return Counter([word for sentence in dataframe['text'] for word in set(sentence)])

In [117]:
c = build_vocab_map(df)
c['public']

341

In [124]:
len(c)

80865

In [118]:
c2 = {k:v for k,v in c.items() if v >= 30}
c2['sincer']

141

In [119]:
len(c2)

3131

In [120]:
df['text']

0       [public, announc, the, new, domain, name, ar, ...
1       [have, tax, problem, do, you, ow, the, ir, mon...
2       [r, robert, harlei, write, r, scuse, me, for, ...
3       [on, wed, number, oct, number, brian, fahrland...
4       [quot, ronan, waid, sure, but, soft, link, wou...
                              ...                        
5995    [on, fri, aug, number, number, at, number, num...
5996    [i, am, try, to, secur, three, of, four, virtu...
5997    [on, number, septemb, number, tim, peter, said...
5998    [govern, grant, e, book, number, edit, katfish...
5999    [from, emailaddr, emailaddr, on, behalf, of, j...
Name: text, Length: 6000, dtype: object

In [121]:
vectorizer = CountVectorizer(vocabulary=c2.keys(), binary=True)
x = vectorizer.fit_transform(df['line'])

In [122]:
df2 = pd.DataFrame(x.toarray())

In [123]:
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3121,3122,3123,3124,3125,3126,3127,3128,3129,3130
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [125]:
df3 = pd.read_csv("X_train_binary.csv")
df3

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,2441,2442,2443,2444,2445,2446,2447,2448,2449,2450
0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4195,4195,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4196,4196,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4197,4197,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4198,4198,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
