## Imports

In [None]:
from sklearn import preprocessing
from sklearn import svm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

## Data loading

In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can run arbitrary code -- only point these paths at files you trust.
training_data, training_labels = (
    np.load(path, allow_pickle=True)
    for path in ('./data/training_sentences.npy', './data/training_labels.npy')
)
testing_data, testing_labels = (
    np.load(path, allow_pickle=True)
    for path in ('./data/test_sentences.npy', './data/test_labels.npy')
)

# Peek at the first two training samples and their labels.
print(training_data[:2], training_labels[:2], sep='\n')

## Sklearn SVM

### Data preparation

In [102]:
# The vectorizer is fitted on the training messages only and then reused,
# unchanged, to transform the test messages.
count_vect = TfidfVectorizer()

# Every sample is joined with single spaces into one string per message.
messages = list(map(' '.join, training_data))
messages_testing = list(map(' '.join, testing_data))

X = count_vect.fit_transform(messages)
Y = count_vect.transform(messages_testing)
X_train, X_test = X, Y
y_train, y_test = training_labels, testing_labels

# Shapes of both feature matrices and both label arrays, one per line.
print(
    X_train.shape,
    X_test.shape,
    training_labels.shape,
    testing_labels.shape,
    sep='\n',
)

(3734, 7701)
(1840, 7701)
(3734,)
(1840,)


### Model

In [103]:
# Support-vector classifier with a linear kernel, trained on the TF-IDF
# training matrix and its labels.
model = svm.SVC(kernel='linear')

model.fit(X=X_train, y=y_train)

### Prediction

In [104]:
# Predict labels for the TF-IDF test matrix and report the F1 score
# against the true test labels.
prediction = model.predict(X_test)
print(f1_score(y_true=y_test, y_pred=prediction))

0.9452332657200812


### Most negative/positive words

In [107]:
feature_names = count_vect.get_feature_names_out()
# `.toarray()` requires coef_ to be a sparse matrix here; only the first
# row of weights is used below.
coefficients = model.coef_.toarray()

# argsort is ascending, so one ranking serves both ends: the last ten
# indices carry the largest weights and the first ten the smallest.
ranked = np.argsort(coefficients[0])
top10_index, bot10_index = ranked[-10:], ranked[:10]

top10_words = [feature_names[idx] for idx in top10_index]
bot10_words = [feature_names[idx] for idx in bot10_index]

print(f"Top 10 words that are spam: {top10_words}", end="\n\n")
print(f"Top 10 words that are not spam: {bot10_words}")

Top 10 words that are spam: ['18', 'reply', 'call', 'service', 'text', '500', 'stop', 'claim', 'mobile', 'txt']

Top 10 words that are not spam: ['me', 'im', 'ill', 'ok', 'gt', 'my', 'lt', 'him', 'sir', 'but']


## BagOfWords

In [None]:
class BagOfWords:
    """Bag-of-words vectorizer over pre-tokenized sentences.

    Every distinct word seen by ``build_vocabulary`` receives a positive
    integer id, assigned from 1 upwards in first-seen order.
    ``get_features`` turns each sentence into a vector of word counts
    indexed by that id, so column 0 is never written and stays zero.
    """

    def __init__(self):
        # word -> column id (ids start at 1)
        self.dict = {}
        # words in the order their ids were assigned
        self.words_in_order = []

    def build_vocabulary(self, data):
        """Add every not-yet-known word found in ``data`` to the vocabulary.

        Args:
            data: iterable of sentences, each an iterable of hashable words.

        Prints the vocabulary size once all sentences have been scanned.
        The method may be called more than once; ids continue after the
        ones already assigned instead of starting over.
        """
        for sentence in data:
            for word in sentence:
                if word not in self.dict:
                    # Derive the id from the current vocabulary size so a
                    # repeated call can never hand out an id already in use.
                    self.dict[word] = len(self.dict) + 1
                    self.words_in_order.append(word)

        print(len(self.dict))

    def get_features(self, data):
        """Return the word-count matrix for ``data``.

        Args:
            data: iterable of sentences, each an iterable of words.

        Returns:
            Integer array of shape ``(number of sentences, vocabulary size
            + 1)``. Entry ``[i, k]`` counts how often the word with id ``k``
            occurs in sentence ``i``; words missing from the vocabulary are
            ignored. Empty ``data`` yields an array with zero rows.
        """
        sentences = list(data)
        # Allocate the result once instead of building a list of lists.
        features = np.zeros((len(sentences), len(self.dict) + 1), dtype=int)

        for row, sentence in zip(features, sentences):
            for word in sentence:
                column = self.dict.get(word)
                if column is not None:
                    row[column] += 1

        return features
        

In [98]:
# The vocabulary comes from the training sentences only; the same
# vectorizer then encodes both splits.
vectorizer = BagOfWords()
vectorizer.build_vocabulary(training_data)
new_train, new_test = (
    vectorizer.get_features(split) for split in (training_data, testing_data)
)

9522


## Normalizing data

In [99]:
def normalize_data(train_data, test_data, type=None):
    """Scale train and test features with statistics taken from the train set.

    All statistics are computed per column (``axis=0``) of ``train_data``
    and then applied to both arrays.

    Args:
        train_data: array the statistics are computed from.
        test_data: array scaled with the same statistics.
        type: one of
            ``'standard'`` -- subtract the column mean, divide by the
            column standard deviation;
            ``'l1'`` -- divide by the column sum of absolute values;
            ``'l2'`` -- divide by the column Euclidean norm.
            Any other value (including the default ``None``) returns both
            inputs unchanged.

    Returns:
        Tuple ``(new_train_data, new_test_data)``.

    Columns whose divisor is zero (for example a feature that is zero in
    every training row) are divided by 1 instead, so they do not turn into
    NaN or inf.
    """
    if type == 'standard':
        offset = np.mean(train_data, axis=0)
        scale = np.std(train_data, axis=0)
    elif type == 'l1':
        offset = 0
        scale = np.sum(np.abs(train_data), axis=0)
    elif type == 'l2':
        offset = 0
        scale = np.sqrt(np.sum(np.power(train_data, 2), axis=0))
    else:
        return train_data, test_data

    # Guard against division by zero for constant / all-zero columns.
    scale = np.where(scale == 0, 1, scale)

    new_train_data = np.divide(train_data - offset, scale)
    new_test_data = np.divide(test_data - offset, scale)
    return new_train_data, new_test_data

In [100]:
# `type` is left at None here, so normalize_data hands both arrays back
# unchanged; pass 'standard', 'l1' or 'l2' to actually rescale them.
normalized_train, normalized_test = normalize_data(new_train, new_test, type=None)

# First training row, followed by the shapes of both matrices.
print(
    normalized_train[0],
    normalized_train.shape,
    normalized_test.shape,
    sep='\n',
)

[0 1 1 ... 0 0 0]
(3734, 9523)
(1840, 9523)


## Model with self-implemented normalization

In [101]:
# NOTE(review): this calls fit on the existing `model` object again, now
# with the bag-of-words features, so whatever it was fitted on before is
# replaced -- confirm that reusing the same estimator is intended.
model.fit(X=normalized_train, y=training_labels)

pred = model.predict(normalized_test)
score = f1_score(y_true=testing_labels, y_pred=pred)
print(score)

0.9423868312757202
