In [16]:
import pickle
import numpy as np

## Importing Data

### TF-IDF

In [17]:
# Load the pickled TF-IDF bundle, then unpack its five entries in one step.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# this assumes the .pkl file was produced by this project — confirm.
with open('../DATA/tfidf_data.pkl', 'rb') as f:
    data = pickle.load(f)

(
    X_train_tfidf,
    X_test_tfidf,
    y_train_tfidf,
    y_test_tfidf,
    vectorizer_tfidf,
) = (data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test', 'vectorizer'))

### Count Vectorizer

In [18]:
# Load the pickled Count Vectorizer bundle, then unpack its five entries.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# this assumes the .pkl file was produced by this project — confirm.
with open('../DATA/count_data.pkl', 'rb') as f:
    data = pickle.load(f)

(
    X_train_count,
    X_test_count,
    y_train_count,
    y_test_count,
    vectorizer_count,
) = (data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test', 'vectorizer'))

### Train Models

In [19]:
# Sanity check: print the shape of every feature matrix and label array.
# The leading "\n" on the first count caption separates the two groups.
shape_report = (
    ("X_train_tfidf shape:", X_train_tfidf),
    ("y_train_tfidf shape:", y_train_tfidf),
    ("X_test_tfidf shape:", X_test_tfidf),
    ("y_test_tfidf shape:", y_test_tfidf),
    ("\nX_train_count shape:", X_train_count),
    ("y_train_count shape:", y_train_count),
    ("X_test_count shape:", X_test_count),
    ("y_test_count shape:", y_test_count),
)

for caption, array in shape_report:
    print(caption, array.shape)

X_train_tfidf shape: (127656, 5000)
y_train_tfidf shape: (127656, 6)
X_test_tfidf shape: (31915, 5000)
y_test_tfidf shape: (31915, 6)

X_train_count shape: (127656, 5000)
y_train_count shape: (127656, 6)
X_test_count shape: (31915, 5000)
y_test_count shape: (31915, 6)


In [26]:
from sklearn.linear_model import Perceptron
import numpy as np

# Reduce the multi-column targets to one binary task by keeping column 0 only.
# NOTE(review): column 0 is treated as the "toxic" label; the column order is
# not visible in this notebook — confirm against the preprocessing step.
y_train_toxic_tfidf, y_test_toxic_tfidf = (
    labels[:, 0].astype(int) for labels in (y_train_tfidf, y_test_tfidf)
)
y_train_toxic_count, y_test_toxic_count = (
    labels[:, 0].astype(int) for labels in (y_train_count, y_test_count)
)

# Fit one default-configured perceptron per feature representation.
# fit() returns the estimator itself, so construction and training are chained.
tfidf_perceptron = Perceptron().fit(X_train_tfidf, y_train_toxic_tfidf)
count_perceptron = Perceptron().fit(X_train_count, y_train_toxic_count)

print("Training completed.")

Training completed.


### Comparing Models

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the train and test splits for each feature representation.
tfidf_train_pred, tfidf_test_pred = (
    tfidf_perceptron.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)
count_train_pred, count_test_pred = (
    count_perceptron.predict(features) for features in (X_train_count, X_test_count)
)

# Report train/test accuracy for both models, one line each.
# NOTE(review): accuracy alone can mislead if the positive class is rare —
# check the label balance and consider precision/recall/F1 as well.
print(
    f'TF-IDF Perceptron Training Accuracy: {accuracy_score(y_train_toxic_tfidf, tfidf_train_pred):.4f}',
    f'TF-IDF Perceptron Testing Accuracy: {accuracy_score(y_test_toxic_tfidf, tfidf_test_pred):.4f}',
    f'Count Vectorizer Perceptron Training Accuracy: {accuracy_score(y_train_toxic_count, count_train_pred):.4f}',
    f'Count Vectorizer Perceptron Testing Accuracy: {accuracy_score(y_test_toxic_count, count_test_pred):.4f}',
    sep='\n',
)

# TODO: add more ways to compare the two models

TF-IDF Perceptron Training Accuracy: 0.9493
TF-IDF Perceptron Testing Accuracy: 0.9399
Count Vectorizer Perceptron Training Accuracy: 0.9442
Count Vectorizer Perceptron Testing Accuracy: 0.9350


### Evaluating Metrics

In [30]:
# Score each model once on its held-out split, then report which one wins.
# accuracy_score and the *_test_pred arrays come from the comparison cell above.
tfidf_test_accuracy = accuracy_score(y_test_toxic_tfidf, tfidf_test_pred)
count_test_accuracy = accuracy_score(y_test_toxic_count, count_test_pred)

if tfidf_test_accuracy > count_test_accuracy:
    print('TF-IDF performed better on the test set.')
elif count_test_accuracy > tfidf_test_accuracy:
    print('Count Vectorizer performed better on the test set.')
else:
    # Equal scores: do not credit either representation with a win.
    print('TF-IDF and Count Vectorizer performed equally on the test set.')

TF-IDF performed better on the test set.
