<h1 style="text-align: center">Classifying r/science comments on the basis of whether they should be kept or banned</h1>

<h3>First, let's import the necessary libraries</h3>

In [76]:
import pandas as pd
import sklearn.utils
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np

<h3>Reading the csv files containing reddit comments and their classification (kept or banned)</h3>

In [2]:
train_raw = pd.read_csv('data/reddit_train.csv', encoding = 'latin-1')
test_raw = pd.read_csv('data/reddit_test.csv', encoding = 'latin-1')

<h3>Preprocess training data</h3>

In [41]:
#English Stopwords and Punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(['.','?','!',',','@',':',';',"'","\"","\\","/","(",")",'-','_','+','=','*','&','^','%','$','#','@','`','~',"[","{","}","]",'|',"<",">"])

#Shuffle the dataframe
train_raw = sklearn.utils.shuffle(train_raw)
test_raw = sklearn.utils.shuffle(test_raw)

#Separate text and labels
train_text = list(train_raw["BODY"])
train_labels = list(train_raw["REMOVED"])
test_text = list(test_raw["BODY"])
test_labels = list(test_raw["REMOVED"])

#Remove stopwords and punctuation from sentence
for i,text in enumerate(train_text):
    sentence_processed = ""
    words = word_tokenize(text)
    for word in words:
        if word not in stop_words and word not in punctuation:
            sentence_processed+=word+" "
    train_text[i]=sentence_processed

for i,text in enumerate(test_text):
    sentence_processed = ""
    words = word_tokenize(text)
    for word in words:
        if word not in stop_words and word not in punctuation:
            sentence_processed+=word+" "
    test_text[i]=sentence_processed

In [45]:
#Get "Bag of Words" using CountVectorizer
count_vect = CountVectorizer()
train_text_counts = count_vect.fit_transform(train_text)
test_text_counts = count_vect.transform(test_text)

#Downscale to Term Frequencies
tfidf_transformer = TfidfTransformer()
train_text_tfidf = tfidf_transformer.fit_transform(train_text_counts)
test_text_tfidf = tfidf_transformer.fit_transform(test_text_counts)

<h3>Train a Naive Bayes Multinomial Classifier</h3>

In [46]:
clf = MultinomialNB().fit(train_text_tfidf, train_labels)

<h3>Evaluate NB Multinomial Classifier</h3>

In [56]:
predicted = clf.predict(test_text_tfidf)
acc = np.mean(predicted==test_labels)
print(acc)

0.6946983546617916


<h3>Achieved a 69.47% accuracy with the NB Multinomial Classifier</h3>

<h3>Train a Support Vector Machine (SVM)</h3>

In [73]:
clf_svm = SGDClassifier(loss='hinge', penalty='l2',
                        random_state=42, max_iter=5, 
                        tol=None).fit(train_text_tfidf, train_labels)


<h3>Evaluate SVM Classifier</h3>

In [74]:
predicted = clf_svm.predict(test_text_tfidf)
acc = np.mean(predicted==test_labels)
print(acc)

0.7404021937842779


<h3>Achieved a 74.04% accuracy with the SVM</h3>

<h3>Train a Multi-Layer Perceptron Classifier (MLP)</h3>

In [87]:
clf_mlp = MLPClassifier(random_state=42, 
                        early_stopping=True, 
                        n_iter_no_change=5).fit(train_text_tfidf,train_labels)

<h3>Evaluate MLP Classifier</h3>

In [88]:
predicted = clf_mlp.predict(test_text_tfidf)
acc = np.mean(predicted==test_labels)
print(acc)

0.7515117423709745


<h3>Achieved a 75.15% accuracy with the MLP</h3>