In [47]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("soham.csv", sep=',')
data.columns = ['index', 'user_index', 'body_text', '#_comments', '#_subs', 'membership_duration','#_uploads','profanity_in_name','age', 'label']

del data['index']
del data['user_index']
del data['membership_duration']

def hate_or_not(number):
    if number == 0:
        number = 'not_hate'
    if number == 1:
        number = 'hate'
    return number

def clean_text(text): 
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text
    
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))
data['label'] = data['label'].apply(lambda x: hate_or_not(x))


data['body_text'] = data['body_text'].apply(lambda x: clean_text(x))

data.head()



Unnamed: 0,body_text,#_comments,#_subs,#_uploads,profanity_in_name,age,label,body_len,punct%
0,"[nebodyels, hear, crazi, ass, screamin, hoe, e...",10,1,3,0,15,not_hate,395,8.1
1,"[mani, thing, incorrect, comment, unbeliev, gu...",3,0,5,0,31,not_hate,506,4.2
2,"[326, hahah, boyfriend, show, song, love, tooo...",7,0,5,0,43,hate,145,3.4
3,"[dick, beyonc, fuck, ass, hole, trueli, dog, b...",34,0,5,0,44,hate,670,1.9
4,"[donghaetaemin, kai, luhansehun, bacon, xd, ta...",11,173,5,0,21,not_hate,439,4.6


In [50]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time


In [160]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%', '#_comments', '#_subs', '#_uploads','profanity_in_name','age']], data['label'], test_size=0.2)

In [161]:

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])


tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_len', 'punct%', '#_comments', '#_subs', '#_uploads','profanity_in_name','age']].reset_index(drop=True), pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%', '#_comments', '#_subs', '#_uploads','profanity_in_name','age']].reset_index(drop=True), pd.DataFrame(tfidf_test.toarray())], axis=1)


In [162]:
rf = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect.values, y_train.values)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect.values)
end = time.time()
pred_time = (end - start)


precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='hate', average='binary')
print('Fit time: {} / Predict time: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time, 3), round(pred_time, 3),
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 0.42 / Predict time: 0.045 / Precision: 0.0 / Recall: 0.0 / Accuracy: 0.877


  _warn_prf(average, modifier, msg_start, len(result))


In [156]:
gb = GradientBoostingClassifier(n_estimators=50, max_depth=7)

start = time.time()
gb_model = gb.fit(X_train_vect.values, y_train.values)
end = time.time()
fit_time = (end - start)


start = time.time()
y_pred = gb_model.predict(X_test_vect.values)
end = time.time()
pred_time = (end - start)


precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='hate', average='binary')
print('Fit time: {} / Predict time: {} / Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time, 3), round(pred_time, 3),
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 9.058 / Predict time: 0.039 / Precision: 0.333 / Recall: 0.012 / Accuracy: 0.883
