In [16]:
#Dependencies
import pandas as pd
import psycopg2
from keys import db_user, db_password, db_name, db_host, db_port
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle

In [17]:
# Training-set query. Every comment longer than 15 characters in
# sus_user_comments is labelled is_bot = 1; a random sample of at most 5311
# comments longer than 10 characters from norm_user_comments is labelled
# is_bot = 0.
# NOTE(review): `union` (unlike `union all`) drops duplicate
# (comment_body, is_bot) rows, so fewer rows than the two selects combined can
# come back -- confirm the de-duplication is intended.
# NOTE(review): the two classes use different minimum lengths (15 vs 10), and
# `order by random()` draws a different sample on every run -- confirm both.
sql = """
select comment_body, 1 as "is_bot" from sus_user_comments where length(comment_body) > 15
union
select comment_body, 0 as "is_bot" from(
	select * from norm_user_comments
	where length(comment_body) > 10
	order by random()
	limit 5311) as foo
"""
# NLTK's English stop-word list, held as a set.
stop = set(stopwords.words('english'))

In [18]:
# Extra tokens to ignore on top of the NLTK list. "gt" appears a lot with
# certain users bulleting their comment submissions; the remaining entries are
# "like" and URL fragments.
custom_stop_words = ["gt", "like", "com", "http", "https", "www"]
stop |= set(custom_stop_words)

In [19]:
# Open a PostgreSQL connection with the settings imported from the keys module
# and create a cursor on it.
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
    dbname=db_name,
)
cur = conn.cursor()

In [20]:
# Run the training-set query and pull every row into memory, then release the
# cursor and the connection. No later cell in this notebook reads from the
# database, and leaving them open holds a server session for the life of the
# kernel. Re-running this cell therefore requires re-running the connect cell.
try:
    cur.execute(sql)
    query = cur.fetchall()  # list of (comment_body, is_bot) tuples
finally:
    cur.close()
    conn.close()

# Parallel lists for the comment text and its 0/1 label.
comments = []
bot_status = []

In [21]:
# Split the (comment_body, is_bot) rows into two parallel lists. Building the
# lists from scratch, instead of appending to lists created in another cell,
# keeps this cell idempotent: re-running it no longer duplicates every comment
# and label.
comments = [row[0] for row in query]
bot_status = [row[1] for row in query]

In [22]:
# Settings shared by both vectorizer variants. stop_words is passed as a list
# because scikit-learn's parameter validation (1.2+) accepts 'english', a list
# or None, and rejects a set.
cv_options = dict(stop_words=sorted(stop), max_df=0.5, min_df=1, lowercase=True,
                  ngram_range=(1, 1), strip_accents='ascii')
try:
    # NOTE: pickle.load can execute arbitrary code -- only load a file that
    # this notebook itself produced.
    with open('comment_features_weights.pkl', 'rb') as f:
        feature_weight_list = pickle.load(f)
    # Restrict the vectorizer to the previously saved feature list.
    cv = CountVectorizer(vocabulary=feature_weight_list, **cv_options)
    print("Using saved feature weights")
except (OSError, pickle.UnpicklingError, EOFError):
    # Missing or unreadable feature file: learn the vocabulary from the corpus,
    # capped at 10000 terms. Only file/unpickling errors are caught so that
    # genuine bugs and KeyboardInterrupt are no longer silently swallowed.
    cv = CountVectorizer(max_features=10000, **cv_options)
    print("Using full corpus")

Using full corpus


In [23]:
# Turn the comments into term counts, then re-weight the counts with smoothed
# TF-IDF.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
word_count_vector = cv.fit_transform(comments)
tfidf_vector = tfidf_transformer.fit_transform(word_count_vector)
feature_names = cv.get_feature_names_out()
# Dense frame: one row per comment, one column per vocabulary term, plus the
# label column.
# NOTE(review): the vocabulary and IDF weights are fitted on all comments
# before the train/test split, so held-out rows influence the features --
# confirm this is acceptable.
df = pd.DataFrame(data=tfidf_vector.toarray(), columns=feature_names)
df['is_bot'] = bot_status

In [24]:
# Target is the 0/1 is_bot label; features are every remaining column.
y = df['is_bot']
X = df.drop(columns=['is_bot'])
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def pipe_maker(classifier):
    """Wrap ``classifier`` in a one-step Pipeline whose step is named 'clf'.

    The step name is what gives grid-search parameter keys their ``clf__``
    prefix.
    """
    steps = [('clf', classifier)]
    return Pipeline(steps)
def gridsearch_maker(pipeline, params):
    """Return an unfitted GridSearchCV over ``params`` for ``pipeline``.

    The search uses 5-fold cross-validation and ranks candidates by weighted F1.
    """
    return GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        scoring='f1_weighted',
        cv=5,
    )
def find_best_recall(gridsearch, X=None, y=None):
    """Fit ``gridsearch``, print its best results, and return the best estimator.

    Despite the name, the reported "best score" is whatever metric the search
    object was configured with, not necessarily recall.

    Parameters
    ----------
    gridsearch : object
        Search object exposing ``fit(X, y)`` and, once fitted, ``best_score_``,
        ``best_params_`` and ``best_estimator_``.
    X, y : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, which is the original behaviour.

    Returns
    -------
    The fitted search's ``best_estimator_``.
    """
    # Fall back to the notebook globals only when the caller supplied nothing,
    # so existing one-argument calls keep working.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    gridsearch.fit(X, y)
    print('Best score:', gridsearch.best_score_)
    print('Best parameters:', gridsearch.best_params_)
    print('Best estimator:', gridsearch.best_estimator_)
    return gridsearch.best_estimator_

In [26]:
# Pipeline and parameter grid for the grid search. Despite the "etclf" suffix,
# the estimator being tuned is a RandomForestClassifier.
pipe_etclf = pipe_maker(RandomForestClassifier())
params_etclf = {'clf__n_estimators': [60, 80, 100, 120, 140, 160, 180, 200],
                'clf__criterion': ['gini', 'entropy'],
                # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
                # For a classifier it meant sqrt(n_features), so 'sqrt' selects
                # the same behaviour on every version.
                'clf__max_features': ['sqrt'],
                'clf__max_depth': [None]}

In [27]:
# best_reg_params = find_best_recall(gridsearch_maker(pipe_etclf, params_etclf))

In [28]:
# Build the random forest based off of the best grid-search params.
# max_features='sqrt' replaces the removed 'auto' alias (same meaning for
# classifiers). random_state pins the forest's randomness so the scores and
# feature weights below are repeatable from run to run.
clf = RandomForestClassifier(n_estimators=180, criterion='entropy', max_features='sqrt',
                             max_depth=None, n_jobs=-1, random_state=0)
# clf = best_reg_params

In [29]:
# Fit on the training split, then report the score on both splits, followed by
# the classification report and confusion matrix for the held-out split.
clf.fit(X_train, y_train)
print('Training set score: {:.3f}'.format(clf.score(X_train, y_train)))
print('Test set score: {:.3f}'.format(clf.score(X_test, y_test)))
# Predict once and reuse the result for both reports instead of running the
# forest over the test set twice.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Training set score: 0.993
Test set score: 0.651
              precision    recall  f1-score   support

           0       0.61      0.78      0.69      1024
           1       0.72      0.53      0.61      1089

    accuracy                           0.65      2113
   macro avg       0.67      0.66      0.65      2113
weighted avg       0.67      0.65      0.65      2113

[[802 222]
 [515 574]]


In [30]:
print(len(X.columns))
column_names = X.columns
weights = clf.feature_importances_
# Table of every feature whose importance exceeds 0.00001, most important
# first. The frame is built in one step from a boolean mask: DataFrame.append
# was removed in pandas 2.0, and appending row by row was quadratic anyway.
# The default 0..k-1 index matches what append(ignore_index=True) produced.
keep = weights > 0.00001
df_weights = pd.DataFrame({'feature': column_names[keep], 'weight': weights[keep]})
df_weights = df_weights.sort_values(by='weight', ascending=False)

10000


In [31]:
# Number of features whose importance passed the 0.00001 threshold.
len(df_weights)

5731

In [32]:
# Show the top rows of df_weights, which is sorted by weight in descending order.
df_weights.head(25)

Unnamed: 0,feature,weight
1296,crypto,0.006101
5104,thanks,0.005162
5150,tie,0.005117
5669,would,0.004805
3404,news,0.004647
3520,one,0.004436
299,amp,0.004055
5281,trump,0.004054
3686,people,0.003966
2158,get,0.003817


In [33]:
# with open('comment_features_weights.pkl', 'wb') as f:
#     pickle.dump(df_weights["feature"], f)

In [34]:
#export clf to pickle file
# with open('clf_v2.pkl', 'wb') as f:
#     pickle.dump(clf, f)

In [35]:
# Class balance check: number of rows per is_bot label.
y.value_counts()

0    5306
1    5259
Name: is_bot, dtype: int64