In [43]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
from _keys import db_user, db_password, db_name, db_host, db_port
import psycopg2
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
stop=set(stopwords.words('english'))
import pickle

In [44]:
with open('rf_model.pkl', 'rb') as f:
    original_model = pickle.load(f)
with open('feature_names.pkl', 'rb') as f:
    origin_feature_names = pickle.load(f)

In [45]:
origin_feature_weights = original_model.feature_importances_
#find feature weights not in stopwords and with a weight larger than 0
main_features = []
for i in range(len(origin_feature_weights)):
    if origin_feature_names[i] not in stop and origin_feature_weights[i] > 0:
        main_features.append(origin_feature_names[i])

In [46]:
conn = psycopg2.connect(dbname=db_name, user=db_user, password=db_password, host=db_host, port=db_port)
sql = """
select *
	from (
		select author, title, 1 as is_bot, id
		from sus_user_posts
		where author in (select distinct author from sus_user_posts) and subreddit in (select subreddit from relevant_subreddit_info where is_relevant = 'yes')
		)as posts_aggregate
union
select *
	from (
		select author, title, 0 as is_bot, id
		from norm_user_posts
		where author in (select distinct author from norm_user_posts limit 1500) and subreddit in (select subreddit from relevant_subreddit_info where is_relevant = 'yes')
		)as norm_agg
"""

In [47]:
cur = conn.cursor()
cur.execute(sql)
output = cur.fetchall()

In [48]:
authors = []
posts = []
bot_status = []
post_ids = []
for i in range(len(output)):
    authors.append(output[i][0])
    posts.append(output[i][1])
    bot_status.append(output[i][2])
    post_ids.append(output[i][3])

In [49]:
cv = CountVectorizer(min_df=1, vocabulary=main_features)
word_count_vector = cv.fit_transform(posts)
tfid_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfid_transformer.fit(word_count_vector)
tf_idf_vectors = tfid_transformer.transform(word_count_vector)
feature_names = cv.get_feature_names_out()
df = pd.DataFrame(tf_idf_vectors.T.todense(), index=feature_names, columns=post_ids)
df = df.T
df['___author'] = authors
df['___bot_status'] = bot_status

In [50]:
df = df.reset_index()
df = df.rename(columns={'index':'post_id'})

In [51]:
X = df.drop(['___author', '___bot_status', "post_id"], axis=1)
y = df['___bot_status']

In [52]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [53]:
# clf = RandomForestClassifier(n_estimators=50, criterion='entropy', max_features='log2', max_depth=None)

In [54]:
# clf.fit(X_train, y_train)
# print('Training set score: {:.3f}'.format(clf.score(X_train, y_train)))
# print('Test set score: {:.3f}'.format(clf.score(X_test, y_test)))
# print(classification_report(y_test, clf.predict(X_test)))
# print(confusion_matrix(y_test, clf.predict(X_test)))

In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def pipe_maker(classifier):
    pipe = Pipeline([('clf', classifier)])
    return pipe
def gridsearch_maker(pipeline, params):
    grid = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='f1_weighted')
    return grid
def find_best_recall(gridsearch):
    gridsearch.fit(X_train, y_train)
    print('Best score:', gridsearch.best_score_)
    print('Best parameters:', gridsearch.best_params_)
    print('Best estimator:', gridsearch.best_estimator_)
    return gridsearch.best_estimator_

In [56]:
pipe_etclf = pipe_maker(ExtraTreesClassifier(),)
params_etclf = {'clf__n_estimators': [40],
                'clf__criterion': ['entropy'],
                'clf__max_features': ['sqrt'],
                'clf__max_depth': [None],
                'clf__min_samples_split': [2, 5, 10, 20, 30, 40, 50],}

In [57]:
# find_best_recall(gridsearch_maker(pipe_etclf, params_etclf))

In [58]:
pipe_etclf = pipe_maker(RandomForestClassifier(),)
params_etclf = {'clf__n_estimators': [50],
                'clf__criterion': ['entropy'],
                'clf__max_features': ['sqrt', 'log2', 'int', 'float', None,],
                'clf__max_depth': [None],
                'clf__min_samples_split': [2],
                'clf__min_samples_leaf': [1],}

In [59]:
# find_best_recall(gridsearch_maker(pipe_etclf, params_etclf))

In [60]:
extra_clf = ExtraTreesClassifier(n_estimators=40, criterion='entropy', max_features='sqrt', max_depth=None)
rand_clf = RandomForestClassifier(n_estimators=50, criterion='entropy', max_features='log2', max_depth=None, min_samples_split=2, min_samples_leaf=1)

In [61]:
def regtest(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    clf.fit(X_train, y_train)
    print('Training set score: {:.3f}'.format(clf.score(X_train, y_train)))
    print('Test set score: {:.3f}'.format(clf.score(X_test, y_test)))
    print(classification_report(y_test, clf.predict(X_test)))
    print(confusion_matrix(y_test, clf.predict(X_test)))
    return clf

In [62]:
regtest(extra_clf, X, y)

Training set score: 0.998
Test set score: 0.784
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1571
           1       0.79      0.81      0.80      1764

    accuracy                           0.78      3335
   macro avg       0.78      0.78      0.78      3335
weighted avg       0.78      0.78      0.78      3335

[[1187  384]
 [ 336 1428]]


ExtraTreesClassifier(criterion='entropy', max_features='sqrt', n_estimators=40)

In [63]:
new_rand_clf = regtest(rand_clf, X, y)

Training set score: 0.998
Test set score: 0.785
              precision    recall  f1-score   support

           0       0.78      0.77      0.77      1571
           1       0.79      0.80      0.80      1764

    accuracy                           0.79      3335
   macro avg       0.78      0.78      0.78      3335
weighted avg       0.79      0.79      0.79      3335

[[1204  367]
 [ 349 1415]]


In [64]:
with open('new_rand_clf_model.pkl', 'wb') as f:
    pickle.dump(new_rand_clf, f)

In [65]:
feature_names = list(X.columns)
with open('new_rand_clf_model_feature_names.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

In [66]:
print(new_rand_clf)

RandomForestClassifier(criterion='entropy', max_features='log2',
                       n_estimators=50)
