In [21]:
#Dependencies
import pandas as pd
import psycopg2
from keys import db_user, db_password, db_name, db_host, db_port
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle

In [22]:
# Labelled corpus: every title in sus_user_posts gets is_bot = 1, and a random
# sample of 9000 rows from norm_user_posts gets is_bot = 0.
# NOTE(review): plain `union` drops duplicate (text, is_bot) rows, and
# `order by random()` draws a different negative sample on every execution,
# so the training data changes from run to run.
sql = """
select title as "text", 1 as "is_bot" from sus_user_posts
union
select title as "text", 0 as "is_bot" from (
	select * from 
	norm_user_posts order by random() limit 9000) as foo
"""
# English stop words from NLTK, held in a set.
stop = set(stopwords.words('english'))

In [23]:
# Open the PostgreSQL connection with the settings imported from `keys`,
# then create the cursor used to run the labelling query.
conn = psycopg2.connect(
    dbname=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=db_port,
)
cur = conn.cursor()

In [24]:
# Run the labelling query and pull every (text, is_bot) row into memory.
cur.execute(sql)
query = cur.fetchall()
# Start with empty containers for the post texts and their labels.
posts, bot_status = [], []

In [25]:
# Split the fetched rows into two parallel lists: column 0 is the post text,
# column 1 is the is_bot label.
# The lists are rebuilt from `query` rather than appended to, so re-running
# this cell no longer duplicates every row.
posts = [row[0] for row in query]
bot_status = [row[1] for row in query]

In [26]:
# Settings shared by both vectorizer variants.
# CountVectorizer documents `stop_words` as a list (newer scikit-learn
# releases reject a set), so the stop-word set is converted here.
cv_options = dict(stop_words=sorted(stop), max_df=0.5, min_df=1, lowercase=True,
                  ngram_range=(1,1), strip_accents='ascii')
try:
    # NOTE: pickle.load can execute arbitrary code; only load a file that this
    # project wrote itself.
    with open('features_weights.pkl', 'rb') as f:
        feature_weight_list = pickle.load(f)
except (OSError, pickle.UnpicklingError, EOFError) as exc:
    # Missing or unreadable weights file: fall back to learning the vocabulary
    # from the corpus. Only load failures are caught, so unrelated errors
    # (and KeyboardInterrupt) are no longer silently swallowed.
    print("Could not load saved feature weights: {!r}".format(exc))
    cv = CountVectorizer(**cv_options)
    print("Using full corpus")
else:
    # Restrict the vocabulary to the previously selected features.
    cv = CountVectorizer(vocabulary=feature_weight_list, **cv_options)
    print("Using saved feature weights")

Using saved feature weights


In [27]:
# cv = CountVectorizer(stop_words=stop, max_df=0.5, min_df=1, lowercase=True, ngram_range=(1,1), strip_accents='ascii')

In [28]:
# Term counts -> TF-IDF weights -> dense DataFrame with one column per
# vocabulary term.
word_count_vector = cv.fit_transform(posts)
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
# fit_transform is fit() followed by transform() on the same matrix.
tfidf_vector = tfidf_transformer.fit_transform(word_count_vector)
feature_names = cv.get_feature_names_out()
df = pd.DataFrame(data=tfidf_vector.toarray(), columns=feature_names)
# Attach the labels alongside the features.
df['is_bot'] = bot_status



In [29]:
# Every column except the label is a feature; hold out 20% of the rows for testing.
y = df['is_bot']
X = df.drop(columns=['is_bot'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

def pipe_maker(classifier):
    """Wrap `classifier` in a single-step Pipeline whose step is named 'clf'.

    The step name is what makes grid keys such as 'clf__n_estimators' resolve.
    """
    return Pipeline(steps=[('clf', classifier)])
def gridsearch_maker(pipeline, params, n_folds=5, scoring='f1_weighted'):
    """Build an unfitted GridSearchCV over `pipeline`.

    Parameters
    ----------
    pipeline : estimator
        The estimator (typically from `pipe_maker`) to tune.
    params : dict
        Parameter grid, passed through as `param_grid`.
    n_folds : int, default 5
        Cross-validation setting forwarded as `cv`.
    scoring : str, default 'f1_weighted'
        Metric used to rank parameter combinations.

    Returns
    -------
    GridSearchCV
        The configured, not-yet-fitted search object.
    """
    return GridSearchCV(pipeline, param_grid=params, cv=n_folds, scoring=scoring)
def find_best_recall(gridsearch, X=None, y=None):
    """Fit `gridsearch`, print its best score/params/estimator, and return the best estimator.

    Despite the name, the reported score is whatever metric `gridsearch` was
    configured with; this function does not select recall itself.

    Parameters
    ----------
    gridsearch : object
        Search object exposing `fit(X, y)`, `best_score_`, `best_params_`
        and `best_estimator_`.
    X, y : optional
        Training features and labels. When omitted, the notebook-level
        `X_train` / `y_train` are used, which was the original behaviour.

    Returns
    -------
    The search's `best_estimator_`.
    """
    # Fall back to the notebook globals only when the caller passed nothing.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    gridsearch.fit(X, y)
    print('Best score:', gridsearch.best_score_)
    print('Best parameters:', gridsearch.best_params_)
    print('Best estimator:', gridsearch.best_estimator_)
    return gridsearch.best_estimator_

In [31]:
# NOTE: despite the "etclf" suffix, the estimator being tuned here is a
# RandomForestClassifier.
pipe_etclf = pipe_maker(RandomForestClassifier())
params_etclf = {
    'clf__n_estimators': list(range(10, 101, 10)),
    'clf__criterion': ['gini', 'entropy'],
    # 'auto' is omitted: for classifiers it meant the same as 'sqrt' (a
    # duplicated grid point) and it was removed in scikit-learn 1.3.
    'clf__max_features': ['sqrt', 'log2'],
    'clf__max_depth': [None] + list(range(10, 101, 10)),
}

In [32]:
# find_best_recall(gridsearch_maker(pipe_etclf, params_etclf))

In [33]:
# Final model configuration. random_state is pinned (to the same seed as the
# train/test split) so the reported scores and the exported model can be
# reproduced on a re-run.
clf = RandomForestClassifier(n_estimators=100, criterion='entropy', max_features='log2',
                             max_depth=None, n_jobs=-1, random_state=0)

In [34]:
# Train on the training split, then report accuracy on both splits.
clf.fit(X_train, y_train)
print('Training set score: {:.3f}'.format(clf.score(X_train, y_train)))
print('Test set score: {:.3f}'.format(clf.score(X_test, y_test)))
# Predict once and reuse the result for both reports instead of re-running
# inference for each of them.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Training set score: 0.991
Test set score: 0.768
              precision    recall  f1-score   support

           0       0.74      0.83      0.78      1743
           1       0.81      0.70      0.75      1711

    accuracy                           0.77      3454
   macro avg       0.77      0.77      0.77      3454
weighted avg       0.77      0.77      0.77      3454

[[1454  289]
 [ 514 1197]]


In [35]:
print(len(X.columns))
column_names = X.columns
weights = clf.feature_importances_
# Keep only the features whose importance exceeds 1e-5, then sort by weight.
# The frame is built in one vectorized step: DataFrame.append inside a loop is
# quadratic and was removed in pandas 2.0.
keep = weights > 0.00001
df_weights = pd.DataFrame({'feature': column_names[keep], 'weight': weights[keep]})
df_weights = df_weights.sort_values(by='weight', ascending=False)

9936


In [36]:
# Number of rows in df_weights (one row per retained feature).
len(df_weights)

7157

In [37]:
# Preview the first 15 rows of df_weights.
df_weights.head(15)

Unnamed: 0,feature,weight
0,police,0.009684
1,cops,0.007637
2,cop,0.00744
3,black,0.006825
4,america,0.006773
6,clinton,0.005389
5,trump,0.005232
9,man,0.004115
7,hillary,0.004111
8,american,0.004022


In [38]:
# with open('features_weights_v2.pkl', 'wb') as f:
#     pickle.dump(df_weights["feature"], f)

In [39]:
# Persist the trained classifier to disk as a pickle file.
with open('clf_v2.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)