In [1]:
from _keys import db_user, db_password, db_name, db_host, db_port
import psycopg2
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# lemmatokenizer

from nltk.tokenize import word_tokenize
stop=set(stopwords.words('english'))

In [2]:
sql ="""
select *
	from (
		select author, lower(string_agg(title,'')) as corpus, 1 as is_bot
		from sus_user_posts
		where author in (select distinct author from sus_user_posts limit 500) and subreddit !='u_reddit'
		group by sus_user_posts.author
		having length(lower(string_agg(title,''))) >= 50)
		as posts_aggregate
	union
select *
	from (
		select author, lower(string_agg(title,'')) as corpus, 0 as is_bot
		from norm_user_posts
		where author in (select distinct author from norm_user_posts limit 500) and subreddit !='u_reddit'
		group by norm_user_posts.author
		having length(lower(string_agg(title,''))) >= 50)
		as norm_aggregate
"""

In [3]:
corpus_sql = """
select string_agg(corpus, '') as corpus from(
select *
	from (
		select author, lower(string_agg(title,'')) as corpus, 1 as is_bot
		from sus_user_posts
		where author in (select distinct author from sus_user_posts limit 500) and subreddit !='u_reddit'
		group by sus_user_posts.author
		having length(lower(string_agg(title,''))) >= 50)
		as posts_aggregate
	union
select *
	from (
		select author, lower(string_agg(title,'')) as corpus, 0 as is_bot
		from norm_user_posts
		where author in (select distinct author from norm_user_posts limit 500) and subreddit !='u_reddit'
		group by norm_user_posts.author
		having length(lower(string_agg(title,''))) >= 50)
		as norm_aggregate
	) as foo
"""

In [4]:
conn = psycopg2.connect(dbname=db_name, user=db_user, password=db_password, host=db_host, port=db_port)
posts_df = pd.read_sql(sql, conn)

In [5]:
cur = conn.cursor()
cur.execute(corpus_sql)
overall_corpus = cur.fetchall()[0][0]

In [6]:
def tfid_vectorize(row):
    try:
        corpus = row['corpus']
        author_name = row['author']
        is_bot_status = row['is_bot'].astype(int)
        lem = WordNetLemmatizer()
        lemmatized_words = []
        tokens = word_tokenize(corpus)
        for token in tokens:
            if token not in stop:
                lemmatized_words.append(lem.lemmatize(token))
        vectorizer = TfidfVectorizer(use_idf=True, min_df=1)
        # vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, min_df=1)
        vectors = vectorizer.fit_transform(lemmatized_words)
        vector_0 = vectors[0]
        tfids = pd.DataFrame(vector_0.T.todense(), index=vectorizer.get_feature_names_out(), columns=['tfid'])
        tfids_transposed = tfids.transpose()
        tfids_transposed['author'] = author_name
        tfids_transposed['is_bot'] = is_bot_status
        return tfids_transposed
    except Exception as e:
        print(f'{e} --- {row}')
        next


In [7]:
def initialize_tfidf_df(master_corpus):
    data = {'author': ["DUMMY"], 'is_bot': [0.0], 'corpus': master_corpus}
    df = pd.DataFrame(data=data, index = [0])
    df = tfid_vectorize(df.iloc[0])
    df = df.drop(df.index[0])
    return df

In [8]:
df = initialize_tfidf_df(overall_corpus)

In [9]:
for i in range(0,len(posts_df)):
    df = df.append(tfid_vectorize(posts_df.iloc[i]))

empty vocabulary; perhaps the documents only contain stop words --- author                                         Clockthewhat
corpus    what is she up to?what is she up to?what is sh...
is_bot                                                    0
Name: 99, dtype: object


In [10]:
df = df.fillna(0)
X = df.drop(['author', 'is_bot'], axis=1)
y = df['is_bot']

In [52]:
#find columns in X where they have duplicate values
duplicate_columns = [col for col in X.columns if X[col].nunique() < len(X[col])]


In [57]:
X.shape

(639, 27370)

In [12]:
y.value_counts()

0    499
1    140
Name: is_bot, dtype: int64

In [13]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)   


In [15]:
y_test.value_counts()

0    199
1     57
Name: is_bot, dtype: int64

In [23]:
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
clf.fit(X_train, y_train)
# print(clf.score(X_train, y_train))
# print(clf.score(X_test, y_test))
print(confusion_matrix(y_test, clf.predict(X_test), labels=[0,1]))
print(classification_report(y_test, clf.predict(X_test), zero_division=0))

[[188  11]
 [ 50   7]]


In [42]:
#import tensor flow for a neuran network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [49]:
nn_model = tf.keras.Sequential()
nn_model.add(layers.Dense(64, activation='relu', input_shape=(X.shape[1],)))
nn_model.add(layers.Dense(10, activation='relu'))
nn_model.add(layers.Dense(1, activation='sigmoid'))
nn_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                1751744   
_________________________________________________________________
dense_4 (Dense)              (None, 10)                650       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 1,752,405
Trainable params: 1,752,405
Non-trainable params: 0
_________________________________________________________________


In [50]:
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [51]:
fit_model = nn_model.fit(X_train, y_train, epochs=100)

RuntimeError: Data adapters should be mutually exclusive for handling inputs. Found multiple adapters [<class 'keras.engine.data_adapter.TensorLikeDataAdapter'>, <class 'keras.engine.data_adapter.GeneratorDataAdapter'>] to handle input: <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.series.Series'>