In [1]:
# Prerequisites
from multiprocessing import Pool
import pandas as pd
import pickle
import threading
from Levenshtein import distance as lev
import itertools
from sklearn.cluster import DBSCAN
import json
import sys
import dateutil
import pkg_resources
import numpy as np
from dateutil.relativedelta import relativedelta
from datetime import datetime
try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request
import argparse
from tqdm import tqdm

from functools import lru_cache, partial
lru_cache = lru_cache(maxsize=None)
import tmp

## Text Processing and Feature Computation

In [2]:
def tokenizer(text):
    """
    Split a string into tokens on every single space character.

    Consecutive spaces therefore yield empty-string tokens, and an empty
    input yields a list holding one empty string.
    """
    return text.split(sep=' ')

In [3]:
#3
@lru_cache
def jaccard(x, y):
    """
    Tokenize both texts and return their Jaccard distance.

    The distance is the number of tokens present in exactly one of the two
    texts divided by the number of distinct tokens overall. A zero-sized
    union is replaced by 1 so the division is always defined.
    """
    tokens_x = set(tokenizer(x))
    tokens_y = set(tokenizer(y))
    union_size = len(tokens_x | tokens_y)
    if union_size <= 0:
        union_size = 1
    return len(tokens_x ^ tokens_y) / union_size

In [4]:
#4
@lru_cache
def levenshtein(x, y, n=None):
    """
    Return the Levenshtein distance of two texts, scaled by the longer length.

    When ``n`` is given, only the first ``n`` characters of each text are
    compared. If both (possibly truncated) texts are empty the raw distance
    is divided by 1 instead of 0.
    """
    if n is not None:
        x, y = x[:n], y[:n]
    longest = max(len(x), len(y))
    scale = longest if longest > 0 else 1
    return lev(x, y) / scale

In [5]:
#5
def average_jac_lev(x, y):
    """
    Return the mean of the Jaccard and Levenshtein distances of two strings.
    """
    jaccard_part = jaccard(x, y)
    levenshtein_part = levenshtein(x, y)
    return (jaccard_part + levenshtein_part) / 2

In [6]:
#6
def compute_distance(items, distance):
    """
    Build a symmetric pairwise distance matrix for ``items``.

    ``distance`` is called once per unordered pair of items; the result is
    written to both mirrored cells. The diagonal is left at zero.
    """
    matrix = np.zeros((len(items), len(items)))
    values = list(items)
    for row, col in itertools.combinations(range(len(values)), 2):
        matrix[row, col] = matrix[col, row] = distance(values[row], values[col])
    return matrix

In [7]:
#7
def gini(array):
    """
    Calculate the Gini coefficient of a numpy array.

    Parameters
    ----------
    array : array-like of numbers
        Values to measure. Any shape is accepted; it is flattened first.
        The caller's data is never modified.

    Returns
    -------
    int or float
        ``0`` when there are no values, otherwise the Gini coefficient of
        the shifted values (see the comments below).
    """
    # Work on a flattened float copy. The in-place arithmetic below would
    # raise a casting error on integer-dtype input, and operating on a copy
    # keeps the caller's array untouched.
    values = np.asarray(array, dtype=float).flatten()
    if values.size == 0:
        return 0
    lowest = np.amin(values)
    if lowest < 0:
        # The coefficient is computed on non-negative values: shift so min is 0.
        values -= lowest
    # Small offset so an all-zero input does not produce a zero denominator.
    values += 0.0000001
    values = np.sort(values)
    n = values.shape[0]
    index = np.arange(1, n + 1)
    return np.sum((2 * index - n - 1) * values) / (n * np.sum(values))

## Per-account feature computation and model loading

In [8]:
#8
def task(data):
    """
    Compute the feature tuple for one author's comments.

    ``data`` is a 4-tuple ``(author, group, max_comments, params)``.
    ``params`` supplies ``'source'`` (attribute of ``group`` holding the
    texts), ``'func'`` (pairwise distance function) and ``'eps'`` (DBSCAN
    radius).

    Returns ``(author, number of rows used, count of non-zero values in
    group['empty'], number of distinct DBSCAN labels, Gini coefficient of
    the non-zero lower-triangle distances)``.
    """
    author, group, max_comments, params = data
    group = group[:max_comments]

    clustering = DBSCAN(eps=params['eps'], min_samples=1, metric='precomputed')
    texts = getattr(group, params['source'])
    items = compute_distance(texts, params['func'])
    clusters = clustering.fit_predict(items)

    empty_comments = np.count_nonzero(group['empty'])
    pattern_count = len(np.unique(clusters))
    # Boolean mask of the non-zero entries on or below the diagonal.
    nonzero_lower = np.tril(items).astype(bool)
    dispersion = gini(items[nonzero_lower])

    return (author, len(group), empty_comments, pattern_count, dispersion)

In [9]:
#9
def run_function_in_thread(pbar, function, max_value, args=(), kwargs=None):
    """
    Run ``function`` in a worker thread while advancing a progress bar.

    Parameters
    ----------
    pbar : object with an ``n`` attribute and an ``update(amount)`` method
        Progress bar to advance while waiting.
    function : callable
        Work to execute in the background thread.
    max_value : number
        Value ``pbar.n`` is set to once the thread has finished. While
        waiting, the bar is only advanced while it is below
        ``max_value - .3``.
    args : iterable, optional
        Positional arguments for ``function``.
    kwargs : mapping, optional
        Keyword arguments for ``function``.

    Returns
    -------
    Whatever ``function`` returns.

    Raises
    ------
    Exception
        Any exception raised by ``function`` is re-raised in the calling
        thread instead of being lost in the worker (which previously made
        this helper return ``None`` on failure).
    """
    call_args = tuple(args)
    call_kwargs = dict(kwargs) if kwargs else {}
    outcome = {}

    def myrunner():
        try:
            outcome['value'] = function(*call_args, **call_kwargs)
        except Exception as exc:  # handed back to the caller below
            outcome['error'] = exc

    thread = threading.Thread(target=myrunner)
    thread.start()
    while thread.is_alive():
        thread.join(timeout=.1)
        if pbar.n < max_value - .3:
            pbar.update(.1)
    pbar.n = max_value

    if 'error' in outcome:
        raise outcome['error']
    return outcome.get('value')

In [10]:
#10
def get_model():
    """
    Load and return the prediction model stored next to this module.

    The file is named ``model.json`` but is opened in binary mode and
    decoded with ``pickle``, not parsed as JSON.

    NOTE(review): unpickling executes arbitrary code contained in the file;
    only load a model file from a trusted source.
    """
    model_file = pkg_resources.resource_filename(__name__, 'model.json')
    with open(model_file, 'rb') as handle:
        return pickle.load(handle)

In [11]:
#11
# Load the previously collected comments; the first CSV column is the index.
comments = pd.read_csv(
    r'./../Data_frames/diem_diem.csv',
    index_col=0,
)

In [12]:
#12
# Authors need at least this many comments (non-null bodies) to be analysed.
min_comments = 10
# Upper bound on how many of an author's most recent comments are analysed.
max_comments = 100
exclude = [] # accounts to drop from the analysis (empty list: drop none)
accounts = ['JoshLind', 'stale[bot]', 'libra-action', 'dimroc'] # if non-empty, analyse only these accounts

In [13]:
#13
# Keep only authors having at least `min_comments` non-null comment bodies,
# then retain, for each of them, their `max_comments` most recent comments.
df = (
    comments
    .loc[lambda frame: frame['author'].isin(
        frame
        .groupby('author', as_index=False)
        .count()
        .loc[lambda counts: counts['body'] >= min_comments, 'author']
        .values
    )]
    .sort_values('created_at', ascending=False)
    .groupby('author')
    .head(max_comments)
)

In [14]:
#14
# Apply the optional account filters, then warn when nothing is left to
# classify. An empty `exclude` / `accounts` list disables that filter.
if len(exclude) > 0:
    df = df[~df['author'].isin(exclude)]

if len(accounts) > 0:
    df = df[df['author'].isin(accounts)]

if len(df) == 0:
    # Report the configured threshold rather than a hard-coded number so the
    # message stays correct when `min_comments` is changed.
    print(f"At least {min_comments} comments are required for each account to predict the type of the account")

In [15]:
#15
# One work item per author: (author, copy of that author's comments,
# comment cap, distance settings). The distance used is the average of the
# Jaccard and Levenshtein distances over the comment bodies.
# NOTE(review): `inputs` is rebuilt in a later cell with `tmp.average_jac_lev`.
inputs = [
    (
        author,
        group.copy(),
        max_comments,
        {'func': average_jac_lev, 'source': 'body', 'eps': 0.5},
    )
    for author, group in df.groupby('author')
]

In [16]:
#16
def task(data):
    """
    Compute the feature tuple for one author's comments.

    NOTE(review): this repeats the `task` definition from an earlier cell,
    and the worker pool in this cell calls `tmp.task`, not this function.

    ``data`` is a 4-tuple ``(author, group, max_comments, params)``.
    ``params`` supplies ``'source'`` (attribute of ``group`` holding the
    texts), ``'func'`` (pairwise distance function) and ``'eps'`` (DBSCAN
    radius).

    Returns ``(author, number of rows used, count of non-zero values in
    group['empty'], number of distinct DBSCAN labels, Gini coefficient of
    the non-zero lower-triangle distances)``.
    """
    author, group, max_comments, params = data
    group = group[:max_comments]

    clustering = DBSCAN(eps=params['eps'], min_samples=1, metric='precomputed')
    texts = getattr(group, params['source'])
    items = compute_distance(texts, params['func'])
    clusters = clustering.fit_predict(items)

    empty_comments = np.count_nonzero(group['empty'])
    pattern_count = len(np.unique(clusters))
    # Boolean mask of the non-zero entries on or below the diagonal.
    nonzero_lower = np.tril(items).astype(bool)
    dispersion = gini(items[nonzero_lower])

    return (author, len(group), empty_comments, pattern_count, dispersion)

# Build one work item per author, using the importable `tmp` module's
# distance function so the items can be sent to worker processes.
inputs = [
    (
        author,
        group.copy(),
        max_comments,
        {'func': tmp.average_jac_lev, 'source': 'body', 'eps': 0.5},
    )
    for author, group in df.groupby('author')
]

# Compute the features in a process pool; rows arrive in completion order.
with Pool() as pool:
    data = list(tqdm(pool.imap_unordered(tmp.task, inputs), total=len(inputs)))

result = pd.DataFrame(
    data=data,
    columns=['account', 'comments', 'empty comments', 'patterns', 'dispersion'],
)
display(result)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.43s/it]


Unnamed: 0,account,comments,empty comments,patterns,dispersion
0,stale[bot],39,0,1,0.0
1,dimroc,17,0,7,0.03417
2,JoshLind,43,0,28,0.044171
3,libra-action,58,0,18,0.093688


In [18]:
# data = []
# for item in inputs:
#     for result in tqdm(
#         task(item),
#         total=len(inputs)):

#         data.append(result)

5it [00:00, 65948.18it/s]                                                       
5it [00:00, 71331.70it/s]                                                       
5it [00:00, 58092.85it/s]                                                       
5it [00:00, 64927.31it/s]                                                       


In [17]:
#17
# Stage labels for the progress bar; only the first one is used in this cell.
tasks = ['Loading model', 'Making prediction', 'Exporting result']
prediction_progress = tqdm(
    total=25,
    smoothing=.1,
    bar_format='{desc}: {percentage:3.0f}%|{bar}',
    leave=False,
)
prediction_progress.set_description(tasks[0])
# Load the model in a background thread, advancing the bar up to 5.
model = run_function_in_thread(prediction_progress, get_model, 5)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [18]:
#18
# Label each account: a model output of 1 means 'Bot', anything else 'Human'.
result = result.assign(
    prediction=np.where(
        model.predict(
            result[['comments', 'empty comments', 'patterns', 'dispersion']]
        ) == 1,
        'Bot',
        'Human',
    )
)
# The model is not needed after this point.
del model



In [19]:
#19
# Index the table by account and fix the column order for presentation.
result = result.set_index('account').loc[
    :, ['comments', 'empty comments', 'patterns', 'dispersion', 'prediction']
]
prediction_progress.close()

                                                                                                                                                                             

In [20]:
# Show the final per-account table, including the Bot/Human prediction.
result

Unnamed: 0_level_0,comments,empty comments,patterns,dispersion,prediction
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
stale[bot],39,0,1,0.0,Bot
dimroc,17,0,7,0.03417,Human
JoshLind,43,0,28,0.044171,Human
libra-action,58,0,18,0.093688,Bot
