## Text Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

class FreqDist(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns each document into its own frequency record.

    Every document is vectorized independently with a fresh ``CountVectorizer``,
    so the vocabulary of one record never contains tokens from another document.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Build one frequency record per document in ``X``.

        Returns a list of dicts. Each dict has ``'feature_names'`` (the
        vocabulary the vectorizer found in that document) and
        ``'frequencies'`` (the dense count array for that single document).

        NOTE(review): CountVectorizer is documented to raise ValueError when a
        document yields no tokens -- confirm inputs always contain at least one.
        """
        records = []
        for document in X:
            vectorizer = CountVectorizer()
            counts = vectorizer.fit_transform([document])
            records.append(dict(
                feature_names=vectorizer.get_feature_names_out(),
                frequencies=counts.toarray(),
            ))
        return records

def lexical_diversity(freq_dist):
    """Return the number of feature names divided by the total of all counts.

    ``freq_dist`` is a mapping with a sized ``'feature_names'`` entry and a
    ``'frequencies'`` entry that exposes ``.sum()``.
    """
    distinct_tokens = len(freq_dist['feature_names'])
    total_tokens = freq_dist['frequencies'].sum()
    return distinct_tokens / total_tokens

def repetition_score(freq_dist):
    """Return the spread of the counts relative to their mean.

    Computed as ``(max - min) / mean`` over ``freq_dist['frequencies']``; a
    steeper frequency profile (more repetition) yields a larger score.
    """
    counts = freq_dist['frequencies']
    spread = counts.max() - counts.min()
    return spread / counts.mean()

## Training Pipeline

## Load Data

In [None]:
import pandas as pd

# Read the train and test CSV files into DataFrames. The paths are relative,
# so they resolve against the notebook's current working directory.
train_df = pd.read_csv('datasets/final_train.csv')
test_df = pd.read_csv('datasets/final_test.csv')

## Preprocess

In [None]:
def filter_short(text, min_length=10):
    """Return True when ``text`` is longer than ``min_length`` characters.

    Despite the name, this is a *keep* predicate: it is True for texts that
    should survive filtering and False for those that are too short.

    Args:
        text: The value to measure, normally a string.
        min_length: Length that ``text`` must strictly exceed. Defaults to 10.

    Returns:
        ``len(text) > min_length``, or False when ``text`` has no length
        (for example ``None`` or the float NaN used for missing cells).
    """
    try:
        return len(text) > min_length
    except TypeError:
        # Missing values have no length; treat them as too short rather than
        # letting one bad row abort the whole filtering pass.
        return False

# Keep only the rows whose text passes filter_short.
train_df = train_df.loc[train_df['text'].map(filter_short)]
test_df = test_df.loc[test_df['text'].map(filter_short)]

# Separate the text inputs from the labels for each split.
X_train, y_train = train_df['text'], train_df['label']
X_test, y_test = test_df['text'], test_df['label']

# Report the shape of every split as a quick sanity check.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
# Display how many training rows carry each label.
y_train.value_counts()

In [None]:
# NOTE(review): a notebook cell only displays its last expression, so the
# training counts computed on the first line are discarded here; only the test
# label counts are shown.
y_train.value_counts()
y_test.value_counts()

## Model bake-off

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.classifier import ClassificationReport
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import ConfusionMatrix
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV

# Feature-extraction stages: each text becomes a frequency record, and the
# union then emits one column per hand-crafted score computed from that record.
pipeline = Pipeline(steps=[
    ('freq_dist', FreqDist()),
    ('union', FeatureUnion([
        (
            'lexical_diversity',
            FunctionTransformer(lambda dists: [[lexical_diversity(dist)] for dist in dists]),
        ),
        (
            'repetition_score',
            FunctionTransformer(lambda dists: [[repetition_score(dist)] for dist in dists]),
        ),
    ])),
])

# Estimators to evaluate in the bake-off; commented-out entries are disabled.
# NOTE(review): LinearSVC is not among the imports visible in this cell, so it
# would need to be imported before the last entry is re-enabled.
candidates = [
    #LogisticRegression(random_state=42),
    #MultinomialNB(),
    DecisionTreeClassifier(random_state=42),
    #CalibratedClassifierCV(LinearSVC(random_state=42, dual='auto')),
]

# Fit and score every candidate, keeping each fitted pipeline under the
# estimator's class name.
models = {}
for model in candidates:
    # Build a fresh pipeline per candidate instead of appending to the shared
    # one: appending would stack a second 'model' step as soon as more than one
    # candidate is enabled, and every entry in `models` would alias the same
    # object. The feature steps are reused as-is.
    candidate_pipeline = Pipeline(list(pipeline.steps) + [('model', model)])
    visualizer = ClassificationReport(candidate_pipeline, classes=['human', 'auto'], support=True)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
    models[model.__class__.__name__] = candidate_pipeline
print(models.keys())

In [None]:
# Display the class labels stored on the fitted classifier (the last step of
# the saved pipeline).
# NOTE(review): presumably used to check the label order against the
# ['human', 'auto'] names passed to the report -- confirm.
models['DecisionTreeClassifier'].steps[-1][1].classes_

## Save the best model

In [None]:
import pickle

# Look the estimator up before opening the file: mode 'wb' truncates the target
# immediately, so a failed lookup inside the `with` block would leave an empty
# autogen.pkl behind and clobber any earlier copy.
m = models['DecisionTreeClassifier'].steps[-1][1]
with open('autogen.pkl', 'wb') as f:
    # Only the final estimator is saved, not the feature-extraction steps.
    pickle.dump(m, f)

## Publish to Ensign

In [None]:
import json
import pickle

from pyensign.ensign import Ensign
from pyensign.events import Event

# Fitted classifier to publish: the final step of the stored pipeline.
model = models['DecisionTreeClassifier'].steps[-1][1]

# Metadata values are all strings; the class names are JSON-encoded.
meta = dict(
    model_class=type(model).__name__,
    train_size=str(X_train.shape[0]),
    test_size=str(X_test.shape[0]),
    classes=json.dumps(['human', 'auto']),
)

ensign = Ensign(cred_path='../.streamlit/config.json')

# Wrap the pickled classifier in an event; publishing is left disabled below.
event = Event(
    pickle.dumps(model),
    mimetype="application/python-pickle",
    schema_name='sklearn-model',
    schema_version="0.1.0",
    meta=meta,
)
#await ensign.publish("autogen-model", event)

In [None]:
# Run the query against Ensign and print every message the cursor yields.
# NOTE(review): top-level `await` / `async for` only work where the notebook
# kernel supports async cells -- this will not run as a plain script.
cursor = await ensign.query("SELECT * FROM autogen-model")
async for message in cursor:
    print(message)

In [None]:
# Rebuild the feature-extraction stages: each text becomes a frequency record,
# and the union then emits one column per score computed from that record.
pipeline = Pipeline(steps=[
    ('freq_dist', FreqDist()),
    ('union', FeatureUnion([
        (
            'lexical_diversity',
            FunctionTransformer(lambda dists: [[lexical_diversity(dist)] for dist in dists]),
        ),
        (
            'repetition_score',
            FunctionTransformer(lambda dists: [[repetition_score(dist)] for dist in dists]),
        ),
    ])),
    #('scaler', StandardScaler()),
])

# Rebuild the classifier from the event payload.
# NOTE(security): pickle.loads can execute arbitrary code from its input. Here
# `event` appears to be the object constructed earlier in this notebook; do not
# reuse this pattern on events received from an untrusted publisher.
model_data = pickle.loads(event.data)
# Print the feature matrix for a sample sentence before the model is attached.
print(pipeline.transform(['This is some text']))
# Attach the unpickled classifier as the final pipeline step.
pipeline.steps.append(('model', model_data))
# Decode the JSON-encoded class names from the event metadata and display them.
classes = json.loads(event.meta['classes'])
classes