In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from simpletransformers.classification import ClassificationModel, ClassificationArgs

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import class_weight

In [21]:
# Load the worker-level data (includes Throughput & WorkTime) and keep only
# rows that have no missing value in any column.
df = pd.read_csv('./data/kokil dec 6 reprepare/conf_pc_worker_sem.csv').dropna()

In [22]:
# Hold out 20% of the rows, stratified on the deception quadrant so that both
# partitions keep the same label proportions.
stratify_col = "Input.deception_quadrant"
y = df[stratify_col].copy()
X = df[[stratify_col]]
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
splits_generator = sss.split(X, y)

# The splitter is configured for a single split; draining the generator and
# keeping the last pair mirrors a loop that overwrites on every iteration.
indices_train, indices_test = list(splits_generator)[-1]

# The split yields positional indices, hence take() rather than label lookup.
train, test = df.take(indices_train), df.take(indices_test)

In [23]:
# Binary target: 1 = "Straightforward", 0 = every other deception quadrant.
# A value that is already 1 stays 1, so re-running this cell after `train` has
# been encoded does not collapse every label to 0.
label_col = 'Input.deception_quadrant'
new_deception_train = train[label_col].apply(
    lambda x: 1 if x in ("Straightforward", 1) else 0
)

# Rebind `train` to a frame that carries the encoded labels. assign() returns
# a new frame, so nothing is written in place into the result of df.take().
train = train.assign(**{label_col: new_deception_train})

y_train_deception = new_deception_train.to_numpy()

# 'balanced' weights are inversely proportional to class frequency.
# np.unique returns the labels sorted, so the weights are ordered
# [class 0, class 1]. `classes` and `y` are passed by keyword because recent
# scikit-learn releases made them keyword-only.
deception_class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train_deception),
    y=y_train_deception,
)



In [24]:
# Keep only the message text and its label for modelling.
model_columns = ['Input.full_text', 'Input.deception_quadrant']
X_train, X_test = train[model_columns], test[model_columns]

In [25]:
# Switch both partitions to the short 'text' / 'labels' column names used
# from here on.
column_aliases = {
    'Input.full_text': 'text',
    'Input.deception_quadrant': 'labels',
}
X_train = X_train.rename(columns=column_aliases)
X_test = X_test.rename(columns=column_aliases)

In [26]:
# The training labels were already mapped to 0/1 upstream, so verify that
# invariant instead of re-assigning the column to an identity-mapped copy of
# itself (which changed nothing).
assert X_train['labels'].isin([0, 1]).all(), "training labels must be encoded as 0/1"

X_train

Unnamed: 0,text,labels
31,The reason is that: if I help you to Marseille...,1
1787,I figured this split would give you land and s...,1
3921,I understand your concern and will not move St...,1
4319,I believe Spa NC is naturally yours and have n...,1
1513,"If you don't want a rebuild in Brest, we'll si...",1
...,...,...
122,Even if me and France are high-fiving in Bel f...,1
10923,"To give you my perspective, when I play German...",1
5643,There's a small chance you can just do A Hol-K...,1
8943,Let's be aggressive with france so we can swit...,1


In [27]:
# Model types: https://simpletransformers.ai/docs/classification-specifics/
# Model names come from the Hugging Face model hub, e.g.
#   bert / bert-base-cased ; distilbert / distilbert-base-cased ;
#   xlnet / xlnet-base-cased ; roberta / roberta-base

model = ClassificationModel(
    model_type='distilbert',
    model_name='distilbert-base-cased',
    # Per-class loss weights computed from the training labels.
    weight=list(deception_class_weights),
    # `use_cuda` is a constructor parameter of ClassificationModel; placed
    # inside `args` it is not the value the constructor reads.
    use_cuda=True,
    args={'reprocess_input_data': True,
          'num_train_epochs': 1,
          'learning_rate': 3e-5,
          'train_batch_size': 1,
          'overwrite_output_dir': True,
          # Fixed seed so a Restart & Run All reproduces the same training run.
          'manual_seed': 0,
         }
)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=411.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=263273408.0), HTML(value='')))




Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=213450.0), HTML(value='')))




In [28]:
# Fine-tune on the training frame; the returned pair is shown as the cell output.
global_step, training_details = model.train_model(train_df=X_train)
global_step, training_details

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9092.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=9092.0), HTML(value='')))







(9092, 0.3217317360441211)

In [29]:
# Run the trained model over the held-out message texts.
test_texts = X_test['text'].tolist()
predictions, raw_outputs = model.predict(test_texts)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2274.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=285.0), HTML(value='')))




In [30]:
# Inspect the held-out frame. NOTE(review): only `train` had its labels mapped
# to 0/1 earlier, so `labels` here still holds the raw quadrant strings.
# Encode them the same way before comparing against `predictions`.
X_test

Unnamed: 0,text,labels
7269,A combination of France in Spain (South coast)...,Straightforward
3126,Thats what Im suspecting is about to happen to...,Straightforward
11054,"Well, BOT is probably going to Sweden, so that...",Straightforward
6649,The last message before your hello was me aski...,Straightforward
6274,I have an ask for you but idk how realistic it is,Straightforward
...,...,...
9360,I know you didn't say anything about sev to ar...,Straightforward
9378,"Anyways, I know that we mutually have absolutl...",Cassandra
5575,But it's pretty obvious now looking at it.,Straightforward
304,"However, remember that thing I said about gene...",Straightforward


In [31]:
# Sanity check: does the model ever predict class 0 on the held-out set?
any(label == 0 for label in predictions)

False