### Import libraries

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from simpletransformers.classification import ClassificationModel
import sklearn
import itertools
import emoji

In [25]:
df = pd.read_csv("./train.csv")

In [26]:
columns_drop = ['keyword','location']

In [27]:
df.drop(columns=columns_drop,inplace=True)

In [28]:
df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [73]:
fake_tweets = df[df.target == 0]
fake_tweets.shape

(4342, 3)

In [76]:
fake_tweets.head(300)

Unnamed: 0,id,text,target
15,23,,0
16,24,fruits,0
17,25,Summer lovely,0
18,26,,0
19,28,gooaal,0
20,31,ridiculous,0
21,32,London,0
22,33,skiing,0
23,34,wonderful,0
24,36,LOOL,0


### Defining contractions to clean the data

In [29]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"thx"   : "thanks"
}

In [30]:
def remove_contractions(text):
    return contractions[text.lower()] if text.lower() in contractions.keys() else text

In [31]:
df['text']=df['text'].apply(remove_contractions)
df.head()


Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


### Clean the dataset

In [32]:
def clean_dataset(text):
    # Remove hashtag while keeping hashtag text
    text = re.sub(r'#','', text)
    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    # Remove tickers
    text = re.sub(r'\$\w*', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+','', text)
    text = re.sub(r'[ ]{2, }',' ',text)
    # Remove URL, RT, mention(@)
    text=  re.sub(r'http(\S)+', '',text)
    text=  re.sub(r'http ...', '',text)
    text=  re.sub(r'(RT|rt)[ ]*@[ ]*[\S]+','',text)
    text=  re.sub(r'RT[ ]?@','',text)
    text = re.sub(r'@[\S]+','',text)
    # Remove words with 4 or fewer letters
    text = re.sub(r'\b\w{1,4}\b', '', text)
    #&, < and >
    text = re.sub(r'&amp;?', 'and',text)
    text = re.sub(r'&lt;','<',text)
    text = re.sub(r'&gt;','>',text)
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text= ''.join(c for c in text if c <= '\uFFFF') 
    text = text.strip()
    # Remove misspelling words
    text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))
    # Remove emoji
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    text = ' '.join(text.split()) 
    text = re.sub("([^\x00-\x7F])+"," ",text)
    # Remove Mojibake (also extra spaces)
    text = ' '.join(re.sub("[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text).split())
    return text

In [33]:
df['text'] =df['text'].apply(clean_dataset)
df.head()

Unnamed: 0,id,text,target
0,1,Deeds Reason earthquake ALLAH Forgive,1
1,4,Forest Ronge Canada,1
2,5,residents asked shelter place being notified o...,1
3,6,people receive wildfires evacuation orders Cal...,1
4,7,photo Alaska smoke wildfires pours school,1


In [34]:
df.shape

(7613, 3)

### Split the training and validation set

In [35]:
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(df['text'], df['target'], test_size=0.20, random_state=42)

In [36]:
train_df_clean = pd.concat([X_train_clean, y_train_clean], axis=1)
print("Shape of training data set: ", train_df_clean.shape)
print("View of data set: ", train_df_clean.head())

Shape of training data set:  (6090, 2)
View of data set:                                                     text  target
4996  Courageous honest analysis Atomic Hiroshima70 ...       1
3263          shame became engulfed flames boycottBears       0
4907  rescind medals honor given soldiers Massacre W...       1
2855  Worried about drought might affect Extreme Wea...       1
4716                           BlastPower PantherAttack       0


In [37]:
eval_df_clean = pd.concat([X_test_clean, y_test_clean], axis=1)
print("Shape of Eval data set: ", eval_df_clean.shape)

Shape of Eval data set:  (1523, 2)


### BERT Model Training

#### Set up the train arguments

In [38]:
train_args = {
    'evaluate_during_training': True,
    'logging_steps': 100,
    'num_train_epochs': 2,
    'evaluate_during_training_steps': 100,
    'save_eval_checkpoints': False,
    'train_batch_size': 32,
    'eval_batch_size': 64,
    'overwrite_output_dir': True,
    'fp16': False,
    'wandb_project': "visualization-demo"
}

In [39]:
model_BERT = ClassificationModel('bert', 'bert-base-cased', num_labels=2, use_cuda=True, cuda_device=0, args=train_args)

I0310 06:47:44.935707 140666048968448 configuration_utils.py:256] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at /home/priya/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.3d5adf10d3445c36ce131f4c6416aa62e9b58e1af56b97664773f4858a46286e
I0310 06:47:44.936987 140666048968448 configuration_utils.py:292] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddi

#### Train the model

In [40]:
model_BERT.train_model(train_df_clean, eval_df=eval_df_clean)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=6090), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

I0310 06:47:49.057393 140666048968448 run_manager.py:924] system metrics and metadata threads started
I0310 06:47:49.059631 140666048968448 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0310 06:47:49.199309 140666048968448 run_manager.py:951] resuming run from id: UnVuOnYxOmh4bHE0N3o0OnZpc3VhbGl6YXRpb24tZGVtbzpwZHdpdmVkaQ==
I0310 06:47:49.237039 140666048968448 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0310 06:47:49.376586 140661941479168 run_manager.py:1048] saving patches
I0310 06:47:49.378525 140661941479168 run_manager.py:1052] saving pip packages
I0310 06:47:49.382028 140661941479168 run_manager.py:1054] initializing streaming files api
I0310 06:47:49.385227 140661941479168 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers


HBox(children=(IntProgress(value=0, description='Current iteration', max=191, style=ProgressStyle(description_…

Running loss: 0.698800

I0310 06:47:49.959861 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/config.yaml
I0310 06:47:50.076681 140662613411584 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/media/graph/graph_0_summary_f96f05d5.graph.json
I0310 06:47:50.077244 140662613411584 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json
I0310 06:47:50.078046 140662613411584 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-events.jsonl
I0310 06:47:50.078888 140662613411584 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-history.jsonl
I0310 06:47:50.080479 140662613411584 run_manager.py:67

Running loss: 0.423495

I0310 06:48:05.965145 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.637896

I0310 06:48:17.968822 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-events.jsonl


Running loss: 0.527249

I0310 06:48:21.970003 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.586445

I0310 06:48:27.038316 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-summary.json
I0310 06:48:27.052270 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-history.jsonl
I0310 06:48:32.940157 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 06:48:33.171964 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-summary.json
I0310 06:48:33.178357 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-history.jsonl
I0310 06:48:33.467083 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin


Running loss: 0.465774

I0310 06:48:37.172971 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.403124

I0310 06:48:48.175947 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-events.jsonl


Running loss: 0.487192

I0310 06:48:53.177196 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.520999

I0310 06:49:05.627499 140666048968448 configuration_utils.py:118] Configuration saved in outputs/checkpoint-191-epoch-1/config.json


Running loss: 0.301745

I0310 06:49:06.145545 140666048968448 modeling_utils.py:298] Model weights saved in outputs/checkpoint-191-epoch-1/pytorch_model.bin
I0310 06:49:09.195023 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json
I0310 06:49:12.419087 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 06:49:12.938256 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Current iteration', max=191, style=ProgressStyle(description_…

Running loss: 0.357526

I0310 06:49:17.197129 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-summary.json
I0310 06:49:17.199818 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-history.jsonl
I0310 06:49:18.197350 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-events.jsonl


Running loss: 0.338364

I0310 06:49:23.198603 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-summary.json
I0310 06:49:23.199523 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-history.jsonl


Running loss: 0.182843

I0310 06:49:25.199118 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.310647

I0310 06:49:41.203074 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.389972

I0310 06:49:48.205022 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-events.jsonl


Running loss: 0.460994

I0310 06:49:57.207330 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.303775

I0310 06:49:59.207922 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-summary.json
I0310 06:49:59.210373 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-history.jsonl
I0310 06:50:04.545636 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 06:50:04.808708 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin


Running loss: 0.176256

I0310 06:50:05.209357 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-summary.json
I0310 06:50:05.210268 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-history.jsonl


Running loss: 0.207583

I0310 06:50:13.211642 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.259083

I0310 06:50:19.213345 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-events.jsonl


Running loss: 0.410513

I0310 06:50:30.216488 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json


Running loss: 0.240525

I0310 06:50:33.871136 140666048968448 configuration_utils.py:118] Configuration saved in outputs/checkpoint-382-epoch-2/config.json


Running loss: 0.368610

I0310 06:50:34.118834 140666048968448 modeling_utils.py:298] Model weights saved in outputs/checkpoint-382-epoch-2/pytorch_model.bin
I0310 06:50:40.094752 140666048968448 configuration_utils.py:118] Configuration saved in outputs/config.json
I0310 06:50:40.601601 140666048968448 modeling_utils.py:298] Model weights saved in outputs/pytorch_model.bin
I0310 06:50:40.615105 140666048968448 run_manager.py:1068] shutting down system stats and metadata service


Training of bert model complete. Saved to outputs/.


I0310 06:50:41.254659 140666048968448 run_manager.py:1080] stopping streaming files and file change observer
I0310 06:50:41.326479 140662613411584 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-metadata.json
I0310 06:50:41.330187 140666048968448 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_104748-hxlq47z4/wandb-events.jsonl


#### check model performance on validation data

In [41]:
result, model_outputs, wrong_predictions = model_BERT.eval_model(eval_df_clean, acc=sklearn.metrics.accuracy_score)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=1523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

{'mcc': 0.5915974149823142, 'tp': 466, 'tn': 755, 'fp': 119, 'fn': 183, 'eval_loss': 0.45270544787247974, 'acc': 0.8017071569271176}


### Roberta Model Training

#### Setup the model

In [43]:
model_Roberta = ClassificationModel('roberta', 'roberta-base', num_labels=2, use_cuda=True, cuda_device=0, args=train_args)

I0310 09:02:59.605605 140666048968448 filelock.py:274] Lock 140662742471512 acquired on /home/priya/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.a7ab0e5de2d8321d6d6a15b199110f2c99be72976b7d151423cb8d8c261a13b6.lock
I0310 09:02:59.606524 140666048968448 file_utils.py:479] https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json not found in cache or force_download set to True, downloading to /home/priya/.cache/torch/transformers/tmp1fjnwz2k


HBox(children=(IntProgress(value=0, description='Downloading', max=524, style=ProgressStyle(description_width=…

I0310 09:02:59.855488 140666048968448 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json in cache at /home/priya/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.a7ab0e5de2d8321d6d6a15b199110f2c99be72976b7d151423cb8d8c261a13b6
I0310 09:02:59.856231 140666048968448 file_utils.py:492] creating metadata file for /home/priya/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.a7ab0e5de2d8321d6d6a15b199110f2c99be72976b7d151423cb8d8c261a13b6
I0310 09:02:59.856696 140666048968448 filelock.py:318] Lock 140662742471512 released on /home/priya/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.a7ab0e5de2d8321d6d6a15b199110f2c99be72976b7d151423cb8d8c261a13b6.lock
I0310 09:02:59.857347 140666048968448 configuration_utils.py:256] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.

HBox(children=(IntProgress(value=0, description='Downloading', max=501200538, style=ProgressStyle(description_…

I0310 09:03:09.971490 140666048968448 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin in cache at /home/priya/.cache/torch/transformers/228756ed15b6d200d7cb45aaef08c087e2706f54cb912863d2efe07c89584eb7.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e
I0310 09:03:09.972035 140666048968448 file_utils.py:492] creating metadata file for /home/priya/.cache/torch/transformers/228756ed15b6d200d7cb45aaef08c087e2706f54cb912863d2efe07c89584eb7.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e
I0310 09:03:09.972424 140666048968448 filelock.py:318] Lock 140662731611776 released on /home/priya/.cache/torch/transformers/228756ed15b6d200d7cb45aaef08c087e2706f54cb912863d2efe07c89584eb7.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e.lock
I0310 09:03:09.972814 140666048968448 modeling_utils.py:461] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_mode

HBox(children=(IntProgress(value=0, description='Downloading', max=898823, style=ProgressStyle(description_wid…

I0310 09:03:13.771920 140666048968448 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json in cache at /home/priya/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
I0310 09:03:13.774041 140666048968448 file_utils.py:492] creating metadata file for /home/priya/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
I0310 09:03:13.776611 140666048968448 filelock.py:318] Lock 140662731609928 released on /home/priya/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b.lock
I0310 09:03:14.028954 140666048968448 filelock.py:274] Lock 140662731609928 acquired on /home/priya/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b

HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…

I0310 09:03:14.449236 140666048968448 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt in cache at /home/priya/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0310 09:03:14.449947 140666048968448 file_utils.py:492] creating metadata file for /home/priya/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0310 09:03:14.450859 140666048968448 filelock.py:318] Lock 140662731609928 released on /home/priya/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock
I0310 09:03:14.451366 140666048968448 tokenization_utils.py:501] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache a

#### Train the model

In [44]:
model_Roberta.train_model(train_df_clean, eval_df=eval_df_clean)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=6090), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

I0310 09:03:33.974737 140666048968448 run_manager.py:924] system metrics and metadata threads started
I0310 09:03:33.976883 140666048968448 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0310 09:03:34.157704 140666048968448 run_manager.py:951] resuming run from id: UnVuOnYxOjEzMDF4dnUwOnZpc3VhbGl6YXRpb24tZGVtbzpwZHdpdmVkaQ==
I0310 09:03:34.181163 140666048968448 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0310 09:03:34.322329 140661933086464 run_manager.py:1048] saving patches
I0310 09:03:34.324260 140661933086464 run_manager.py:1052] saving pip packages
I0310 09:03:34.327748 140661933086464 run_manager.py:1054] initializing streaming files api
I0310 09:03:34.331110 140661933086464 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers


HBox(children=(IntProgress(value=0, description='Current iteration', max=191, style=ProgressStyle(description_…

Running loss: 0.754046

I0310 09:03:34.869752 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/config.yaml
I0310 09:03:35.025761 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json
I0310 09:03:35.026523 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-history.jsonl
I0310 09:03:35.027258 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-events.jsonl
I0310 09:03:35.027866 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-summary.json
I0310 09:03:35.028478 140662192207616 run_manager.py:677] file/dir created: /home/pr

Running loss: 0.369981

I0310 09:03:50.873364 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.399023

I0310 09:04:02.876613 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-events.jsonl


Running loss: 0.410467

I0310 09:04:06.877681 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.525971

I0310 09:04:12.879320 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-summary.json
I0310 09:04:12.880273 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-history.jsonl
I0310 09:04:18.302709 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 09:04:18.609557 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin
I0310 09:04:18.880850 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-summary.json
I0310 09:04:18.881699 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-history.jsonl


Running loss: 0.408077

I0310 09:04:22.881886 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.573104

I0310 09:04:32.884463 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-events.jsonl


Running loss: 0.356302

I0310 09:04:38.886095 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.284750

I0310 09:04:51.961259 140666048968448 configuration_utils.py:118] Configuration saved in outputs/checkpoint-191-epoch-1/config.json


Running loss: 0.370863

I0310 09:04:52.243735 140666048968448 modeling_utils.py:298] Model weights saved in outputs/checkpoint-191-epoch-1/pytorch_model.bin
I0310 09:04:54.946515 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json
I0310 09:04:58.403270 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 09:04:58.709264 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Current iteration', max=191, style=ProgressStyle(description_…

Running loss: 0.326816

I0310 09:05:02.950666 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-summary.json
I0310 09:05:02.970115 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-events.jsonl
I0310 09:05:02.977103 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-history.jsonl
I0310 09:05:08.951627 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-summary.json
I0310 09:05:08.952592 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-history.jsonl


Running loss: 0.262979

I0310 09:05:10.952171 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.271975

I0310 09:05:26.956253 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.275703

I0310 09:05:33.958224 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-events.jsonl


Running loss: 0.317784

I0310 09:05:42.960611 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.319234

I0310 09:05:45.965392 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-summary.json
I0310 09:05:45.972975 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-history.jsonl


Running loss: 0.319138

I0310 09:05:51.965923 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-summary.json
I0310 09:05:51.966868 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-history.jsonl


Running loss: 0.311203

I0310 09:05:58.968019 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.386319

I0310 09:06:03.969316 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-events.jsonl


Running loss: 0.301595

I0310 09:06:14.972288 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


Running loss: 0.291711

I0310 09:06:21.698886 140666048968448 configuration_utils.py:118] Configuration saved in outputs/checkpoint-382-epoch-2/config.json


Running loss: 0.091935

I0310 09:06:21.996402 140666048968448 modeling_utils.py:298] Model weights saved in outputs/checkpoint-382-epoch-2/pytorch_model.bin
I0310 09:06:28.293530 140666048968448 configuration_utils.py:118] Configuration saved in outputs/config.json
I0310 09:06:28.591836 140666048968448 modeling_utils.py:298] Model weights saved in outputs/pytorch_model.bin
I0310 09:06:28.687106 140666048968448 run_manager.py:1068] shutting down system stats and metadata service


Training of roberta model complete. Saved to outputs/.


I0310 09:06:28.976661 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-events.jsonl
I0310 09:06:29.238617 140666048968448 run_manager.py:1080] stopping streaming files and file change observer
I0310 09:06:29.977487 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_130333-1301xvu0/wandb-metadata.json


#### Evaluate the model

In [45]:
result, model_outputs, wrong_predictions = model_Roberta.eval_model(eval_df_clean, acc=sklearn.metrics.accuracy_score)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=1523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

{'mcc': 0.6223595391558661, 'tp': 459, 'tn': 784, 'fp': 90, 'fn': 190, 'eval_loss': 0.45775141566991806, 'acc': 0.8161523309258043}


### ALBERT Model training

In [46]:
model_albert = ClassificationModel('albert', 'albert-base-v2', num_labels=2, use_cuda=True, cuda_device=0, args=train_args)

I0310 09:10:21.979554 140666048968448 filelock.py:274] Lock 140662731518696 acquired on /home/priya/.cache/torch/transformers/0bbb1531ce82f042a813219ffeed7a1fa1f44cd8f78a652c47fc5311e0d40231.49ede2f5cbd21a453ab03ed1214f9068f024910f34b5023577f3d0068326f7b0.lock
I0310 09:10:21.983073 140666048968448 file_utils.py:479] https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json not found in cache or force_download set to True, downloading to /home/priya/.cache/torch/transformers/tmpcxjmcovj


HBox(children=(IntProgress(value=0, description='Downloading', max=534, style=ProgressStyle(description_width=…

I0310 09:10:22.234151 140666048968448 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json in cache at /home/priya/.cache/torch/transformers/0bbb1531ce82f042a813219ffeed7a1fa1f44cd8f78a652c47fc5311e0d40231.49ede2f5cbd21a453ab03ed1214f9068f024910f34b5023577f3d0068326f7b0
I0310 09:10:22.234591 140666048968448 file_utils.py:492] creating metadata file for /home/priya/.cache/torch/transformers/0bbb1531ce82f042a813219ffeed7a1fa1f44cd8f78a652c47fc5311e0d40231.49ede2f5cbd21a453ab03ed1214f9068f024910f34b5023577f3d0068326f7b0
I0310 09:10:22.235194 140666048968448 filelock.py:318] Lock 140662731518696 released on /home/priya/.cache/torch/transformers/0bbb1531ce82f042a813219ffeed7a1fa1f44cd8f78a652c47fc5311e0d40231.49ede2f5cbd21a453ab03ed1214f9068f024910f34b5023577f3d0068326f7b0.lock
I0310 09:10:22.235548 140666048968448 configuration_utils.py:256] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-con

HBox(children=(IntProgress(value=0, description='Downloading', max=47376696, style=ProgressStyle(description_w…

I0310 09:10:25.050044 140666048968448 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin in cache at /home/priya/.cache/torch/transformers/a175de1d3c60bba6e74bd034c02a34d909d9f36a0cf472b02301c8790ba44834.ab806923413c2af99835e13fdbb6014b24af86b0de8edc2d71ef5c646fc54f24
I0310 09:10:25.050722 140666048968448 file_utils.py:492] creating metadata file for /home/priya/.cache/torch/transformers/a175de1d3c60bba6e74bd034c02a34d909d9f36a0cf472b02301c8790ba44834.ab806923413c2af99835e13fdbb6014b24af86b0de8edc2d71ef5c646fc54f24
I0310 09:10:25.051204 140666048968448 filelock.py:318] Lock 140662629536600 released on /home/priya/.cache/torch/transformers/a175de1d3c60bba6e74bd034c02a34d909d9f36a0cf472b02301c8790ba44834.ab806923413c2af99835e13fdbb6014b24af86b0de8edc2d71ef5c646fc54f24.lock
I0310 09:10:25.051642 140666048968448 modeling_utils.py:461] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_

HBox(children=(IntProgress(value=0, description='Downloading', max=760289, style=ProgressStyle(description_wid…

I0310 09:10:26.311834 140666048968448 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model in cache at /home/priya/.cache/torch/transformers/dd1588b85b6fdce1320e224d29ad062e97588e17326b9d05a0b29ee84b8f5f93.c81d4deb77aec08ce575b7a39a989a79dd54f321bfb82c2b54dd35f52f8182cf
I0310 09:10:26.312609 140666048968448 file_utils.py:492] creating metadata file for /home/priya/.cache/torch/transformers/dd1588b85b6fdce1320e224d29ad062e97588e17326b9d05a0b29ee84b8f5f93.c81d4deb77aec08ce575b7a39a989a79dd54f321bfb82c2b54dd35f52f8182cf
I0310 09:10:26.313185 140666048968448 filelock.py:318] Lock 140662629535816 released on /home/priya/.cache/torch/transformers/dd1588b85b6fdce1320e224d29ad062e97588e17326b9d05a0b29ee84b8f5f93.c81d4deb77aec08ce575b7a39a989a79dd54f321bfb82c2b54dd35f52f8182cf.lock
I0310 09:10:26.313593 140666048968448 tokenization_utils.py:501] loading file https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model from

In [47]:
model_albert.train_model(train_df_clean, eval_df=eval_df_clean)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=6090), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

I0310 09:10:46.049408 140666048968448 run_manager.py:924] system metrics and metadata threads started
I0310 09:10:46.051529 140666048968448 run_manager.py:933] checking resume status, waiting at most 10 seconds
I0310 09:10:46.279228 140666048968448 run_manager.py:951] resuming run from id: UnVuOnYxOjgwdWpjNmVkOnZpc3VhbGl6YXRpb24tZGVtbzpwZHdpdmVkaQ==
I0310 09:10:46.318503 140666048968448 run_manager.py:963] upserting run before process can begin, waiting at most 10 seconds
I0310 09:10:46.540079 140661856663296 run_manager.py:1048] saving patches
I0310 09:10:46.541958 140661856663296 run_manager.py:1052] saving pip packages
I0310 09:10:46.545370 140661856663296 run_manager.py:1054] initializing streaming files api
I0310 09:10:46.548172 140661856663296 run_manager.py:1061] unblocking file change observer, beginning sync with W&B servers


HBox(children=(IntProgress(value=0, description='Current iteration', max=191, style=ProgressStyle(description_…

I0310 09:10:46.864837 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/config.yaml


Running loss: 0.685512

I0310 09:10:47.020050 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-history.jsonl
I0310 09:10:47.020910 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-events.jsonl
I0310 09:10:47.022298 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/requirements.txt
I0310 09:10:47.022899 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/media/graph/graph_0_summary_1a989aa0.graph.json
I0310 09:10:47.023462 140662192207616 run_manager.py:677] file/dir created: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-summary.json
I0310 09:10:47.024189 140662192207616 run_manager.py

Running loss: 0.630319

I0310 09:11:02.868142 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.762322

I0310 09:11:14.872592 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-events.jsonl


Running loss: 0.639655

I0310 09:11:18.873706 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.534089

I0310 09:11:21.874924 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-history.jsonl
I0310 09:11:21.876337 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-summary.json
I0310 09:11:28.121426 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 09:11:28.187113 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin


Running loss: 0.640330

I0310 09:11:28.876506 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-history.jsonl
I0310 09:11:28.877196 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-summary.json


Running loss: 0.572716

I0310 09:11:34.878118 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.528466

I0310 09:11:44.880729 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-events.jsonl


Running loss: 0.667771

I0310 09:11:50.882390 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.533343

I0310 09:11:59.244931 140666048968448 configuration_utils.py:118] Configuration saved in outputs/checkpoint-191-epoch-1/config.json
I0310 09:11:59.305748 140666048968448 modeling_utils.py:298] Model weights saved in outputs/checkpoint-191-epoch-1/pytorch_model.bin


Running loss: 0.479151

I0310 09:12:05.581230 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 09:12:05.614296 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin


HBox(children=(IntProgress(value=0, description='Current iteration', max=191, style=ProgressStyle(description_…

Running loss: 0.468530

I0310 09:12:06.886604 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.490144

I0310 09:12:08.887451 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-history.jsonl
I0310 09:12:08.889034 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-summary.json
I0310 09:12:14.889071 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-events.jsonl


Running loss: 0.407895

I0310 09:12:15.889326 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-history.jsonl
I0310 09:12:15.889835 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-summary.json


Running loss: 0.386610

I0310 09:12:22.891403 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.558779

I0310 09:12:38.895255 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.581347

I0310 09:12:44.896793 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-events.jsonl


Running loss: 0.494742

I0310 09:12:49.898380 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-history.jsonl
I0310 09:12:49.899195 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-summary.json
I0310 09:12:54.899547 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json
I0310 09:12:55.652584 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 09:12:55.684015 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin
I0310 09:12:55.899852 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-history.jsonl
I0310 09:12:55.9

Running loss: 0.468477

I0310 09:13:10.905192 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


Running loss: 0.547322

I0310 09:13:15.906450 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-events.jsonl


Running loss: 0.492901

I0310 09:13:23.634442 140666048968448 configuration_utils.py:118] Configuration saved in outputs/checkpoint-382-epoch-2/config.json
I0310 09:13:23.694975 140666048968448 modeling_utils.py:298] Model weights saved in outputs/checkpoint-382-epoch-2/pytorch_model.bin


Running loss: 0.522778

I0310 09:13:26.909488 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json
I0310 09:13:30.022613 140666048968448 configuration_utils.py:118] Configuration saved in outputs/best_model/config.json
I0310 09:13:30.060360 140666048968448 modeling_utils.py:298] Model weights saved in outputs/best_model/pytorch_model.bin
I0310 09:13:30.064007 140666048968448 configuration_utils.py:118] Configuration saved in outputs/config.json
I0310 09:13:30.125827 140666048968448 modeling_utils.py:298] Model weights saved in outputs/pytorch_model.bin
I0310 09:13:30.128537 140666048968448 run_manager.py:1068] shutting down system stats and metadata service


Training of albert model complete. Saved to outputs/.


I0310 09:13:30.911246 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-events.jsonl
I0310 09:13:31.232758 140666048968448 run_manager.py:1080] stopping streaming files and file change observer
I0310 09:13:31.912103 140662192207616 run_manager.py:688] file/dir modified: /home/priya/Documents/AI_Apps/simpletransformers_blog/wandb/run-20200310_131045-80ujc6ed/wandb-metadata.json


In [48]:
result, model_outputs, wrong_predictions = model_albert.eval_model(eval_df_clean, acc=sklearn.metrics.accuracy_score)

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=1523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

{'mcc': 0.5121674642936882, 'tp': 433, 'tn': 730, 'fp': 144, 'fn': 216, 'eval_loss': 0.5123823285102844, 'acc': 0.7636244254760342}


### Perform prediction - Test Set

In [53]:
test_df = pd.read_csv("./test.csv")

In [54]:
## Drop columns for keyword and location
columns_todrop = ['keyword','location']
test_df.drop(columns=columns_todrop,inplace=True)
### Change contractions
test_df['text']=test_df['text'].apply(remove_contractions)
## Clean Data set
test_df['text'] =test_df['text'].apply(clean_dataset)
test_df.head()

Unnamed: 0,id,text
0,0,happened terrible crash
1,2,Heard about earthquake different cities everyone
2,3,there forest geese fleeing across street cannot
3,9,Apocalypse lighting Spokane wildfires
4,11,Typhoon Soudelor kills China Taiwan


In [55]:
predictions, raw_outputs = model_Roberta.predict(test_df['text'])

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=3263), HTML(value='')))

HBox(children=(IntProgress(value=0, max=51), HTML(value='')))

In [58]:
test_df['target']=predictions
test_df.tail()

Unnamed: 0,id,text,target
3258,10861,EARTHQUAKE SAFETY ANGELES SAFETY FASTENERS,0
3259,10865,Storm worse hurricane city3others hardest look...,1
3260,10868,Green derailment Chicago,1
3261,10874,issues Hazardous Weather Outlook,1
3262,10875,CityofCalgary activated Municipal Emergency yy...,1


In [57]:
test_df['target'].value_counts()

0    2132
1    1131
Name: target, dtype: int64

### Perform predictions on random tweets

In [67]:
test_tweet1 = "#COVID19 will spread across U.S. in coming weeks. We’ll get past it, but must focus on limiting the epidemic, and preserving lif"
test_tweet1 = remove_contractions(test_tweet1)
test_tweet1 = clean_dataset(test_tweet1)

'COVID19 spread across coming weeks focus limiting epidemic preserving'

In [68]:
predictions, _ = model_Roberta.predict([test_tweet1])
response_dict = {0: 'Fake', 1: 'Real'}
print("Prediction is: ", response_dict[predictions[0]])

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Prediction is:  Real


In [70]:
test_tweet2 = "BREAKING: Confirmed flooding on NYSE. The trading floor is flooded under more than 3 feet of water."
test_tweet2 = remove_contractions(test_tweet2)
test_tweet2 = clean_dataset(test_tweet2)

'BREAKING Confirmed flooding trading floor flooded under water'

In [71]:
predictions, _ = model_Roberta.predict([test_tweet2])
response_dict = {0: 'Fake', 1: 'Real'}
print("Prediction is: ", response_dict[predictions[0]])

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Prediction is:  Real


In [80]:
test_tweet3 = "Everything is ABLAZE. Please run!!"
test_tweet3 = remove_contractions(test_tweet3)
test_tweet3 = clean_dataset(test_tweet3)

'Everything ABLAZE Please'

In [81]:
predictions, _ = model_Roberta.predict([test_tweet3])
response_dict = {0: 'Fake', 1: 'Real'}
print("Prediction is: ", response_dict[predictions[0]])

Converting to features started. Cache is not used.


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

Prediction is:  Fake
