In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
import gc
import time
import re
import torch
import transformers
from datasets import Dataset
from transformers import Trainer, BertForSequenceClassification, BertTokenizer

gc.collect()

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("checkpoint-15546")
trainer = Trainer(model=model,
                tokenizer=tokenizer)

In [15]:
print(transformers.__version__)

4.3.3


In [16]:
def predict_fast(text, model, tokenizer):  
    #tokenize the text
    start = time.time()
    if len(text.split()) >= 450:
        batches_no = len(text.split()) // 450 + 1
        for i in range(batches_no):
            text = ' '.join(text.split()[(i*450):(i+1)*450])
            encodings = tokenizer(text, 
                          max_length=512, 
                          truncation=True, 
                          padding=True,
                         return_tensors="pt")
            outputs = model(**encodings)
            preds = outputs.logits
            if int(preds.argmax(-1).numpy()[0]) == 1:
                print(1)
                print(time.time() - start)
                return 1, float(torch.nn.functional.softmax(preds, dim=1).detach().numpy()[0][1])
        print(0)
        print(time.time() - start)
        return 0, float(torch.nn.functional.softmax(preds, dim=1).detach().numpy()[0][1])
    else:
        encodings = tokenizer(text, 
                              max_length=512, 
                              truncation=True, 
                              padding=True,
                             return_tensors="pt")
        outputs = model(**encodings)
        preds = outputs.logits
        print(int(preds.argmax(-1).numpy()[0]))
        print(time.time() - start)
        return int(preds.argmax(-1).numpy()[0]), float(torch.nn.functional.softmax(preds, dim=1).detach().numpy()[0][1])

In [17]:
def basic_preprocess(text):
    text = re.sub(r'http\S+', ' ', text)
    return text

def predict_trainer(text):
    start = time.time()
    dataset = pd.DataFrame({'text': [basic_preprocess(text)]})
    custom_dataset = Dataset.from_pandas(dataset)
    tokenized_text = custom_dataset.map(lambda x: tokenizer(x['text'], truncation=True, max_length = 512), batched=True)
    tokenized_text = tokenized_text.remove_columns(['text'])

    trainer = Trainer(model=model,
                    tokenizer=tokenizer)
    output = np.argmax(trainer.predict(tokenized_text).predictions, axis=-1)
    print(output)
    print(time.time() - start)
    return output

def get_prediction1(values):
    start = time.time()
    values = basic_preprocess(values)
    length_of_text = len(values.split())
    result = []
    if length_of_text >= 450:
        decisions = []
        scores = []
        batches_no = length_of_text // 450 + 1
        for i in range(batches_no):
            text = ' '.join(values.split()[(i*450):(i+1)*450])
            dataset = pd.DataFrame({'text': [text]})
            custom_dataset = Dataset.from_pandas(dataset)
            tokenized_text = custom_dataset.map(lambda x: tokenizer(x['text'],truncation=True, padding="max_length",  max_length = 512), batched=True)
            tokenized_text = tokenized_text.remove_columns(['text'])
            trainer = Trainer(model=model,
                            tokenizer=tokenizer)
            dec = np.argmax(trainer.predict(tokenized_text).predictions, axis=-1)[0]
            decisions.append(dec)
            scores.append(tf.math.softmax(trainer.predict(tokenized_text).predictions, axis=-1).numpy()[0][dec])
        if 1 in decisions:
            decision = 1
            score = scores[decisions.index(1)]
        else:
            decision = 0
            score = scores[decisions.index(0)]
    else:
        dataset = pd.DataFrame({'text': [values]})
        custom_dataset = Dataset.from_pandas(dataset)
        tokenized_text = custom_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding="max_length", max_length = 512), batched=True)
        tokenized_text = tokenized_text.remove_columns(['text'])

        trainer = Trainer(model=model,
                        tokenizer=tokenizer)

        decision = np.argmax(trainer.predict(tokenized_text).predictions, axis=-1)[0]
        score = tf.math.softmax(trainer.predict(tokenized_text).predictions, axis=-1).numpy()[0][decision]
    print(decision)
    print(time.time() - start)
    return decision, score

def get_prediction2(values):
    start = time.time()
    values = basic_preprocess(values)
    length_of_text = len(values.split())
    if length_of_text >= 450:
        batches_no = length_of_text // 450 + 1
        for i in range(batches_no):
            text = ' '.join(values.split()[(i * 450):(i + 1) * 450])
            dataset = pd.DataFrame({'text': [text]})
            custom_dataset = Dataset.from_pandas(dataset)
            tokenized_text = custom_dataset.map(
                lambda x: tokenizer(x['text'], truncation=True, padding="max_length", max_length=512),
                batched=True)
            tokenized_text = tokenized_text.remove_columns(['text'])
            dec = np.argmax(trainer.predict(tokenized_text).predictions, axis=-1)[0]
            if int(dec) == 1:
                print(1)
                print(time.time() - start)
                return int(dec), float(
                    tf.math.softmax(trainer.predict(tokenized_text).predictions, axis=-1).numpy()[0][dec])
        print(0)
        print(time.time() - start)
        return int(dec), float(tf.math.softmax(trainer.predict(tokenized_text).predictions, axis=-1).numpy()[0][dec])
    else:
        dataset = pd.DataFrame({'text': [values]})
        custom_dataset = Dataset.from_pandas(dataset)
        tokenized_text = custom_dataset.map(
            lambda x: tokenizer(x['text'], truncation=True, padding="max_length", max_length=512), batched=True)
        tokenized_text = tokenized_text.remove_columns(['text'])

        decision = np.argmax(trainer.predict(tokenized_text).predictions, axis=-1)[0]
        score = tf.math.softmax(trainer.predict(tokenized_text).predictions, axis=-1).numpy()[0][decision]
        print(decision)
        print(time.time() - start)
        return int(decision), float(score)

In [18]:
values = ["Thank you, I also wish you the best too. That sounds like a great idea. I am willing to try anything. From memory my mum has a few oils, will give her a buzz and borrow some. I'm willing to try anything. I do breathing exercises which helps anxiety a bit but my depression isn't so easy today. Thank you again for your response, I really appreciate it. Feel like I'm a record on repeat, seems to happened every 6 months or so that depression gets the best of me. Was close to deleting this as I hadn't had responses. Thank you again.",
          "This has helped me a lot today, Thank you. I was diagnosed with anxiety and depression about 15 years ago. Today isn't a good day for me, one of the many thoughts in my head is that I should 'suck it up' and 'other people have it worse'. I know it won't last and then I'll feel silly/stupid for my mood. Want to give a shout out to my partner and family. They have gone through it all with me. Everyone deserves to be heard",
          "I have a feeling she is going to be getting a lot of attention today. Lots of hugs just to start with. Could watch her sleep all day. Thank you.",
          "Old photo found on my phone, our girl when she was a puppy. First day home.",
          "I have paranoia and depression. I have anxiety and I hate this life",
          "I go to therapy. I have paranoia and go to psychologist",
          "Give that wolf a banana",
          "I like Eurovision, but I am hurting myself and biting my nails because of frustration",
          "I am in the process of coming out of a two-year long battle with depression. This was extremely helpful to get because it made me realize the steps I am taking are visible.",
          "I was in a dark place and needed reassurance from my significant other that I was loved and wanted, and I then apologized for wanting and needing that. This was his response.",
          "I had an early morning panic attack and texted my friend at 3:30 a.m… she was beyond awesome and helped me out even though I woke her up",
          "Ukraine",
          "Stefania mama stefanie"]
for value in values:
    predict_fast(value, model, tokenizer)
    get_prediction1(value)
    get_prediction2(value)

1
0.382312536239624


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.090200901031494


  0%|          | 0/1 [00:00<?, ?ba/s]

1
5.906810283660889
1
0.3625314235687256


  0%|          | 0/1 [00:00<?, ?ba/s]

1
5.698418617248535


  0%|          | 0/1 [00:00<?, ?ba/s]

1
5.930055141448975
0
0.1484241485595703


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.182364463806152


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.163384437561035
0
0.10011172294616699


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.145933628082275


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.158972263336182
1
0.09308671951293945


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.377417087554932


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.220586776733398
1
0.10247182846069336


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.250955104827881


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.314693927764893
0
0.07953453063964844


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.384036540985107


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.128257513046265
1
0.0968019962310791


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.063665390014648


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.191826343536377
1
0.1441805362701416


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.354737997055054


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.147123336791992
0
0.1417405605316162


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.096701383590698


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.110196590423584
1
0.12630200386047363


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.437778472900391


  0%|          | 0/1 [00:00<?, ?ba/s]

1
6.36743950843811
0
0.08647418022155762


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.037394762039185


  0%|          | 0/1 [00:00<?, ?ba/s]

0
5.91011381149292
0
0.0801699161529541


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.316625356674194


  0%|          | 0/1 [00:00<?, ?ba/s]

0
6.070949554443359


In [5]:
values = ["         Let's get real: they are not gonna add them. They already said they don't like game ending/changing killstreaks :(",
          "         (A bit long for ELI5, so apologies) Each citizen needs 2 food to survive. If you have food yields (from buildings, working food tiles, or trade routes) greater than (2)(Population), the extra food gets added to your food basket each turn. Once this basket is filled, you get an extra citizen! This citizen can now work a new tile, giving you more food/production/gold, and the cycle continues. After each population growth, the food basket is empty and also gets a little bit bigger, which is why your cities grow much faster when small, and much slower when large. This also explains why some policies/buildings improve +growth, +excess food, or +food leftover (these are all different concepts). Your production is directly tied to your citizens (who work production tiles or act as specialists) and your buildings. If you have 1 citizen, and he works a mine for 3 production, each turn you can produce 3 hammers. If a building costs 30 hammers, it will take 10 turns. If your population grows to 2, and now you can work 2 mines, you produce 6 hammers per turn, and your building is now built in 5 turns. As you can easily see, the more citizens you have to provide production (either by working tiles or as specialists), the greater your production each turn is, and the faster you can build buildings. The actual formulas vary greatly based on the tiles you can work, the improvements your workers make, and the buildings in the city. An easy way to understand this mechanic is, when your city first grows to Population 2, open up your city management screen (by clicking on the city). Then click Citizen Management in the upper right corner, and move your citizen around to different tiles. You will see direct changes to however long the building in your production queue has left, and you will see changes to 'Turns to Next Citizen.' Now take what you know for how 1 citizen moved around changes this, and multiply it by your entire populace for later in the game. Each game has a trade-off decision you have to make. Typically 'production-focused' tiles like mines will help you build early buildings much faster, but your cities will grow slower (and thus you have fewer citizens to work tiles). Alternatively, you can work farms to grow your city quickly, but your buildings will take much longer to build (until you have enough citizens to work production tiles in addition to food tiles). Since population drives several other key resources (like gold from trade routes and science), you generally want to do everything possible to get your cities as large as your happiness allows. Typically your production keeps up as your cities grow (assuming you settled near a few hills or forests), but you may want to manually adjust your citizens every so often to make sure your production isn't too low. Alternatively, if your cities aren't growing (and you want them to), you may need to re-assign citizens to food tiles at the expense of production.",
    "    Update 3:18 AM, Big Ten logo is getting blued out     As title said, Big Ten logo is getting fucked up. See if you can do anything to help. Also, we're trying to build up a megathread at UIUC, found [here](https://www.reddit.com/r/UIUC/comments/6354dh/megathread_for_rplace/) if you want to help out",
         "    blender with rust source bindings     is it possible to develop addons in rustlang? and intergrate it with blender via python?",
         "    A quick question about the physics engine.     I searched around a bit, and wasn't able to find an answer to this question, though maybe I'm just bad at wording things in search engines. A while back, presumably after a past update, my physics started to get a bit weird. Everything seemed to lose most of its mass, to the point that I'm beginning to suspect that Appalachia is actually on Mars. Ragdolls, thrown grenades and mines, and basically anything else physics driven has become incredibly floaty, which wouldn't be a big deal other than that it messes up the timing and tragectory of my grenades. Is this a problem for everyone or is it just me? Is this Bethesda's problem or is there something I can do about it? For reference I'm playing on Xbox, and am up to date with all the latest updates.",
         "         World to me is scary. Expectations from parents and society are making me feel like there is no meaning to life. I am disconnected with everything with zero interests. Maybe it's quarter life crisis maybe depression. I am in a great conflict with stoicism and life in general.",
         "    Deja fucking vu     Didnt play poker for a week. Started to feel pretty good. Then I had the brilliant idea bc I got a little money to deposit $800 and play. Well I lost that in an hour and really didnt have a chance In hell to win. 2 outers, runner runner and a few other hands and there goes that. Im sick again. I dont want to have this feeling ever again. Im so done with this lifestyle and insanity. Its pure fucking evil and has to stop. There is not one thing good that comes from gambling. It makes me miserable, stressed, isolated, broke, scared, depressed, sad, angry. Its a brutal addiction and has really fucked my life up good. Im so fucking pissed at myself that I lost that money when money is so tight and Im barely getting by. -800$ in an hour. Really? Pure fucking stupidity. The whole time Im playing I know Im gonna lose bc its destiny I lose and lose and lose and lose and lose and lose some more so I get in enough pain to stop. When I stop for good thats when I finally win."]

def basic_preprocess(text):
    text = re.sub(r'http\S+', ' ', text)
    return text

dataset = pd.DataFrame({'text': [basic_preprocess(text) for text in values]})
custom_dataset = Dataset.from_pandas(dataset)
tokenized_text = custom_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding="max_length", max_length = 512), batched=True)
tokenized_text = tokenized_text.remove_columns(['text'])

trainer = Trainer(model=model,
                tokenizer=tokenizer)
    
print(np.argmax(trainer.predict(tokenized_text).predictions, axis=-1))

  0%|          | 0/1 [00:00<?, ?ba/s]

[0 0 0 0 0 1 0]


### Get prediction method

In [147]:
def get_prediction(values):
    values = basic_preprocess(values)
    length_of_text = len(values.split())
    result = []
    if length_of_text >= 400:
        decisions = []
        scores = []
        batches_no = length_of_text // 400 + 1
        for i in range(batches_no):
            text = ' '.join(values.split()[(i*400):(i+1)*400])
            dataset = pd.DataFrame({'text': [text]})
            custom_dataset = Dataset.from_pandas(dataset)
            tokenized_text = custom_dataset.map(lambda x: tokenizer(x['text'], max_length = 512), batched=True)
            tokenized_text = tokenized_text.remove_columns(['text'])
            trainer = Trainer(model=model,
                            tokenizer=tokenizer)
            dec = np.argmax(trainer.predict(tokenized_text).predictions, axis=-1)[0]
            decisions.append(dec)
            scores.append(tf.math.softmax(trainer.predict(tokenized_text).predictions, axis=-1).numpy()[0][dec])
        if 1 in decisions:
            decision = 1
            score = scores[decisions.index(1)]
        else:
            decision = 0
            score = scores[decisions.index(0)]
    else:
        dataset = pd.DataFrame({'text': [values]})
        custom_dataset = Dataset.from_pandas(dataset)
        tokenized_text = custom_dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding="max_length", max_length = 512), batched=True)
        tokenized_text = tokenized_text.remove_columns(['text'])

        trainer = Trainer(model=model,
                        tokenizer=tokenizer)

        decision = np.argmax(trainer.predict(tokenized_text).predictions, axis=-1)[0]
        score = tf.math.softmax(trainer.predict(tokenized_text).predictions, axis=-1).numpy()[0][decision]
    return decision, score

In [148]:
data_round1 = [{"redditor": 338, "content": "    Getting Sober Didn't Lift my Depression. (18 months sober", "date": "2014-12-12T04:21:13.000+0000", "id": 168996, "title": "    Copy the Reindeer", "number": 1, "nick": "subject8081"},
    {"redditor": 339, "content": "    This post is all over the place, but I felt like sharing my story others can relate and no that they really are not alone! Before I start: Please don't get me wrong and think I come from some rich family and my parents are Doctors and giving me thousands in allowance, that is now true. I work hard for every dollar I make! I am thankful that I found and still have a very well paying job and it will take me a year to get back to where I was when I started gambling, but the pain is strong just like someone who lost it all with much less saved up. I just turned 21 and I am just hit rock bottom, or so I thought I did. Cash Advance - the worse feature of a credit card for a gambler. You see I thought once I hit $0 in the bank last week that would be all, I walked out of the casino in great spirits and ready to start a new chapter! Most people would be ready to kill themselves but I was the opposite, I was so happy and felt like some kind of burden was taken off my shoulders and as I was driving home I felt like a million bucks, even though I had $0 to my name! This high lasted a few days until I got paid and that urge came back to go, which I did....I went with $4,000 and at what point was up $8,000! Of course as we all know ''the house always wins'' and I ended up losing all of that $4,000 and then proceeded to take out $3,000 in cash advance...as you know I lost that , too. Before gambling I had saved over $60,000 was living fine , life was moving. I got a nice car, which thankfully I still have. Money was just a number like the numbers on the clock. I was thinking of moving out, getting a nice place that would cost $3,000 a month, get nice furniture have a game room, you get the point, living the good life. I was always looking for more ways to make money and gambling never crossed my mind. I have friends who always asked me to go to the casino with them, but it never interested me for some odd reason. One day I was bored and in the area of the casino.... A big mistake I made that day: I walked into the casino, something I wish to this day I would have never done. But what can a $500 loss do already? I HAD so much saved up! As you can imagine, I went in and lost that $500, what repeated after was relatable to anyone who is struggling with gambling, I wasted over $2k that day but that didn't phase me at all, I left and went to my hotel. Next morning comes and I just have this crazy urge to go back I give in go back and lose another $2k. Now for about a week I was fine as I was so occupied with work after vacation that i didn't even remember #160;about the losses but then the following weekend I went again and lost $5k in a single night. Now keep in mind you generally do not lose $1k at one time, it is over the course of a few machines. Going once a week led to going 5-7 times a week losing $500-$1k each time! I did enjoy myself there meeting a whole bunch of interesting characters that wasted even more money than I have but as you leave reality sets back in to place. There is no concept of money once you step food into a casino, any sense of value is outside those casino doors. I refused to buy myself a pair of shoes because they were $5 more than last time, yet I go to the casino and gamble 10x that! Around the time I started going to the casino I was in a relationship, things were rocky and we ended up breaking it up. I was devastated and shifted my focus to the casino. The timing of me going to the casino to which I had never gone before, and breaking up a month later worsened the situation a lot. I am not one to blame anyone or thing but myself! I am grown, and make my own decisions, I chose to let it get out of control and now I am suffering from this depression. I used to be the one helping other and now I have to now borrow $400 to pay a bill off that way I do not get charge interest for a late fee, how the wheel has turned. It makes you feel dead inside, you lose all life to you. Depression comes and goes, but when it is here, it hits hard! My biggest issue is that I am afraid of myself, I lose total control and go crazy! I have lost 4k in a matter of 2 hours! That is insane! I have told myself so many time that I am done, but I kept going back. At this point I have to pay off the 3k for the Cash Advance I took out yesterday. Learning the value of money is hard. I have trouble falling asleep, I wake up sad. I push through the day, sad and grumpy, and get my work done. I am just existing at this point, the spark has left and is waiting to be reignited. It will take 2-3 months for me to look at my bank and smile again, but I did this to myself and am now paying the consequence. I feel like writing this out helps me a bit and I truly hope someone will see this and know we can get through this. Has anyone else hit rock bottom and felt great, then a few days later it hit you like a ton of bricks? Any advice for me? Thanks in advance!", "date": "2013-10-10T13:17:01.000+0000", "id": 169297, "title": "", "number": 1, "nick": "subject2621"},
    {"redditor": 340, "content": "    I have a question about being a visitor in Nioh(Random encounters)", "date": "2017-05-09T17:01:50.000+0000", "id": 169531, "title": "    Nioh - Become a visitor", "number": 1, "nick": "subject992"}]

In [149]:
########### Should be modified

for user in data_round1:
    decision, score = get_prediction(user['content'])
    print(decision, score)


  0%|          | 0/1 [00:00<?, ?ba/s]

1 0.9821879


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

[0, 0, 1]
1 0.9976046


  0%|          | 0/1 [00:00<?, ?ba/s]

0 0.9991805
