In [17]:
import fasttext
import pandas as pd
import swifter

  import pandas.util.testing as tm


In [17]:
# Load the training and evaluation CSVs; the first row of each file is the header.
senti_train, senti_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        "./sentiment_dataset/sst2_train_80.csv",
        "./sentiment_dataset/sst2_eval.csv",
    )
)

In [18]:
# Preview the first rows of the training split (columns: labels, text).
senti_train.head()

Unnamed: 0,labels,text
0,1,are until the film is well under way and yet i...
1,0,every cliche in the war movie compendium acros...
2,1,yields surprises
3,0,the movie has a script by paul pender made of ...
4,0,in the wrong hands ie peploe s it s simply unb...


In [19]:
# Preview the first rows of the evaluation split (columns: labels, text).
senti_test.head()

Unnamed: 0,labels,text
0,1,what s next
1,1,to amuse even the most resolutely unreligious ...
2,1,take any yearold boy to see this picture
3,1,is such highwattage brainpower coupled with p...
4,1,are an immensely appealing couple


In [26]:
def _write_fasttext_file(frame, path, label_names=None):
    """Write ``frame`` as a fastText supervised-training file.

    Each output line has the form ``__label__<name> <text>``.

    The file is written by hand instead of ``DataFrame.to_csv(sep=' ')``
    because the CSV writer quotes every field that contains the separator,
    which would wrap nearly every sentence in double quotes and leak the
    quote characters into the first and last token of each example.

    Args:
        frame: DataFrame with a ``labels`` column and a ``text`` column.
        path: destination file path.
        label_names: optional mapping from raw label value to label name.
            Defaults to ``{0: "negative", 1: "positive"}``.
            NOTE(review): assumes 1 = positive and 0 = negative, which matches
            the ``__label__positive`` / ``__label__negative`` labels the
            trained model emits -- confirm against the dataset.
    """
    if label_names is None:
        label_names = {0: "negative", 1: "positive"}
    with open(path, "w", encoding="utf-8") as handle:
        for label, text in zip(frame["labels"], frame["text"]):
            label = str(label_names.get(label, label))
            # Labels that were already converted upstream are kept as they are.
            if not label.startswith("__label__"):
                label = "__label__" + label
            # fastText reads one example per line, so collapse any embedded
            # newlines/tabs into single spaces; missing text becomes empty.
            text = "" if pd.isna(text) else " ".join(str(text).split())
            handle.write(label + " " + text + "\n")


_write_fasttext_file(senti_train, "./sentiment_dataset/senti_train.txt")
_write_fasttext_file(senti_test, "./sentiment_dataset/senti_test.txt")

In [27]:
# Train the supervised classifier and let fastText autotune its
# hyper-parameters against the validation file.
# NOTE(review): the validation file is the same senti_test.txt that is later
# passed to senti_model.test(), so that score is measured on data the tuning
# has already seen -- consider a separate held-out validation file.
senti_model = fasttext.train_supervised(
    input="./sentiment_dataset/senti_train.txt",
    autotuneValidationFile='./sentiment_dataset/senti_test.txt',
)

In [28]:
# Evaluate the model on the evaluation file with k=1.
# Per the fastText Python API the returned tuple is
# (number of examples, precision@k, recall@k).
# NOTE(review): this is the same file given as autotuneValidationFile during
# training, so the reported score is likely optimistic.
senti_model.test("./sentiment_dataset/senti_test.txt", k=1)

(13179, 0.9183549586463313, 0.9183549586463313)

In [34]:
# Persist the trained model to 'sentiment.bin' in the current working directory.
senti_model.save_model('sentiment.bin')

In [33]:
# Usage example: predict() takes a list of strings; with k=2 it returns, for
# each input, both labels (best first) and their matching scores.
senti_model.predict(['I love you!', 'I hate you!'], k=2)

([['__label__positive', '__label__negative'],
  ['__label__negative', '__label__positive']],
 [array([1.0000093e+00, 1.0700522e-05], dtype=float32),
  array([0.67083234, 0.32918763], dtype=float32)])

In [3]:
# Load the previously saved model from disk, so this part of the notebook can
# run without retraining.
senti_model = fasttext.load_model('./sentiment.bin')

In [4]:
# Usage example with the loaded model: predict() takes a list of strings; with
# k=2 it returns, for each input, both labels (best first) and their scores.
senti_model.predict(['I love you!', 'I hate you!'], k=2)

([['__label__positive', '__label__negative'],
  ['__label__negative', '__label__positive']],
 [array([1.0000093e+00, 1.0700522e-05], dtype=float32),
  array([0.67083234, 0.32918763], dtype=float32)])

In [5]:
# Load the posts to be scored; createdAt/updatedAt are parsed as datetimes.
df_posts = pd.read_csv("./rb_data/all_posts.csv", parse_dates=['createdAt', 'updatedAt'], header=0)

In [12]:
# Filter out rows with a missing author or missing text. The explicit .copy()
# makes the result an independent frame, so the column assignment below does
# not raise SettingWithCopyWarning.
df_posts = df_posts.dropna(subset=['AuthorId', 'text']).copy()
# AuthorId is read as float because of the missing values; store it as int.
df_posts['AuthorId'] = df_posts['AuthorId'].astype('int64')

# pick the columns of our interests
sel_cols = ['text', 'tags', 'createdAt', 'updatedAt', 'AuthorId']
df_posts = df_posts[sel_cols]
df_posts.head()

Unnamed: 0,text,tags,createdAt,updatedAt,AuthorId
0,I think that we should move away from nuclear ...,"{nuclear,energy,enviroment}",2020-09-03 01:59:10.051000+00:00,2020-09-04 13:08:51.133000+00:00,2580
1,"Given the security put in place since 911, it ...",{TerroristAttack},2020-09-03 02:28:29.002000+00:00,2020-09-04 13:08:51.133000+00:00,2593
3,It is crucial that the United States plays a r...,{Worldaffairs},2020-09-03 02:34:07.312000+00:00,2020-09-04 13:08:51.133000+00:00,2593
4,Strongly disapprove of a two percent annual ta...,{WealthTax},2020-09-03 02:36:27.825000+00:00,2020-09-04 13:08:51.133000+00:00,2593
5,The U.S. government has gone too far in respon...,"{COVID,pandemic}",2020-09-03 02:40:41.623000+00:00,2020-09-04 13:08:51.133000+00:00,2679


In [15]:
def preprocess_text(text):
    """Normalize a raw post into letters-and-spaces text for the classifier.

    Steps, in order:
      1. remove URLs (``www.…``, ``http://…``, ``https://…``),
      2. remove ``@username`` mentions,
      3. delete every ``string.punctuation`` character (this also drops the
         ``#`` of a hashtag while keeping the tag text),
      4. replace each run of remaining non-letter characters (digits,
         whitespace, non-ASCII) with a single space.

    URLs and mentions must be removed *before* punctuation is stripped:
    once ``.``, ``:``, ``/`` and ``@`` are gone the patterns can no longer
    recognise them, and e.g. ``www.example.com`` would survive as
    ``wwwexamplecom``.

    NOTE(review): case is preserved here, while the training sentences shown
    earlier in the notebook are all lowercase -- confirm whether the text
    should be lowercased before prediction.

    Args:
        text: raw post text (``str``).

    Returns:
        The cleaned string.
    """
    import re
    import string

    # remove URLs
    cleaned = re.sub(r'(www\.\S+)|(https?://\S+)', '', text)
    # catch any remaining token that starts with "http" (e.g. a malformed URL)
    cleaned = re.sub(r'http\S+', '', cleaned)
    # remove usernames
    cleaned = re.sub(r'@\S+', '', cleaned)
    # delete punctuation; "#hashtag" becomes "hashtag"
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    # collapse digits, whitespace and any other non-letter run into one space
    cleaned = re.sub(r'[^a-zA-Z]+', ' ', cleaned)
    return cleaned

In [18]:
# Clean every post with preprocess_text through swifter's apply and store the
# result in a new 'norm_text' column.
df_posts['norm_text'] = df_posts['text'].swifter.allow_dask_on_strings().apply(preprocess_text)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1057.0, style=ProgressStyle(descriptio…




In [20]:
# Spot-check the first three cleaned posts.
df_posts['norm_text'].to_list()[:3]

['I think that we should move away from nuclear energy as an energy sourcethe harm that it does to the environment is damaging and we have to go towards a less harmful energy source',
 'Given the security put in place since it is unlikely that the United States will experience a major terrorist attack that will kill more than American citizens ',
 'It is crucial that the United States plays a role in world affairs moving forward It is best for our country ']

In [26]:
# Use the fastText classifier to predict the single best label (k=1) for
# every cleaned post in one batch call.
results = senti_model.predict(df_posts['norm_text'].to_list(), k=1)

In [27]:
# results is a tuple: [0] holds the predicted labels (one single-element list
# per post, because k=1) and [1] holds the matching prediction scores.
results[0][:3]

[['__label__positive'], ['__label__negative'], ['__label__positive']]

In [29]:
# Take the top label of each post and strip the "__label__" prefix so only
# the bare class name ("positive"/"negative") remains.
predict_labels = [label[0].replace("__label__", "") for label in results[0]]
predict_labels[:3]

['positive', 'negative', 'positive']

In [31]:
# Take the score of the top label for each post.
predict_scores = [score[0] for score in results[1]]
predict_scores[:3]

[0.64712965, 0.70748234, 0.97706586]

In [32]:
# Attach the predictions to the posts; both lists are in the same row order
# as df_posts because they were produced from df_posts['norm_text'].
df_posts['labels'] = predict_labels
df_posts['scores'] = predict_scores
df_posts.head(3)

Unnamed: 0,text,tags,createdAt,updatedAt,AuthorId,norm_text,labels,scores
0,I think that we should move away from nuclear ...,"{nuclear,energy,enviroment}",2020-09-03 01:59:10.051000+00:00,2020-09-04 13:08:51.133000+00:00,2580,I think that we should move away from nuclear ...,positive,0.64713
1,"Given the security put in place since 911, it ...",{TerroristAttack},2020-09-03 02:28:29.002000+00:00,2020-09-04 13:08:51.133000+00:00,2593,Given the security put in place since it is un...,negative,0.707482
3,It is crucial that the United States plays a r...,{Worldaffairs},2020-09-03 02:34:07.312000+00:00,2020-09-04 13:08:51.133000+00:00,2593,It is crucial that the United States plays a r...,positive,0.977066
