In [1]:
import pandas as pd
import fasttext
import preprocessor

#### load labeled data

In [2]:
# Load the pipe-delimited tweet table; the 'id' column becomes the index.
y_df = pd.read_csv(
    'tweets_labeled.csv',
    sep="|",
    index_col='id',
)

#### create text file (in fastText format) with labeled tweets

In [3]:
# Keep only the text and label columns and drop rows whose label is missing.
l_df = y_df[['tweet', 'label']].dropna(subset=['label'])
# Number of labeled tweets (shown as the cell output).
l_df.shape[0]

1298

In [4]:
label_prefix = '__label__'
with open('labeled_tweets.txt', 'w') as file:
    for index, row in l_df.iterrows():
        file.write(label_prefix + row['label'] + ' ')
        file.write(row['tweet'].replace('\n', ' '))
        file.write('\n')

with open('l_ids.txt', 'w') as file:
    for index, row in l_df.iterrows():
        file.write(str(index))
        file.write('\n')

#### create text file with tweets without labels

In [5]:
# Write every tweet of y_df, one per line and without a label prefix, plus a
# parallel ids file: line i of 'u_ids.txt' is the id of the tweet on line i of
# 'unlabeled_tweets.txt'.
# NOTE(review): this loops over y_df, i.e. all rows, not only the rows whose
# label is missing -- confirm that this is what "without labels" should cover.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale encoding.
with open('unlabeled_tweets.txt', 'w', encoding='utf-8') as tweets_file, \
        open('u_ids.txt', 'w', encoding='utf-8') as ids_file:
    for tweet_id, tweet in zip(y_df.index, y_df['tweet']):
        # Flatten embedded newlines so each tweet occupies exactly one line.
        tweets_file.write(tweet.replace('\n', ' ') + '\n')
        ids_file.write(f'{tweet_id}\n')

#### train model on all labeled data

In [6]:
 # Fit a supervised fastText classifier on the full labeled file, using
 # word bigrams (wordNgrams=2) in addition to single words.
 model = fasttext.train_supervised(
     input="labeled_tweets.txt",
     epoch=25,
     lr=1.0,
     wordNgrams=2,
 )

#### get example unlabeled tweets and predict their types

In [7]:
# Pick four consecutive tweets (positions 2000-2003 of y_df) to use as
# prediction examples in the cells below.
tweet1, tweet2, tweet3, tweet4 = (
    y_df['tweet'].iloc[position] for position in range(2000, 2004)
)

# Echo each example followed by a blank line.
for example in (tweet1, tweet2, tweet3, tweet4):
    print(example, end='\n\n')

#AlertMPK - #TRAM Brak przejazdu ul. Nowowiejska/Jedności Narodowej - wypadek bez udziału pojazdów MPK. Tramwaje linii 0L, 0P, 1 skierowano przez ul. Słowiańską, Jedności Narodowej, pl. Bema, Sienkiewicza, Piastowską w obu kierunkach. Uruchomiono autobusy "za tramwaj".

#AlertMPK ul. Olszewskiego - ruch przywrócony. Tramwaje wracają na swoje stałe trasy.

#AlertMPK - #TRAM Brak przejazdu ul. Olszewskiego/Kopernika - samochód na torowisku. Tramwaje linii 1, 2, 4, 10 skierowano objazdem w obu kierunkach przez ul. Mickiewicza do Sępolna.

#AlertMPK Brak przejazdu ul. Skargi w kieunku Dworca Głównego - awaria tramwaju.Tramwaje linii 2, 5, 8, 9, 11 skierowano objazdem.



In [8]:
# Five most probable labels (with probabilities) for the first example tweet.
# predict() handles one line at a time and raises on text containing '\n', so
# newlines are flattened exactly as they were when the training file was written.
model.predict(tweet1.replace('\n', ' '), k=5)

(('__label__accident',
  '__label__event',
  '__label__incident',
  '__label__renovation',
  '__label__unknown'),
 array([9.97058928e-01, 2.73103849e-03, 1.71738837e-04, 6.09329109e-05,
        2.57272513e-05]))

In [9]:
# Five most probable labels (with probabilities) for the second example tweet.
# predict() handles one line at a time and raises on text containing '\n', so
# newlines are flattened exactly as they were when the training file was written.
model.predict(tweet2.replace('\n', ' '), k=5)

(('__label__fix',
  '__label__unknown',
  '__label__change',
  '__label__malfunction',
  '__label__jam'),
 array([9.99745071e-01, 2.62494781e-04, 2.00289323e-05, 1.21320918e-05,
        1.02652139e-05]))

In [10]:
# Five most probable labels (with probabilities) for the third example tweet.
# predict() handles one line at a time and raises on text containing '\n', so
# newlines are flattened exactly as they were when the training file was written.
model.predict(tweet3.replace('\n', ' '), k=5)

(('__label__incident',
  '__label__accident',
  '__label__unknown',
  '__label__renovation',
  '__label__malfunction'),
 array([0.84509426, 0.1113437 , 0.02638631, 0.00989922, 0.00391092]))

In [11]:
# Five most probable labels (with probabilities) for the fourth example tweet.
# predict() handles one line at a time and raises on text containing '\n', so
# newlines are flattened exactly as they were when the training file was written.
model.predict(tweet4.replace('\n', ' '), k=5)

(('__label__malfunction',
  '__label__unknown',
  '__label__fix',
  '__label__incident',
  '__label__accident'),
 array([8.24357092e-01, 1.73900202e-01, 9.03649081e-04, 8.15173029e-04,
        4.44189282e-05]))

In [12]:
# Classify a tweet given directly as a string literal and show the four most
# probable labels with their probabilities.
sample_text = '#AlertMPK ul. Legnickiego - tramwaje znów ruszyły po swoich stałych trasach.'
model.predict(sample_text, k=4)

(('__label__renovation',
  '__label__unknown',
  '__label__fix',
  '__label__change'),
 array([0.36902174, 0.19149745, 0.17977308, 0.12607729]))

#### divide data into train and test datasets and validate predictions

In [16]:
# Train a second classifier from 'tweets.train' so it can be scored on a
# separate file.
# NOTE(review): no cell shown in this notebook writes 'tweets.train' or
# 'tweets.test' (the execution counts also jump from 12 to 16), so both files
# must already exist on disk for this cell and the ones below it to run.
m = fasttext.train_supervised(
    input="tweets.train",
    epoch=25,
    lr=3.5,
    wordNgrams=2,
)

In [17]:
# Score m on 'tweets.test'; the result is a tuple of
# (number of samples, precision at 1, recall at 1).
m.test(path="tweets.test")

(274, 0.9087591240875912, 0.9087591240875912)

#### autotune hyperparameters

In [18]:
# Let fastText search for hyperparameters for up to 100 seconds, scoring
# candidate settings on the validation file.
# NOTE(review): the validation file is the same 'tweets.test' that the next
# cell evaluates on, so that score is measured on data the tuning already
# saw -- consider a separate validation split.
m2 = fasttext.train_supervised(
    input="tweets.train",
    autotuneValidationFile='tweets.test',
    autotuneDuration=100,
)

In [19]:
# Score the autotuned model on 'tweets.test'; the result is a tuple of
# (number of samples, precision at 1, recall at 1).
m2.test(path="tweets.test")

(274, 0.927007299270073, 0.927007299270073)

#### save model

In [22]:
# Persist the autotuned model to disk.
# NOTE(review): '.ftz' is the extension the fastText docs use for quantized
# models and no quantize() call is visible here -- confirm the extension.
m2.save_model(path="autotuned_model.ftz")

In [23]:
# Report the hyperparameters stored on the autotuned model object.
# NOTE(review): the values recorded in this cell's output (0.1 / 5 / 1) equal
# fastText's supervised defaults -- confirm that these attributes reflect the
# autotuned settings in the installed fastText version.
print(f'Learning rate: {m2.lr}')
print(f'Epoch: {m2.epoch}')
# Label typo fixed: was 'WorNgrams'.
print(f'WordNgrams: {m2.wordNgrams}')

Learning rate: 0.1
Epoch: 5
WorNgrams: 1
