In [1]:
import pandas as pd
import gensim.models as gsm
import phrase2vec as p2v
from utils import create_tweet_vectors, create_emoji_tweets, create_emoji_sentiment

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Nacitanie dat a embeddingov
- E2V data
- emocontext data
- anasent data
- NLP modely

### 1. E2V dataset

In [2]:
# tweets = pd.read_csv('D:\Downloads\DP\data\e2v_data.csv')
tweets = pd.read_csv('D:\Downloads\DP\data\e2v_data_emoji.csv')

### 2. Emocontext dataset

In [31]:
# tweets = pd.read_csv('D:\Downloads\DP\data\emocontext.csv')
tweets = pd.read_csv('D:\Downloads\DP\data\emocontext_emoji.csv')

# tweets_train = pd.read_csv('D:/Downloads/DP/data/emocontext.csv')
# tweets_dev = pd.read_csv('D:/Downloads/DP/data/emocontext_dev.csv')
# tweets_test = pd.read_csv('D:/Downloads/DP/data/emocontext_test.csv')

# tweets = pd.concat([tweets_train,tweets_dev,tweets_test])
# tweets = tweets.reset_index(drop=True)

### 3. Anasent dataset (SK)

In [2]:
# tweets = pd.read_csv('D:/Downloads/DP/data/anasent.csv')
tweets = pd.read_csv('D:/Downloads/DP/data/anasent_emoji.csv')

In [3]:
len(tweets)

84321

In [None]:
# vektory pre anglictinu

phrase_model_en = p2v.Phrase2Vec(
    300,
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_en.vec', binary=False),
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\emoji2vec300.bin', binary=True)
)

phrase_model_no_e2v_en = p2v.Phrase2Vec(
    300,
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_en.vec', binary=False)
)

In [4]:
# vektory pre slovencinu

phrase_model_sk = p2v.Phrase2Vec(
    300,
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_sk.vec', binary=False),
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\emoji2vec300.bin', binary=True)
)

phrase_model_no_e2v_sk = p2v.Phrase2Vec(
    300,
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_sk.vec', binary=False)
)

18512 | INFO | loading projection weights from D:\Downloads\DP\word2vec300_sk.vec
18512 | INFO | loaded (2000000, 300) matrix from D:\Downloads\DP\word2vec300_sk.vec
18512 | INFO | loading projection weights from D:\Downloads\DP\emoji2vec300.bin
18512 | INFO | loaded (1661, 300) matrix from D:\Downloads\DP\emoji2vec300.bin
18512 | INFO | loading projection weights from D:\Downloads\DP\word2vec300_sk.vec
18512 | INFO | loaded (2000000, 300) matrix from D:\Downloads\DP\word2vec300_sk.vec


In [30]:
# vytvorenie subdatasetu s emotikonmi

# create_emoji_tweets(tweets, phrase_model_en.emojiVecModel, 'D:\Downloads\DP\data\e2v_data_emoji.csv')
# create_emoji_tweets(tweets, phrase_model_en.emojiVecModel, 'D:\Downloads\DP\data\emocontext_emoji.csv')

## Priprava dat
- rozdelenie dat v zadanom pomere na podmnoziny, s rozdelenim labelov (predikovana hodnota)

In [52]:
# pri EMOCONTEXT su pouzite niektore nove emotikony -> nahradime
emo_mapping = {
    '🙁':'😕',
    '🤣':'😂',
    '🤐':'😬',
    '🙄':'😏',
    '🍾':'🍹',
    '🤗':'☺',
    '🤔':'😏',
    '🤡':'🃏',
    '🤑':'💰',
    '\u200d':'',
    '🤥':'😢',
    '🤷':'💁',
}

for i, j in emo_mapping.items():
    tweets['Text'] = tweets['Text'].str.replace(i, j)

In [5]:
tweets = create_emoji_sentiment(tweets, phrase_model_sk.emojiVecModel)

In [None]:
# anglictina

tweets = tweets.sample(frac=1).reset_index(drop=True)

train_x, train_y, valid_x, valid_y, test_x, test_y = create_tweet_vectors(tweets, phrase_model_no_e2v_en, 0.7, False)
train_x_e2v, _, valid_x_e2v, _, test_x_e2v, _ = create_tweet_vectors(tweets, phrase_model_en, 0.7, False)

train_x_sen, _, valid_x_sen, _, test_x_sen, _ = create_tweet_vectors(tweets, phrase_model_no_e2v_en, 0.7, True)
train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y = create_tweet_vectors(tweets, phrase_model_en, 0.7, True)

In [6]:
# slovencina

tweets = tweets.sample(frac=1).reset_index(drop=True)

train_x, train_y, valid_x, valid_y, test_x, test_y = create_tweet_vectors(tweets, phrase_model_no_e2v_sk, 0.7, False)
train_x_e2v, _, valid_x_e2v, _, test_x_e2v, _ = create_tweet_vectors(tweets, phrase_model_sk, 0.7, False)

train_x_sen, _, valid_x_sen, _, test_x_sen, _ = create_tweet_vectors(tweets, phrase_model_no_e2v_sk, 0.7, True)
train_x_e2v_sen, _, valid_x_e2v_sen, _, test_x_e2v_sen, _ = create_tweet_vectors(tweets, phrase_model_sk, 0.7, True)

## Vader - ziskanie baseline-u
- vhodne len pre e2v dataset, kvoli rozdeleniu tried

In [7]:
vader = SentimentIntensityAnalyzer()

In [8]:
tweets['Lex'] = tweets['Text'].apply(lambda x: vader.polarity_scores(x)['compound'])
tweets['Lex_label'] = tweets['Lex'].apply(lambda x: 'Positive' if x > 0.05 else ('Neutral' if x > -0.05 else 'Negative')) 
# hranice 0.05 su dane od autora

In [9]:
metrics.accuracy_score(tweets['Label'], tweets['Lex_label'])

0.4498642093903061

In [10]:
metrics.f1_score(tweets['Label'], tweets['Lex_label'], average='micro')

0.4498642093903061

## Predikcie (NB, RF, GBT)
- overenie uspesnosti modelov:
    - model (w2v)
    - model (w2v, e2v) 
    - model (w2v, senti)
    - model (w2v, e2v, senti)

In [7]:
def experiment_classic(model, train_x, train_y, test_x, test_y, text):
    model.fit(train_x, train_y)
    predict = model.predict(test_x)
    acc = metrics.accuracy_score(test_y, predict)
    f1 = metrics.f1_score(test_y, predict, average='micro')
    print(text, ":   acc: ", acc," F1: ", f1)

In [8]:
def experiment(model, params, train_x, train_y, valid_x, valid_y, test_x, test_y, text):
    rand_optim = RandomizedSearchCV(model, param_distributions=params, 
                                          cv=5, random_state=42, n_jobs=-1, n_iter = 20, scoring='f1_micro')
    rand_optim.fit(valid_x, valid_y)
    cls = rand_optim.best_estimator_
    cls.fit(train_x, train_y)
    predict = cls.predict(test_x)
    acc = metrics.accuracy_score(test_y, predict)
    f1 = metrics.f1_score(test_y, predict, average='micro')
    print(text, ": acc: ", acc," F1: ", f1)

In [13]:
experiment_classic(GaussianNB(), train_x, train_y, test_x, test_y, "NB(.......)")
experiment_classic(GaussianNB(), train_x_e2v, train_y, test_x_e2v, test_y, "NB(e2v....)")
experiment_classic(GaussianNB(), train_x_sen, train_y, test_x_sen, test_y, "NB(sen....)")
experiment_classic(GaussianNB(), train_x_e2v_sen, train_y, test_x_e2v_sen, test_y, "NB(e2v+sen)")

NB(.......) :   acc:  0.4869950193691201  F1:  0.4869950193691201
NB(e2v....) :   acc:  0.5383034231955095  F1:  0.5383034231955095
NB(sen....) :   acc:  0.49055261285477114  F1:  0.49055261285477114
NB(e2v+sen) :   acc:  0.5406751521859435  F1:  0.5406751521859435


In [14]:
params = {
    'max_depth': stats.randint(1,15),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': stats.randint(1,4),
    'min_samples_split': stats.randint(2,5),
    'n_estimators': [20,60,100,150]
}
experiment(RandomForestClassifier(random_state=0), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "RF(.......)")
experiment(RandomForestClassifier(random_state=0), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "RF(e2v....)")
experiment(RandomForestClassifier(random_state=0), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "RF(sen....)")
experiment(RandomForestClassifier(random_state=0), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "RF(e2v+sen)")

RF(.......) : acc:  0.572377263024745  F1:  0.572377263024745
RF(e2v....) : acc:  0.6494584552138509  F1:  0.6494584552138509
RF(sen....) : acc:  0.6461380346272433  F1:  0.6461380346272433
RF(e2v+sen) : acc:  0.683295122144043  F1:  0.683295122144043


In [None]:
params = {
    'max_depth': stats.randint(1,15),
    'learning_rate': [0.1,0.125,0.15,0.175,0.2,0.225,0.25],
    'min_samples_leaf': stats.randint(1,5),
    'min_samples_split' : stats.randint(2,10),
    'n_estimators' : [20,60,100,150]
}
experiment(GradientBoostingClassifier(), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "GBT(.......)")
experiment(GradientBoostingClassifier(), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "GBT(e2v....)")
experiment(GradientBoostingClassifier(), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "GBT(sen....)")
experiment(GradientBoostingClassifier(), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "GBT(e2v+sen)")

GBT(.......) : acc:  0.6132500592932247  F1:  0.6132500592932247


In [None]:
params = {
    'kernel': ['poly', 'rbf', 'sigmoid','linear'],
    'C': [1,1.25,1.5,1.75,2]
}
experiment(SVC(), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "SVM(.......)")
experiment(SVC(), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "SVM(e2v....)")
experiment(SVC(), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "SVM(sen....)")
experiment(SVC(), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "SVM(e2v+sen)")

---

## Ulozenie chybovych dat na dalsiu analyzu

In [143]:
params = {
    'kernel': ['poly', 'rbf', 'sigmoid','linear'],
    'C': [1,1.25,1.5,1.75,2]
}
rand_optim = RandomizedSearchCV(SVC(), param_distributions=params, 
                                          cv=5, random_state=42, n_jobs=-1, n_iter = 20, scoring='f1_micro')
rand_optim.fit(valid_x_e2v_sen, valid_y)
cls = rand_optim.best_estimator_
cls.fit(train_x_e2v_sen, train_y)
predict = cls.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='micro')
print("acc: ", acc," F1: ", f1)



acc:  0.6339668914776211  F1:  0.5992901752670984


In [144]:
predict_df = pd.DataFrame(data=predict,columns=['Pred'])
compare_df = tweets[-len(predict_df):].reset_index(drop=True)
compare_df = pd.concat([compare_df, predict_df], axis=1, sort=False)
different = compare_df[compare_df['Label']!=compare_df['Pred']]
different = different.reset_index(drop=True)
print(f"Different:", len(different), "from all:", len(compare_df))

Different: 597 from all: 1631


In [145]:
different.to_csv('D:\Downloads\DP\data\errors_e2v.csv', index=False, encoding="utf-8-sig")

### Testing

In [39]:
# grad. boosting trees
model = GradientBoostingClassifier()
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='micro')
print("GradBoostTree (): ", acc, f1)

GradBoostTree ():  0.5789473684210527 0.5676951080390289


In [52]:
metrics.f1_score(test_y, predict, average='micro')

0.5676951080390289

In [53]:
metrics.f1_score(test_y, predict, average='micro')
# na stackoverflowe je pekne vysvetlene kedy a preco je to rovnake ako ACC

0.5789473684210527

In [9]:
# SVM
model = SVC(kernel='linear', C = 1.5)
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='micro')
print("SVM (): ", acc, f1)

SVM ():  0.6497929130234699 0.618961868646031


In [26]:
# baseline
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='micro')
print("RandomForest: ", acc, f1)

RandomForest:  0.519558214450069 0.4884329060108204


In [None]:
from sklearn import model_selection
score = model_selection.cross_val_score(RandomForestClassifier(n_estimators=60), train_x, train_y, cv=5)
print(score.mean(), score.std()*2)
score = model_selection.cross_val_score(RandomForestClassifier(n_estimators=60), train_x_e2v, train_y, cv=5)
print(score.mean(), score.std()*2)

In [39]:
limit = int(len(train_x)*0.8)
valid_x = train_x[limit:]
train_x = train_x[:limit]
valid_y = train_y[limit:]
train_y = train_y[:limit]

In [41]:
params = {
    'max_depth': stats.randint(1,15),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': stats.randint(1,4),
    'min_samples_split': stats.randint(2,5),
    'n_estimators': [20,60,100,150]
}

random_optimization = RandomizedSearchCV(RandomForestClassifier(random_state=0), param_distributions=params, 
                                          cv=5, random_state=42, n_jobs=-1, n_iter = 30, scoring='f1_micro')

random_optimization.fit(valid_x, valid_y)


# RandomForest
cls = random_optimization.best_estimator_
cls.fit(train_x, train_y)

print("---RandomForest---")         
predict = cls.predict(test_x)
print("Accuracy: ", metrics.accuracy_score(predict, test_y))
print(metrics.f1_score(test_y, predict, average='micro'))
print()
print(cls.get_params)

---RandomForest---
Accuracy:  0.5773219814241486
0.557900552605684

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=14, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)>
