In [1]:
import pandas as pd
import gensim.models as gsm
import phrase2vec as p2v
from utils import create_tweet_vectors, create_emoji_tweets, create_emoji_sentiment

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Nacitanie dat a embeddingov
- E2V data
- emocontext data
- NLP modely

In [2]:
tweets = pd.read_csv('D:\Downloads\DP\data\e2v_data.csv')
# tweets = pd.read_csv('D:\Downloads\DP\data\e2v_data_emoji.csv')

In [111]:
tweets = pd.read_csv('D:\Downloads\DP\data\emocontext.csv')
# tweets = pd.read_csv('D:\Downloads\DP\data\emocontext_emoji.csv')

In [118]:
tweets.head()

Unnamed: 0,first,second,third,Label,Text,Sentiment
0,Don't worry I'm girl,hmm how do I know if you are,What's ur name?,others,Don't worry I'm girl hmm how do I know if you...,0.0
1,When did I?,saw many times i think -_-,No. I never saw you,angry,When did I? saw many times i think -_- No. I n...,0.0
2,By,by Google Chrome,Where you live,others,By by Google Chrome Where you live,0.0
3,U r ridiculous,I might be ridiculous but I am telling the truth.,U little disgusting whore,angry,U r ridiculous I might be ridiculous but I am ...,0.0
4,Just for time pass,wt do u do 4 a living then,Maybe,others,Just for time pass wt do u do 4 a living then ...,0.0


In [103]:
phrase_model_en = p2v.Phrase2Vec(
    300,
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_en.vec', binary=False),
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\emoji2vec300.bin', binary=True)
)

phrase_model_no_e2v_en = p2v.Phrase2Vec(
    300,
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_en.vec', binary=False)
)

17948 | INFO | loading projection weights from D:\Downloads\DP\word2vec300_en.vec
17948 | INFO | loaded (2000000, 300) matrix from D:\Downloads\DP\word2vec300_en.vec


In [None]:
# verzia pre slovencinu

# phrase_model_sk = p2v.Phrase2Vec(
#     300,
#     gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_sk.vec', binary=False),
#     gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\emoji2vec300.bin', binary=True)
# )

In [4]:
# vytvorenie subdatasetu s emotikonmi

# create_emoji_tweets(tweets, phrase_model_en.emojiVecModel, 'D:\Downloads\DP\data\e2v_data_emoji.csv')
# create_emoji_tweets(emocontext, phrase_model_en.emojiVecModel, 'D:\Downloads\DP\data\emocontext_emoji.csv')

## Priprava dat
- rozdelenie dat v zadanom pomere na podmnoziny, s rozdelenim labelov (predikovana hodnota)

In [144]:
# pri EMOCONTEXT su pouzite niektore nove emotikony -> nahradime
emo_mapping = {
    '🙂':'',
    '🙁':'😕',
    '🤣':'😂',
    '🤐':'😬',
    '🙄':'😏',
    '🍾':'🍹',
    '🤗':'☺',
    '🤔':'😏',
    '🤡':'🃏',
    '🛰':'',
    '🤑':'💰',
    '\u200d':'',
    '🤥':'😢',
    '🤕':'',
    '🖕':'',
    '🤦':'',
    '🕺':'',
    '🏕':'',
    '🙃':'',
    '🤒':'',
    '🏣':'',
    '🤷':'💁',
    '🤢':'',
    '🏖':'',
   '🏋':'',
    '🤘':'',
    '🤖':'',
    '⏸':''
}

for i, j in emo_mapping.items():
    tweets['Text'] = tweets['Text'].str.replace(i, j)

In [146]:
tweets = create_emoji_sentiment(tweets, phrase_model_en.emojiVecModel)

In [147]:
tweets = tweets.sample(frac=1).reset_index(drop=True)

train_x, train_y, valid_x, valid_y, test_x, test_y = create_tweet_vectors(tweets, phrase_model_no_e2v_en, 0.7, False)
train_x_e2v, _, valid_x_e2v, _, test_x_e2v, _ = create_tweet_vectors(tweets, phrase_model_en, 0.7, False)

train_x_sen, _, valid_x_sen, _, test_x_sen, _ = create_tweet_vectors(tweets, phrase_model_no_e2v_en, 0.7, True)
train_x_e2v_sen, _, valid_x_e2v_sen, _, test_x_e2v_sen, _ = create_tweet_vectors(tweets, phrase_model_en, 0.7, True)

## Vader - ziskanie baseline-u

In [3]:
vader = SentimentIntensityAnalyzer()

In [4]:
tweets['Lex'] = tweets['Text'].apply(lambda x: vader.polarity_scores(x)['compound'])
tweets['Lex_label'] = tweets['Lex'].apply(lambda x: 'Positive' if x > 0.05 else ('Neutral' if x > -0.05 else 'Negative')) 
# hranice 0.05 su dane od autora

In [9]:
metrics.accuracy_score(tweets['Label'], tweets['Lex_label'])

0.5431198625365717

In [10]:
metrics.f1_score(tweets['Label'], tweets['Lex_label'], average='weighted')

0.5400893160942533

## Skumanie chybovosti

In [8]:
# SVM
model = SVC(kernel='linear', C = 1.5)
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("SVM (): ", acc, f1)

SVM ():  0.6497929130234699 0.618961868646031


In [9]:
tweets.tail(10)

Unnamed: 0.1,Unnamed: 0,Id,Label,Text,Sentiment,Lex,Lex_label
10855,10855,514973201596547072,Positive,@JennaSeychel pls bring my backpack I need it ...,0.221,0.4404,Positive
10856,10856,514617822430134272,Positive,RT @michelleswid: life is good 😊😊😊😊,1.935,0.9774,Positive
10857,10857,511602407265284097,Negative,And I like it that way gives me less time to t...,0.664,0.3612,Positive
10858,10858,511666785636986880,Neutral,Love And Hip Hop Hollwood 🙌🙌 Some Cold Chicks ...,1.852,0.8807,Positive
10859,10859,513299447833690112,Neutral,"RT @_zach5: “@MrExposed: ""Female Intuition"" ht...",0.663,0.8271,Positive
10860,10860,514575489306882048,Positive,My dream job is to work at FedEx 😂,0.221,0.5994,Positive
10861,10861,513283400439181312,Positive,When @Shauski asks me to take a photo of him w...,0.63,0.6705,Positive
10862,10862,513082170295201792,Negative,@TiaSoSolid @MadiMego 😂😩 you threw that shit up!,-0.147,-0.4753,Negative
10863,10863,514262405506146304,Negative,The Stupid Things I Do 🙈 I Do It For You 😩😘😍,1.445,0.0772,Positive
10864,10864,514914422629101569,Positive,RT @SamiraIbrahimx: @MissKhynatNisa I'm CRYJNG...,1.326,0.9378,Positive


In [10]:
# prediction from SVM
predict[-10:]

array(['Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Negative', 'Positive', 'Positive'],
      dtype='<U8')

In [19]:
compare_df  = tweets[-len(predict_df):].reset_index(drop=True)

In [17]:
predict_df = pd.DataFrame(data=predict,columns=['Pred'])

In [22]:
compare_df = pd.concat([compare_df, predict_df], axis=1, sort=False)

In [29]:
compare_different = compare_df[compare_df['Label']!=compare_df['Pred']]

In [31]:
len(compare_different)

761

In [25]:
len(compare_df)

2173

In [33]:
compare_different.to_csv('D:\Downloads\DP\data\compare_results.csv', index=False, encoding="utf-8-sig")

## Predikcie (NB, RF, GBT)
- overenie uspesnosti modelov:
    - model (w2v)
    - model (w2v, e2v) 
    - model (w2v, senti)
    - model (w2v, e2v, senti)

In [105]:
def experiment_classic(model, train_x, train_y, test_x, test_y, text):
    model.fit(train_x, train_y)
    predict = model.predict(test_x)
    acc = metrics.accuracy_score(test_y, predict)
    f1 = metrics.f1_score(test_y, predict, average='weighted')
    print(text, ":   acc: ", acc," F1: ", f1)

In [106]:
def experiment(model, params, train_x, train_y, valid_x, valid_y, test_x, test_y, text):
    rand_optim = RandomizedSearchCV(model, param_distributions=params, 
                                          cv=5, random_state=42, n_jobs=-1, n_iter = 30, scoring='f1_weighted')
    rand_optim.fit(valid_x, valid_y)
    cls = rand_optim.best_estimator_
    cls.fit(train_x, train_y)
    predict = cls.predict(test_x)
    acc = metrics.accuracy_score(test_y, predict)
    f1 = metrics.f1_score(test_y, predict, average='weighted')
    print(text, ": acc: ", acc," F1: ", f1)

In [148]:
experiment_classic(GaussianNB(), train_x, train_y, test_x, test_y, "NB(.......)")
experiment_classic(GaussianNB(), train_x_e2v, train_y, test_x_e2v, test_y, "NB(e2v....)")
experiment_classic(GaussianNB(), train_x_sen, train_y, test_x_sen, test_y, "NB(sen....)")
experiment_classic(GaussianNB(), train_x_e2v_sen, train_y, test_x_e2v_sen, test_y, "NB(e2v+sen)")

NB(.......) :   acc:  0.40384615384615385  F1:  0.4148528449998235
NB(e2v....) :   acc:  0.40340406719717065  F1:  0.41230589749011687
NB(sen....) :   acc:  0.40738284703801947  F1:  0.41638254147705095
NB(e2v+sen) :   acc:  0.40893015030946067  F1:  0.41760145118807346


In [149]:
params = {
    'max_depth': stats.randint(1,15),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': stats.randint(1,4),
    'min_samples_split': stats.randint(2,5),
    'n_estimators': [20,60,100,150]
}
experiment(RandomForestClassifier(random_state=0), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "RF(.......)")
experiment(RandomForestClassifier(random_state=0), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "RF(e2v....)")
experiment(RandomForestClassifier(random_state=0), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "RF(sen....)")
experiment(RandomForestClassifier(random_state=0), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "RF(e2v+sen)")

RF(.......) : acc:  0.580238726790451  F1:  0.5237994581107316
RF(e2v....) : acc:  0.5904067197170646  F1:  0.5355462551074105
RF(sen....) : acc:  0.600132625994695  F1:  0.5461245555800168




RF(e2v+sen) : acc:  0.6069849690539346  F1:  0.5515246932090708


In [150]:
params = {
    'max_depth': stats.randint(1,15),
    'learning_rate': [0.1,0.125,0.15,0.175,0.2,0.225,0.25],
    'min_samples_leaf': stats.randint(1,5),
    'min_samples_split' : stats.randint(2,10),
    'n_estimators' : [20,60,100,150]
}
experiment(GradientBoostingClassifier(), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "GBT(.......)")
experiment(GradientBoostingClassifier(), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "GBT(e2v....)")
experiment(GradientBoostingClassifier(), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "GBT(sen....)")
experiment(GradientBoostingClassifier(), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "GBT(e2v+sen)")

GBT(.......) : acc:  0.6602564102564102  F1:  0.6440337488465383
GBT(e2v....) : acc:  0.6927497789566756  F1:  0.6838635656952176
GBT(sen....) : acc:  0.6799292661361627  F1:  0.6694423832323089




GBT(e2v+sen) : acc:  0.7013704686118479  F1:  0.6944036553872569


In [151]:
params = {
    'kernel': ['poly', 'rbf', 'sigmoid','linear'],
    'C': [1,1.25,1.5,1.75,2]
}
experiment(SVC(), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "SVM(.......)")
experiment(SVC(), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "SVM(e2v....)")
experiment(SVC(), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "SVM(sen....)")
experiment(SVC(), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "SVM(e2v+sen)")



SVM(.......) : acc:  0.7031388152077808  F1:  0.6908288914366676




SVM(e2v....) : acc:  0.7155172413793104  F1:  0.7052029118913062




SVM(sen....) : acc:  0.7066755083996463  F1:  0.6962512386082086




SVM(e2v+sen) : acc:  0.7172855879752431  F1:  0.7077415400476647


---

## Predikcie (NB, RF, GBT, SVM) - ONLY EMOJI
- overenie uspesnosti modelov:
    - model (w2v)
    - model (w2v, e2v) 
    - model (w2v, senti)
    - model (w2v, e2v, senti)

In [26]:
experiment_classic(GaussianNB(), train_x, train_y, test_x, test_y, "NB(.......)")
experiment_classic(GaussianNB(), train_x_e2v, train_y, test_x_e2v, test_y, "NB(e2v....)")
experiment_classic(GaussianNB(), train_x_sen, train_y, test_x_sen, test_y, "NB(sen....)")
experiment_classic(GaussianNB(), train_x_e2v_sen, train_y, test_x_e2v_sen, test_y, "NB(e2v+sen)")

NB(.......) :   acc:  0.3905579399141631  F1:  0.35613741526254494
NB(e2v....) :   acc:  0.4034334763948498  F1:  0.3749824058998293
NB(sen....) :   acc:  0.4083384426732066  F1:  0.3797264388217386
NB(e2v+sen) :   acc:  0.42427958307786634  F1:  0.40238083174135625


In [27]:
params = {
    'max_depth': stats.randint(1,15),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': stats.randint(1,4),
    'min_samples_split': stats.randint(2,5),
    'n_estimators': [20,60,100,150]
}
experiment(RandomForestClassifier(random_state=0), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "RF(.......)")
experiment(RandomForestClassifier(random_state=0), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "RF(e2v....)")
experiment(RandomForestClassifier(random_state=0), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "RF(sen....)")
experiment(RandomForestClassifier(random_state=0), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "RF(e2v+sen)")

RF(.......) : acc:  0.5548743102391172  F1:  0.5114888577276904
RF(e2v....) : acc:  0.5824647455548743  F1:  0.534839547416412
RF(sen....) : acc:  0.645003065603924  F1:  0.5964671151773155
RF(e2v+sen) : acc:  0.6443899448191294  F1:  0.5974335224381752


In [28]:
params = {
    'max_depth': stats.randint(1,15),
    'learning_rate': [0.1,0.125,0.15,0.175,0.2,0.225,0.25],
    'min_samples_leaf': stats.randint(1,5),
    'min_samples_split' : stats.randint(2,10),
    'n_estimators' : [20,60,100,150]
}
experiment(GradientBoostingClassifier(), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "GBT(.......)")
experiment(GradientBoostingClassifier(), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "GBT(e2v....)")
experiment(GradientBoostingClassifier(), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "GBT(sen....)")
experiment(GradientBoostingClassifier(), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "GBT(e2v+sen)")

GBT(.......) : acc:  0.5757204169221337  F1:  0.5637526421096641
GBT(e2v....) : acc:  0.5947271612507664  F1:  0.5840173928497374
GBT(sen....) : acc:  0.6456161863887185  F1:  0.6301589259138404
GBT(e2v+sen) : acc:  0.6394849785407726  F1:  0.6233259728668625


In [30]:
params = {
    'kernel': ['poly', 'rbf', 'sigmoid','linear'],
    'C': [1,1.25,1.5,1.75,2]
}
experiment(SVC(), params, train_x, train_y, valid_x, valid_y, test_x, test_y, "SVM(.......)")
experiment(SVC(), params, train_x_e2v, train_y, valid_x_e2v, valid_y, test_x_e2v, test_y, "SVM(e2v....)")
experiment(SVC(), params, train_x_sen, train_y, valid_x_sen, valid_y, test_x_sen, test_y, "SVM(sen....)")
experiment(SVC(), params, train_x_e2v_sen, train_y, valid_x_e2v_sen, valid_y, test_x_e2v_sen, test_y, "SVM(e2v+sen)")

SVM(.......) : acc:  0.595340282035561  F1:  0.5570387671764473
SVM(e2v....) : acc:  0.6413243408951563  F1:  0.6101881788247093
SVM(sen....) : acc:  0.6603310852237891  F1:  0.6261564448749984
SVM(e2v+sen) : acc:  0.65113427345187  F1:  0.6218271188087804


### Testing

In [39]:
# grad. boosting trees
model = GradientBoostingClassifier()
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GradBoostTree (): ", acc, f1)

GradBoostTree ():  0.5789473684210527 0.5676951080390289


In [52]:
metrics.f1_score(test_y, predict, average='weighted')

0.5676951080390289

In [53]:
metrics.f1_score(test_y, predict, average='micro')
# na stackoverflowe je pekne vysvetlene kedy a preco je to rovnake ako ACC

0.5789473684210527

In [9]:
# SVM
model = SVC(kernel='linear', C = 1.5)
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("SVM (): ", acc, f1)

SVM ():  0.6497929130234699 0.618961868646031


In [26]:
# baseline
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest: ", acc, f1)

RandomForest:  0.519558214450069 0.4884329060108204


In [None]:
from sklearn import model_selection
score = model_selection.cross_val_score(RandomForestClassifier(n_estimators=60), train_x, train_y, cv=5)
print(score.mean(), score.std()*2)
score = model_selection.cross_val_score(RandomForestClassifier(n_estimators=60), train_x_e2v, train_y, cv=5)
print(score.mean(), score.std()*2)

In [39]:
limit = int(len(train_x)*0.8)
valid_x = train_x[limit:]
train_x = train_x[:limit]
valid_y = train_y[limit:]
train_y = train_y[:limit]

In [41]:
params = {
    'max_depth': stats.randint(1,15),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': stats.randint(1,4),
    'min_samples_split': stats.randint(2,5),
    'n_estimators': [20,60,100,150]
}

random_optimization = RandomizedSearchCV(RandomForestClassifier(random_state=0), param_distributions=params, 
                                          cv=5, random_state=42, n_jobs=-1, n_iter = 30, scoring='f1_weighted')

random_optimization.fit(valid_x, valid_y)


# RandomForest
cls = random_optimization.best_estimator_
cls.fit(train_x, train_y)

print("---RandomForest---")         
predict = cls.predict(test_x)
print("Accuracy: ", metrics.accuracy_score(predict, test_y))
print(metrics.f1_score(test_y, predict, average='weighted'))
print()
print(cls.get_params)

---RandomForest---
Accuracy:  0.5773219814241486
0.557900552605684

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=14, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)>
