In [1]:
import pandas as pd
import gensim.models as gsm
import phrase2vec as p2v
from utils import create_tweet_vectors, create_emoji_tweets, create_emoji_sentiment

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

from emosent import get_emoji_sentiment_rank
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## Nacitanie dat a embeddingov

In [2]:
# tweets = pd.read_csv('D:\Downloads\DP\data\e2v_data.csv')
tweets = pd.read_csv('D:\Downloads\DP\data\e2v_data_emoji.csv')

In [3]:
phrase_model_en = p2v.Phrase2Vec(
    300,
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_en.vec', binary=False),
    gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\emoji2vec300.bin', binary=True)
)

# phrase_model_no_e2v_en = p2v.Phrase2Vec(
#     300,
#     gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_en.vec', binary=False)
# )

19520 | INFO | loading projection weights from D:\Downloads\DP\word2vec300_en.vec
19520 | INFO | loaded (2000000, 300) matrix from D:\Downloads\DP\word2vec300_en.vec
19520 | INFO | loading projection weights from D:\Downloads\DP\emoji2vec300.bin
19520 | INFO | loaded (1661, 300) matrix from D:\Downloads\DP\emoji2vec300.bin


In [None]:
# pre SK potom iny notebook

# phrase_model_sk = p2v.Phrase2Vec(
#     300,
#     gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\word2vec300_sk.vec', binary=False),
#     gsm.KeyedVectors.load_word2vec_format('D:\Downloads\DP\emoji2vec300.bin', binary=True)
# )

In [4]:
# create_emoji_tweets(tweets, phrase_model_en.emojiVecModel, 'D:\Downloads\DP\data\e2v_data_emoji.csv')

## Priprava dat
- rozdelenie dat v zadanom pomere na trenovaciu / testovaniu mnozinu, s rozdelenim labelov (predikovana hodnota)

In [4]:
tweets = create_emoji_sentiment(tweets, phrase_model_en.emojiVecModel)

In [5]:
# tweets = tweets.sample(frac=1).reset_index(drop=True)

# train_x, train_y, test_x, test_y = create_tweet_vectors(tweets, phrase_model_no_e2v_en, 0.8, False)
# train_x_e2v, _, test_x_e2v, _ = create_tweet_vectors(tweets, phrase_model_en, 0.8, False)

# train_x_sen, _, test_x_sen, _ = create_tweet_vectors(tweets, phrase_model_no_e2v_en, 0.8, True)
train_x_e2v_sen, train_y, test_x_e2v_sen, test_y = create_tweet_vectors(tweets, phrase_model_en, 0.8, True)

## Vader - ziskanie baseline-u

In [6]:
vader = SentimentIntensityAnalyzer()

In [7]:
tweets['Lex'] = tweets['Text'].apply(lambda x: vader.polarity_scores(x)['compound'])
tweets['Lex_label'] = tweets['Lex'].apply(lambda x: 'Positive' if x > 0.05 else ('Neutral' if x > -0.05 else 'Negative')) 
# hranice 0.05 su dane od autora

In [11]:
metrics.accuracy_score(tweets['Label'], tweets['Lex_label'])

0.5722963644730787

In [12]:
metrics.f1_score(tweets['Label'], tweets['Lex_label'], average='weighted')

0.538508232708095

## Skumanie chybovosti

In [8]:
# SVM
model = SVC(kernel='linear', C = 1.5)
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("SVM (): ", acc, f1)

SVM ():  0.6497929130234699 0.618961868646031


In [9]:
tweets.tail(10)

Unnamed: 0.1,Unnamed: 0,Id,Label,Text,Sentiment,Lex,Lex_label
10855,10855,514973201596547072,Positive,@JennaSeychel pls bring my backpack I need it ...,0.221,0.4404,Positive
10856,10856,514617822430134272,Positive,RT @michelleswid: life is good 😊😊😊😊,1.935,0.9774,Positive
10857,10857,511602407265284097,Negative,And I like it that way gives me less time to t...,0.664,0.3612,Positive
10858,10858,511666785636986880,Neutral,Love And Hip Hop Hollwood 🙌🙌 Some Cold Chicks ...,1.852,0.8807,Positive
10859,10859,513299447833690112,Neutral,"RT @_zach5: “@MrExposed: ""Female Intuition"" ht...",0.663,0.8271,Positive
10860,10860,514575489306882048,Positive,My dream job is to work at FedEx 😂,0.221,0.5994,Positive
10861,10861,513283400439181312,Positive,When @Shauski asks me to take a photo of him w...,0.63,0.6705,Positive
10862,10862,513082170295201792,Negative,@TiaSoSolid @MadiMego 😂😩 you threw that shit up!,-0.147,-0.4753,Negative
10863,10863,514262405506146304,Negative,The Stupid Things I Do 🙈 I Do It For You 😩😘😍,1.445,0.0772,Positive
10864,10864,514914422629101569,Positive,RT @SamiraIbrahimx: @MissKhynatNisa I'm CRYJNG...,1.326,0.9378,Positive


In [10]:
# prediction from SVM
predict[-10:]

array(['Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Negative', 'Positive', 'Positive'],
      dtype='<U8')

In [19]:
compare_df  = tweets[-len(predict_df):].reset_index(drop=True)

In [17]:
predict_df = pd.DataFrame(data=predict,columns=['Pred'])

In [22]:
# take last N rows = test_x
# concat with predictions
compare_df = pd.concat([compare_df, predict_df], axis=1, sort=False)

In [29]:
compare_different = compare_df[compare_df['Label']!=compare_df['Pred']]

In [31]:
len(compare_different)

761

In [25]:
len(compare_df)

2173

In [33]:
compare_different.to_csv('D:\Downloads\DP\data\compare_results.csv', index=False, encoding="utf-8-sig")

## Predikcie (NB, RF, GBT)
- overenie uspesnosti modelov:
    - model (w2v)
    - model (w2v, e2v) 
    - model (w2v, senti)
    - model (w2v, e2v, senti)

In [7]:
def experiment(model, train_x, train_y, test_x, test_y, text):
    model.fit(train_x, train_y)
    predict = model.predict(test_x)
    acc = metrics.accuracy_score(test_y, predict)
    f1 = metrics.f1_score(test_y, predict, average='weighted')
    print(text, ":   acc: ", acc," F1: ", f1)

In [10]:
experiment(GaussianNB(), train_x, train_y, test_x, test_y, "NB()\t\t")
experiment(GaussianNB(), train_x_e2v, train_y, test_x_e2v, test_y, "NB(e2v)\t\t")
experiment(GaussianNB(), train_x_sen, train_y, test_x_sen, test_y, "NB(sen)\t\t")
experiment(GaussianNB(), train_x_e2v_sen, train_y, test_x_e2v_sen, test_y, "NB(e2v+sen)\t")

NB()		 :   acc:  0.42995356037151705  F1:  0.42778169518857045
NB(e2v)		 :   acc:  0.42701238390092877  F1:  0.4238146838404375
NB(sen)		 :   acc:  0.4310371517027864  F1:  0.42789735048711697
NB(e2v+sen)	 :   acc:  0.42863777089783284  F1:  0.42509403937121487


In [9]:
model = GaussianNB()
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (): ", acc, f1)

NaiveBayes ():  0.44187306501547985 0.43706657918652836


In [10]:
model = GaussianNB()
model.fit(train_x_e2v, train_y)

predict = model.predict(test_x_e2v)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (e2v): ", acc, f1)

NaiveBayes (e2v):  0.4401702786377709 0.4354977083251953


In [11]:
model = GaussianNB()
model.fit(train_x_sen, train_y)

predict = model.predict(test_x_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (senti): ", acc, f1)

NaiveBayes (senti):  0.443343653250774 0.43837453445499663


In [12]:
model = GaussianNB()
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (e2v, senti): ", acc, f1)

NaiveBayes (e2v, senti):  0.4425696594427245 0.4379606582974239


---

In [13]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (): ", acc, f1)

RandomForest ():  0.5706656346749226 0.5554340545783614


In [14]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x_e2v, train_y)

predict = model.predict(test_x_e2v)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (e2v): ", acc, f1)

RandomForest (e2v):  0.5743034055727554 0.5597411955080135


In [15]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x_sen, train_y)

predict = model.predict(test_x_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (senti): ", acc, f1)

RandomForest (senti):  0.5856037151702786 0.5719939559044941


In [16]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (e2v, senti): ", acc, f1)

RandomForest (e2v, senti):  0.583513931888545 0.5705112016683248


---

In [37]:
model = GradientBoostingClassifier()
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (): ", acc, f1)

GBTree ():  0.5951238390092879 0.5859329263823542


In [38]:
model = GradientBoostingClassifier()
model.fit(train_x_e2v, train_y)

predict = model.predict(test_x_e2v)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (e2v): ", acc, f1)

GBTree (e2v):  0.5955882352941176 0.5861045678115423


In [39]:
model = GradientBoostingClassifier()
model.fit(train_x_sen, train_y)

predict = model.predict(test_x_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (senti): ", acc, f1)

GBTree (senti):  0.6089009287925696 0.6013019094552711


In [40]:
model = GradientBoostingClassifier()
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (e2v, senti): ", acc, f1)

GBTree (e2v, senti):  0.6095201238390093 0.6016871407966421


In [14]:
experiment(SVC(kernel='linear', C = 1.5), train_x, train_y, test_x, test_y, "SVM()\t\t")
experiment(SVC(kernel='linear', C = 1.5), train_x_e2v, train_y, test_x_e2v, test_y, "SVM(e2v)\t")
experiment(SVC(kernel='linear', C = 1.5), train_x_sen, train_y, test_x_sen, test_y, "SVM(sen)\t")
experiment(SVC(kernel='linear', C = 1.5), train_x_e2v_sen, train_y, test_x_e2v_sen, test_y, "SVM(e2v+sen)\t")

SVM()		 :   acc:  0.60828173374613  F1:  0.5927923262036808
SVM(e2v)	 :   acc:  0.6196594427244582  F1:  0.6074330937009805
SVM(sen)	 :   acc:  0.6194272445820433  F1:  0.6068696909698613
SVM(e2v+sen)	 :   acc:  0.621594427244582  F1:  0.6098503611301096


## Predikcie (NB, RF, GBT, SVM) - ONLY EMOJI
- overenie uspesnosti modelov:
    - model (w2v)
    - model (w2v, e2v) 
    - model (w2v, senti)
    - model (w2v, e2v, senti)

In [21]:
model = GaussianNB()
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (): ", acc, f1)

NaiveBayes ():  0.37551771744132534 0.3420771480278876


In [22]:
model = GaussianNB()
model.fit(train_x_e2v, train_y)

predict = model.predict(test_x_e2v)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (e2v): ", acc, f1)

NaiveBayes (e2v):  0.3888633225954901 0.3621191730906383


In [23]:
model = GaussianNB()
model.fit(train_x_sen, train_y)

predict = model.predict(test_x_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (senti): ", acc, f1)

NaiveBayes (senti):  0.393005062126093 0.36410772551971843


In [24]:
model = GaussianNB()
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("NaiveBayes (e2v, senti): ", acc, f1)

NaiveBayes (e2v, senti):  0.4063506672802577 0.3826765102600698


---

In [25]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (): ", acc, f1)

RandomForest ():  0.5250805338242062 0.49820134867490373


In [26]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x_e2v, train_y)

predict = model.predict(test_x_e2v)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (e2v): ", acc, f1)

RandomForest (e2v):  0.548550391164289 0.5192664818626386


In [27]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x_sen, train_y)

predict = model.predict(test_x_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (senti): ", acc, f1)

RandomForest (senti):  0.6028531983433042 0.5735423820802442


In [28]:
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest (e2v, senti): ", acc, f1)

RandomForest (e2v, senti):  0.6198803497468937 0.5887826698314691


---

In [30]:
model = GradientBoostingClassifier()
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (): ", acc, f1)

GBTree ():  0.553612517257248 0.5292395475153364


In [31]:
model = GradientBoostingClassifier()
model.fit(train_x_e2v, train_y)

predict = model.predict(test_x_e2v)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (e2v): ", acc, f1)

GBTree (e2v):  0.5908881730326737 0.5685402524068177


In [32]:
model = GradientBoostingClassifier()
model.fit(train_x_sen, train_y)

predict = model.predict(test_x_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (senti): ", acc, f1)

GBTree (senti):  0.6428900138057985 0.6147398999311509


In [33]:
model = GradientBoostingClassifier()
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GBTree (e2v, senti): ", acc, f1)

GBTree (e2v, senti):  0.6277036355269213 0.5977394598395912


---

In [9]:
experiment(SVC(kernel='linear', C = 1.5), train_x, train_y, test_x, test_y, "SVM()\t\t")
experiment(SVC(kernel='linear', C = 1.5), train_x_e2v, train_y, test_x_e2v, test_y, "SVM(e2v)\t")
experiment(SVC(kernel='linear', C = 1.5), train_x_sen, train_y, test_x_sen, test_y, "SVM(sen)\t")
experiment(SVC(kernel='linear', C = 1.5), train_x_e2v_sen, train_y, test_x_e2v_sen, test_y, "SVM(e2v+sen)\t")

SVM()		 :   acc:  0.553612517257248  F1:  0.5092475874470038
SVM(e2v)		 :   acc:  0.6079153244362633  F1:  0.5713656959478562
SVM(sen)		 :   acc:  0.621260929590428  F1:  0.5732609645388096
SVM(e2v+sen)	 :   acc:  0.6332259549010585  F1:  0.5943614039588038


### Testing

In [10]:
get_emoji_sentiment_rank('😂')['sentiment_score']

0.221

In [50]:
# grad. boosting trees
model = GradientBoostingClassifier()
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("GradBoostTree (): ", acc, f1)

RandomForest ():  0.6062693498452012 0.5962205000538585


In [9]:
# SVM
model = SVC(kernel='linear', C = 1.5)
model.fit(train_x_e2v_sen, train_y)

predict = model.predict(test_x_e2v_sen)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("SVM (): ", acc, f1)

SVM ():  0.6497929130234699 0.618961868646031


In [26]:
# baseline
model = RandomForestClassifier(n_estimators=60)
model.fit(train_x, train_y)

predict = model.predict(test_x)
acc = metrics.accuracy_score(test_y, predict)
f1 = metrics.f1_score(test_y, predict, average='weighted')
print("RandomForest: ", acc, f1)

RandomForest:  0.519558214450069 0.4884329060108204


In [12]:
limit = int(len(train_x)*0.8)
valid_x = train_x[limit:]
train_x = train_x[:limit]
valid_y = train_y[limit:]
train_y = train_y[:limit]

In [19]:
params = {
    'max_depth': stats.randint(1,15),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': stats.randint(1,4),
    'min_samples_split': stats.randint(2,5),
    'n_estimators': [20,60,100,150]
}

random_optimization = RandomizedSearchCV(RandomForestClassifier(random_state=0), param_distributions=params, 
                                          cv=10, random_state=42, n_jobs=-1, n_iter = 30, scoring='f1_weighted')

random_optimization.fit(valid_x, valid_y)


# RandomForest
cls = random_optimization.best_estimator_
cls.fit(train_x, train_y)

print("---RandomForest---")         
predict = cls.predict(test_x)
print("Accuracy: ", metrics.accuracy_score(predict, test_y))
print(metrics.f1_score(test_y, predict, average='weighted'))
print()
print(cls.get_params)

---RandomForest---
Accuracy:  0.5741486068111455
0.5546377512459062

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=14, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)>
