In [None]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline

nltk.download("popular")
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()
df = pd.read_csv("train.txt", sep=";")
df

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/seara/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/seara/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/seara/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/seara/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/seara/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/seara/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!

Unnamed: 0,words,mood
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [None]:
def process(sentence):
    tokens = word_tokenize(sentence)
    return " ".join([lemmatizer.lemmatize(w) for w in tokens])

In [None]:
df["words_processed"] = df["words"].apply(process)

In [None]:
encoder = preprocessing.LabelEncoder()

In [None]:
df["mood_processed"] = encoder.fit_transform(df["mood"])

In [None]:
df

Unnamed: 0,words,mood,words_processed,mood_processed
0,i didnt feel humiliated,sadness,i didnt feel humiliated,4
1,i can go from feeling so hopeless to so damned...,sadness,i can go from feeling so hopeless to so damned...,4
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing a minute to post i feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplac...,love,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,anger,i am feeling grouchy,0
...,...,...,...,...
15995,i just had a very brief time in the beanbag an...,sadness,i just had a very brief time in the beanbag an...,4
15996,i am now turning and i feel pathetic that i am...,sadness,i am now turning and i feel pathetic that i am...,4
15997,i feel strong and good overall,joy,i feel strong and good overall,2
15998,i feel like this was such a rude comment and i...,anger,i feel like this wa such a rude comment and im...,0


In [None]:
X = df["words_processed"]
y = df["mood_processed"]
clf = Pipeline(
    [
        ("vec", TfidfVectorizer(ngram_range=(1, 2))),
        ("clf", LogisticRegression(n_jobs=-1)),
    ]
)
clf.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
clf.predict_proba(X)

array([[0.06482722, 0.04761966, 0.12796872, 0.0300404 , 0.70573214,
        0.02381187],
       [0.12512628, 0.11360437, 0.23576961, 0.05641637, 0.44297503,
        0.02610835],
       [0.45393376, 0.0967572 , 0.19479018, 0.05380643, 0.17427396,
        0.02643847],
       ...,
       [0.06327947, 0.06048753, 0.67144781, 0.04542168, 0.13446833,
        0.02489517],
       [0.28570239, 0.04631833, 0.45836394, 0.04822022, 0.1371273 ,
        0.02426782],
       [0.10716169, 0.1024863 , 0.12253985, 0.04035332, 0.60239388,
        0.02506497]])

In [None]:
roc_auc_score(y, clf.predict_proba(X), multi_class="ovr")

0.9990991953878887

In [None]:
test_df = pd.read_csv("test.txt", sep=";")

In [None]:
test_df["words_processed"] = test_df["words"].apply(process)
test_df["mood_processed"] = encoder.transform(test_df["mood"])

In [None]:
test_df

Unnamed: 0,words,mood,words_processed,mood_processed
0,im feeling rather rotten so im not very ambiti...,sadness,im feeling rather rotten so im not very ambiti...,4
1,im updating my blog because i feel shitty,sadness,im updating my blog because i feel shitty,4
2,i never make her separate from me because i do...,sadness,i never make her separate from me because i do...,4
3,i left with my bouquet of red and yellow tulip...,joy,i left with my bouquet of red and yellow tulip...,2
4,i was feeling a little vain when i did this one,sadness,i wa feeling a little vain when i did this one,4
...,...,...,...,...
1995,i just keep feeling like someone is being unki...,anger,i just keep feeling like someone is being unki...,0
1996,im feeling a little cranky negative after this...,anger,im feeling a little cranky negative after this...,0
1997,i feel that i am useful to my people and that ...,joy,i feel that i am useful to my people and that ...,2
1998,im feeling more comfortable with derby i feel ...,joy,im feeling more comfortable with derby i feel ...,2


In [None]:
X_test = test_df["words_processed"]
y_test = test_df["mood_processed"]

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.65      0.75       275
           1       0.87      0.69      0.77       224
           2       0.75      0.96      0.84       695
           3       0.86      0.42      0.56       159
           4       0.83      0.91      0.87       581
           5       1.00      0.18      0.31        66

    accuracy                           0.80      2000
   macro avg       0.87      0.64      0.68      2000
weighted avg       0.82      0.80      0.79      2000



In [None]:
roc_auc_score(y_test, clf.predict_proba(X_test), multi_class="ovr")

0.9816087276982727

In [None]:
input = "What? You can speak differently?"
clf.predict_proba([input])

array([[0.10473905, 0.11840997, 0.49688548, 0.08184327, 0.17560562,
        0.02251661]])

In [None]:
clf.predict_proba([input])[0]

array([0.08816032, 0.09800626, 0.14435178, 0.04891104, 0.59933906,
       0.02123154])

In [None]:
dict(zip(encoder.classes_, [x * 100 for x in clf.predict_proba([input])[0]]))

{'anger': 10.473904858695555,
 'fear': 11.840996602864807,
 'joy': 49.68854814317975,
 'love': 8.184327129310509,
 'sadness': 17.560561817892584,
 'surprise': 2.2516614480567934}