In [1]:
# primary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# text processing libraries
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Misc
from sklearn.metrics import roc_auc_score

In [2]:
# Load the pre-cleaned train/test splits. The raw frames are kept under their
# own names and never mutated, so this cell can be re-run without a KeyError
# from columns that were already dropped.
train_raw = pd.read_csv("./Dataset/cleaned_training.csv")
test_raw = pd.read_csv("./Dataset/cleaned_test.csv")

# Split the target from the features. `columns=` is used instead of an `axis`
# argument so the intent is explicit (the previous code passed `axis=True`,
# which only worked because True == 1).
y_train = train_raw["sentiment"]
X_train = train_raw.drop(columns=["Unnamed: 0", "sentiment"])

# The test file additionally carries a "company" column, which is dropped too.
y_test = test_raw["sentiment"]
X_test = test_raw.drop(columns=["Unnamed: 0", "company", "sentiment"])

X_train.head()

Unnamed: 0,tweets
0,im get on borderland and i will murder you all...
1,the biggest dissappoin in my life came out a y...
2,rockhard la varlop rare power handsom jackpot ...
3,appreci the sonic concept praxi valenzuela and...


In [3]:
# Encode the sentiment labels as integers. The encoder learns its classes from
# the training labels only and the same mapping is then applied to the test
# labels.
sentiment_encoder = LabelEncoder().fit(y_train)
y_train = sentiment_encoder.transform(y_train)
y_test = sentiment_encoder.transform(y_test)

# Show the encoded training labels alongside the label -> integer ordering.
(y_train, sentiment_encoder.classes_)

(array([3, 1, 2, 0]),
 array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object))

### TFIDF (Implementation)
* Scikit-learn's `TfidfVectorizer` accepts a variety of parameters. For the full list, see the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).
* sublinear_tf replaces tf with 1 + log(tf) which proves to be important for scaling the values.
* Its output is of the format (A, B) C where A is the document index, B is the word vector index and C is the tfidf score.
* Use .toarray() function to obtain only the tfidf scores.
* Words that appear in the training data but not in a given test document keep their column in the test matrix; sklearn simply assigns them a tfidf score of 0.
* Similarly, words that appear in the test data but not in the training vocabulary are ignored by `transform`, so no tfidf scores are included for them.

In [4]:
# Build tf-idf features.
# A printed sparse result lists its entries as (A, B) C, where A is the
# document index, B is the word vector (column) index and C is the tfidf score.

# Plain list of the training tweets. Nothing in this cell reads it; it is kept
# under the same name in case a later cell relies on it.
temp_arr = X_train['tweets'].tolist()

# min_df = 0.05 keeps only terms found in at least 5% of the training
# documents; sublinear_tf = True replaces tf with 1 + log(tf).
vectorizer = TfidfVectorizer(min_df = 0.05, sublinear_tf = True)

# The vocabulary and idf weights are fitted on the training tweets only; the
# test tweets are transformed with that fitted vocabulary.
tfidf_scores_train = vectorizer.fit_transform(X_train['tweets'])
# NOTE(review): the test frame is read through 'cleaned_tweets' while the
# training frame uses 'tweets' -- confirm the test CSV really names it this way.
tfidf_scores_test = vectorizer.transform(X_test['cleaned_tweets'])

In [5]:
# Fit a multinomial Naive Bayes classifier on the training tf-idf matrix
# (fit returns the estimator itself), then get one probability per class for
# every test document.
model = MultinomialNB().fit(tfidf_scores_train, y_train)
preds = model.predict_proba(tfidf_scores_test)
preds

array([[0.25363334, 0.24970555, 0.24690678, 0.24975432],
       [0.24784358, 0.25100818, 0.25151135, 0.24963689],
       [0.25037815, 0.25162096, 0.24877448, 0.24922641],
       ...,
       [0.25026768, 0.25037421, 0.24763634, 0.25172178],
       [0.24965188, 0.25009444, 0.24863578, 0.2516179 ],
       [0.24699647, 0.25175029, 0.25362663, 0.24762661]])

In [6]:
# Multiclass ROC AUC from the predicted class probabilities, using the
# one-vs-one ('ovo') scheme.
score = roc_auc_score(
    y_true=y_test,
    y_score=preds,
    multi_class='ovo',
)
score

0.898402468062414