### Baseline Models - Naive Bayes and Logistic Regression
Sentiment Pipelines
* Count Vectorizer
* TF-IDF Transformer
* MultinomialNB/LogisticRegression Model

Out-of-Sample Performance (Macro Avg F1-Score on the 25% hold-out set, 1,075 comments)
* MultinomialNB - 0.5272
* LogisticRegression - 0.5371

1-Month Comments Inference Time (18,676 comments, single timed run)
* MultinomialNB - 0.5341 seconds
* LogisticRegression - 0.6245 seconds

In [None]:
import pandas as pd
# Read the labelled corpus from CSV and print a column/dtype/non-null summary.
df = pd.read_csv('tweets_comments_combined_df.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4300 entries, 0 to 4299
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    4300 non-null   object
 1   sentiment  4300 non-null   object
dtypes: object(2)
memory usage: 67.3+ KB


In [None]:
# Class balance check: number of rows per sentiment label.
df['sentiment'].value_counts()

positive    1650
neutral     1337
negative    1313
Name: sentiment, dtype: int64

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Re-read the corpus from disk, then normalise the text
# (trim surrounding whitespace, lowercase).
df = pd.read_csv('tweets_comments_combined_df.csv')
df['comment'] = df['comment'].str.strip().str.lower()

X = df['comment']
y = df['sentiment']

# 75/25 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

# 3 n-gram ranges x 2 idf settings x 2 norms x 3 smoothing values = 36 candidates.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's
# default score (accuracy for classifiers), while the notebook summary reports
# macro-avg F1. Consider scoring='f1_macro' -- it may change the selected model and
# therefore the reported numbers, so it is deliberately not changed here.
# n_jobs=-1 spreads the 36 x 10 cross-validation fits over all cores; the chosen
# hyper-parameters and the refitted model are the same as with the serial search.
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, n_jobs=-1)
clf.fit(X_train, y_train)

# Per-class and averaged precision/recall/F1 on the held-out 25%.
print(classification_report(y_test, clf.predict(X_test), digits=4))

              precision    recall  f1-score   support

    negative     0.5502    0.5045    0.5263       337
     neutral     0.5133    0.3602    0.4234       322
    positive     0.5593    0.7260    0.6318       416

    accuracy                         0.5470      1075
   macro avg     0.5409    0.5302    0.5272      1075
weighted avg     0.5426    0.5470    0.5363      1075



In [None]:
# 1-month inference set: load the comments that the fitted models are timed on below.
inference = pd.read_csv('inference_1month_comments.csv')
inference.shape

(18676, 2)

In [None]:
import time

# Wall-clock one prediction pass of the grid-searched Naive Bayes pipeline over
# the 1-month comments. The predictions themselves are discarded; only the
# elapsed time is of interest.
t1 = time.perf_counter()
clf.predict(inference['comment'])
t2 = time.perf_counter()

elapsed = t2 - t1
print('time taken to run:', elapsed)

time taken to run: 0.5341051159999779


In [None]:
from scipy.stats import loguniform  # NOTE(review): not used in this cell; kept in case a later cell relies on it
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Same data preparation and split as the Naive Bayes cell (same file, same
# test_size, same seed), so both models are scored on the same hold-out rows.
df = pd.read_csv('tweets_comments_combined_df.csv')
df['comment'] = df['comment'].str.strip().str.lower()

X = df['comment']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Token counts -> TF-IDF weighting -> logistic regression (liblinear solver).
log_regtext_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(solver='liblinear'))])

# 3 n-gram ranges x 2 idf settings x 2 norms x 5 values of C = 60 candidates.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': [1e-5, 1e-3, 1, 10, 100]
}

# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's
# default score (accuracy for classifiers), while the notebook summary reports
# macro-avg F1. Consider scoring='f1_macro' -- it may change the selected model and
# therefore the reported numbers, so it is deliberately not changed here.
# n_jobs=-1 spreads the 60 x 10 cross-validation fits over all cores; the chosen
# hyper-parameters and the refitted model are the same as with the serial search.
logreg_clf = GridSearchCV(log_regtext_clf, tuned_parameters, cv=10, n_jobs=-1)
logreg_clf.fit(X_train, y_train)

# Per-class and averaged precision/recall/F1 on the held-out 25%.
print(classification_report(y_test, logreg_clf.predict(X_test), digits=4))

              precision    recall  f1-score   support

    negative     0.5439    0.4777    0.5087       337
     neutral     0.5139    0.4596    0.4852       322
    positive     0.5703    0.6731    0.6174       416

    accuracy                         0.5479      1075
   macro avg     0.5427    0.5368    0.5371      1075
weighted avg     0.5451    0.5479    0.5437      1075



In [None]:
import time

# Wall-clock one prediction pass of the grid-searched logistic-regression
# pipeline over the 1-month comments. The predictions themselves are discarded;
# only the elapsed time is of interest.
t1 = time.perf_counter()
logreg_clf.predict(inference['comment'])
t2 = time.perf_counter()

elapsed = t2 - t1
print('time taken to run:', elapsed)

time taken to run: 0.6244980620000433
