In [1]:
import pandas as pd

# Read at most the first 500,000 rows of the Sentiment140 subset into memory.
df = pd.read_csv(
    "data/sentiment140-subset.csv",
    nrows=500000,
)
# Preview the first rows (the model cells below use the `text` and
# `polarity` columns).
df.head()

Unnamed: 0,polarity,text
0,0,@kconsidder You never tweet
1,0,Sick today coding from the couch.
2,1,"@ChargerJenn Thx for answering so quick,I was ..."
3,1,Wii fit says I've lost 10 pounds since last ti...
4,0,@MrKinetik Not a thing!!! I don't really have...


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [3]:
# Fit a TF-IDF vectorizer on the tweet text, keeping at most 1000 vocabulary
# terms, and encode every tweet against that vocabulary.
vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(df.text)

# Dense feature table: one row per tweet, one column per vocabulary term.
# NOTE(review): .toarray() materializes the full matrix; with up to 500,000
# rows x 1000 columns this is on the order of 4 GB if the values are float64
# (presumably the vectorizer's default dtype) -- confirm the memory budget.
words_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())

# Persist the fitted vectorizer so the same vocabulary/IDF weights can be
# reloaded later. This stays the last expression, so the cell displays the
# list of written file paths returned by joblib.dump. (Only the last
# expression of a cell is displayed; preview words_df.head() in its own cell.)
joblib.dump(vectorizer, 'trained_models/DfFittedVectorizer.sav')

['trained_models/DfFittedVectorizer.sav']

In [4]:
# Feature matrix: the TF-IDF table built from the tweets.
X = words_df
# Target vector: the polarity label stored alongside each tweet.
y = df["polarity"]

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import joblib

In [6]:
%%time
# Create and train a logistic regression
logreg = LogisticRegression(C=1e9, solver='lbfgs', max_iter=1000)
logreg.fit(X, y)

CPU times: user 42.6 s, sys: 0 ns, total: 42.6 s
Wall time: 42.6 s


LogisticRegression(C=1000000000.0, max_iter=1000)

In [7]:
# Serialize the fitted logistic-regression model to disk with joblib; the
# call returns the list of file names written, which the cell displays.
joblib.dump(
    logreg,
    "trained_models/LogRegForSentimentAnalysis.sav",
)

['trained_models/LogRegForSentimentAnalysis.sav']

In [8]:
%%time
# Create and train a multinomial naive Bayes classifier (MultinomialNB) on
# the same features X and labels y used for the logistic regression, with
# the estimator's default settings. %%time reports how long the cell takes.
bayes = MultinomialNB()
# fit() is the last expression, so the fitted estimator is displayed.
bayes.fit(X, y)

CPU times: user 923 ms, sys: 108 µs, total: 923 ms
Wall time: 921 ms


MultinomialNB()

In [9]:
# Serialize the fitted naive Bayes model to disk with joblib; the call
# returns the list of file names written, which the cell displays.
joblib.dump(
    bayes,
    "trained_models/NaiBayesForSentimentAnalysis.sav",
)

['trained_models/NaiBayesForSentimentAnalysis.sav']

In [10]:
# Widen pandas' column display so the full sentences are shown untruncated.
pd.options.display.max_colwidth = 200

# A handful of hand-written sentences, not taken from the training CSV,
# used to try the trained models on; stored in a single `content` column.
unknown = pd.DataFrame(
    data=[
        "I dont know what to think about it",
        "That was fucking awesome dawg!",
        "Goddamn what a miracle!",
        "Son of a bitch!",
    ],
    columns=['content'],
)
unknown

Unnamed: 0,content
0,I dont know what to think about it
1,That was fucking awesome dawg!
2,Goddamn what a miracle!
3,Son of a bitch!


In [11]:
# Print the vocabulary terms the fitted vectorizer kept (capped by
# max_features=1000 when the vectorizer was created); these are the column
# names of the feature tables.
print(vectorizer.get_feature_names_out())

['10' '100' '11' '12' '15' '1st' '20' '2day' '2nd' '30' 'able' 'about'
 'absolutely' 'account' 'actually' 'add' 'after' 'afternoon' 'again' 'ago'
 'agree' 'ah' 'ahh' 'ahhh' 'air' 'airport' 'album' 'all' 'almost' 'alone'
 'along' 'alot' 'already' 'alright' 'also' 'although' 'always' 'am'
 'amazing' 'amp' 'an' 'and' 'annoying' 'another' 'answer' 'any' 'anymore'
 'anyone' 'anything' 'anyway' 'app' 'apparently' 'apple' 'are' 'aren'
 'around' 'as' 'ask' 'asleep' 'ass' 'at' 'ate' 'aw' 'awake' 'awards'
 'away' 'awesome' 'aww' 'awww' 'babe' 'baby' 'back' 'bad' 'band' 'bbq'
 'bday' 'be' 'beach' 'beat' 'beautiful' 'because' 'bed' 'been' 'beer'
 'before' 'behind' 'being' 'believe' 'best' 'bet' 'better' 'big' 'bike'
 'birthday' 'bit' 'black' 'blip' 'blog' 'blue' 'body' 'boo' 'book' 'books'
 'bored' 'boring' 'both' 'bought' 'bout' 'box' 'boy' 'boyfriend' 'boys'
 'break' 'breakfast' 'bring' 'bro' 'broke' 'broken' 'brother' 'brothers'
 'btw' 'bus' 'business' 'busy' 'but' 'buy' 'by' 'bye' 'cake' 'call

In [12]:
# Encode the sample sentences with the already-fitted vectorizer.
# Use transform() rather than fit_transform(): the vocabulary was learned
# from the training tweets and must stay the same so the columns line up
# with what the models were trained on.
unknown_vectors = vectorizer.transform(unknown["content"])

# Dense table with one column per vocabulary term, mirroring words_df.
unknown_words_df = pd.DataFrame(
    data=unknown_vectors.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
unknown_words_df.head()



Unnamed: 0,10,100,11,12,15,1st,20,2day,2nd,30,...,yet,yo,you,your,yours,yourself,youtube,yum,yummy,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
%%time
# Predict using all our models. 

# Logistic Regression predictions + probabilities
unknown['pred_logreg'] = logreg.predict(unknown_words_df)
unknown['pred_logreg_proba'] = logreg.predict_proba(unknown_words_df)[:,1]

# Bayes predictions + probabilities
unknown['pred_bayes'] = bayes.predict(unknown_words_df)
unknown['pred_bayes_proba'] = bayes.predict_proba(unknown_words_df)[:,1]

CPU times: user 18.9 ms, sys: 0 ns, total: 18.9 ms
Wall time: 18.4 ms


In [14]:
# Display the sample sentences together with the prediction and probability
# columns added to `unknown` by the previous cell.
unknown

Unnamed: 0,content,pred_logreg,pred_logreg_proba,pred_bayes,pred_bayes_proba
0,I dont know what to think about it,0,0.274998,0,0.382279
1,That was fucking awesome dawg!,1,0.745143,1,0.548629
2,Goddamn what a miracle!,1,0.589161,1,0.51605
3,Son of a bitch!,1,0.672329,1,0.548168
