In [1]:
import pandas as pd

df = pd.read_csv("data/sentiment140-subset.csv", nrows=30000)
df.head()

Unnamed: 0,polarity,text
0,0,@kconsidder You never tweet
1,0,Sick today coding from the couch.
2,1,"@ChargerJenn Thx for answering so quick,I was ..."
3,1,Wii fit says I've lost 10 pounds since last ti...
4,0,@MrKinetik Not a thing!!! I don't really have...


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [3]:
vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(df.text)
words_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
words_df.head()
joblib.dump(vectorizer, 'trained_models/DfFittedVectorizer.sav')

['DfFittedVectorizer.sav']

In [4]:
X = words_df
y = df.polarity

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import joblib

In [6]:
%%time
# Create and train a logistic regression
logreg = LogisticRegression(C=1e9, solver='lbfgs', max_iter=1000)
logreg.fit(X, y)

CPU times: user 33.3 s, sys: 3.53 s, total: 36.8 s
Wall time: 11.2 s


LogisticRegression(C=1000000000.0, max_iter=1000)

In [7]:
# Export trained model
joblib.dump(logreg, "trained_models/LogRegForSentimentAnalysis.sav")

['trained_models/LogRegForSentimentAnalysis.sav']

In [8]:
%%time
# Create and train a multinomial naive bayes classifier (MultinomialNB)
bayes = MultinomialNB()
bayes.fit(X, y)

CPU times: user 516 ms, sys: 178 ms, total: 694 ms
Wall time: 374 ms


MultinomialNB()

In [9]:
# Export trained model
joblib.dump(bayes, "trained_models/NaiBayesForSentimentAnalysis.sav")

['trained_models/NaiBayesForSentimentAnalysis.sav']

In [24]:
# Create some test data

pd.set_option("display.max_colwidth", 200)

unknown = pd.DataFrame({'content': [
    "I dont know what to think about it",
    "That was fucking awesome dawg!",
    "Goddamn what a miracle!",
    "Son of a bitch!"
]})
unknown

Unnamed: 0,content
0,I dont know what to think about it
1,That was fucking awesome dawg!
2,Goddamn what a miracle!
3,Son of a bitch!


In [11]:
print(vectorizer.get_feature_names())

['10', '100', '11', '12', '15', '1st', '20', '2day', '2nd', '30', 'able', 'about', 'account', 'actually', 'add', 'after', 'afternoon', 'again', 'ago', 'agree', 'ah', 'ahh', 'ahhh', 'air', 'album', 'all', 'almost', 'alone', 'already', 'alright', 'also', 'although', 'always', 'am', 'amazing', 'amp', 'an', 'and', 'annoying', 'another', 'any', 'anymore', 'anyone', 'anything', 'anyway', 'app', 'apparently', 'apple', 'appreciate', 'are', 'around', 'art', 'as', 'ask', 'asleep', 'ass', 'at', 'ate', 'aw', 'awake', 'awards', 'away', 'awesome', 'aww', 'awww', 'baby', 'back', 'bad', 'band', 'bbq', 'bday', 'be', 'beach', 'beautiful', 'because', 'bed', 'been', 'beer', 'before', 'behind', 'being', 'believe', 'best', 'bet', 'better', 'big', 'bike', 'birthday', 'bit', 'bitch', 'black', 'blip', 'blog', 'blue', 'body', 'boo', 'book', 'books', 'bored', 'boring', 'both', 'bought', 'bout', 'box', 'boy', 'boys', 'break', 'breakfast', 'bring', 'bro', 'broke', 'broken', 'brother', 'brothers', 'btw', 'bus', 'bu

In [25]:
# Put it through the vectoriser

# transform, not fit_transform, because we already learned all our words
unknown_vectors = vectorizer.transform(unknown.content)
unknown_words_df = pd.DataFrame(unknown_vectors.toarray(), columns=vectorizer.get_feature_names())
unknown_words_df.head()

Unnamed: 0,10,100,11,12,15,1st,20,2day,2nd,30,...,yesterday,yet,yo,you,young,your,yourself,youtube,yum,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
%%time
# Predict using all our models. 

# Logistic Regression predictions + probabilities
unknown['pred_logreg'] = logreg.predict(unknown_words_df)
unknown['pred_logreg_proba'] = logreg.predict_proba(unknown_words_df)[:,1]

# Bayes predictions + probabilities
unknown['pred_bayes'] = bayes.predict(unknown_words_df)
unknown['pred_bayes_proba'] = bayes.predict_proba(unknown_words_df)[:,1]

CPU times: user 87.9 ms, sys: 3.47 ms, total: 91.4 ms
Wall time: 103 ms


In [27]:
unknown

Unnamed: 0,content,pred_logreg,pred_logreg_proba,pred_bayes,pred_bayes_proba
0,I dont know what to think about it,0,0.326024,0,0.420156
1,That was fucking awesome dawg!,1,0.685195,1,0.545361
2,Goddamn what a miracle!,1,0.509478,1,0.507385
3,Son of a bitch!,1,0.509502,0,0.404694
