# A simple sentiment prototype

In [1]:
import os  # manipulate paths
import pandas as pd  # SQL-like operations and convenience functions
import joblib  # save and load models
import numpy as np

Download the Sentiment140 data from [their website](http://help.sentiment140.com/for-students) and set `DATA_DIR` to the directory in which you have put the `CSV` files.

In [2]:
# Directory holding the downloaded Sentiment140 CSV files (relative to the notebook).
DATA_DIR = "data"
# Directory the trained pipeline is saved to and loaded from.
MODEL_DIR = "models"

In [3]:
# Full path of the 1.6M-tweet training file inside DATA_DIR.
training_csv_file = os.path.join(DATA_DIR, 'training.1600000.processed.noemoticon.csv')

## A peek at the data

In [4]:
# Column names are supplied explicitly, so the first line of the file is read as data.
names = ('polarity', 'id', 'date', 'query', 'author', 'text')
df = pd.read_csv(
    training_csv_file,
    names=names,
    encoding='latin1',
)

In [5]:
# Widen the column display so whole tweets are visible, then preview the first 5 rows.
pd.set_option('display.max_colwidth', 140)
df.head()

Unnamed: 0,polarity,id,date,query,author,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [6]:
# Distinct label values present in the polarity column.
set(df['polarity'])

{0, 4}

In [7]:
# Remap the two labels found above (0 and 4) to -1 and +1.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['polarity']`: that form mutates a
# temporary column object and, with pandas copy-on-write, leaves `df` unchanged.
# Re-running this cell is harmless because -1 and 1 are not keys of the mapping.
df['polarity'] = df['polarity'].replace({0: -1, 4: 1})
text = df['text']
target = df['polarity'].values

In [8]:
# Sanity check: there must be exactly one label per tweet.
print(len(target), len(text))

1600000 1600000


In [9]:
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer

In [10]:
tok = WordPunctTokenizer()
# @mentions: the character class includes "_" so that handles such as
# "@_TheSpecialOne_" are removed in full rather than left behind as words.
pat1 = r'@[A-Za-z0-9_]+'
# http:// and https:// links.
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
    """Normalise one raw tweet into a lowercase, space-separated string of letters.

    Steps, in order:
      1. parse the tweet as HTML with BeautifulSoup and keep only its text;
      2. remove @mentions and http(s) links (``combined_pat``);
      3. replace every character that is not an ASCII letter with a space;
      4. lowercase, tokenize with ``tok`` and re-join the tokens with single spaces.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned tweet; an empty string if no letters remain.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    stripped = re.sub(combined_pat, '', souped)
    # No separate BOM / U+FFFD handling is needed: the substitution below
    # turns every non-letter, including those characters, into a space.
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    words = tok.tokenize(letters_only.lower())
    return " ".join(words).strip()

In [11]:
# Clean every tweet, keeping the same order as `text`.
cleaned_text = [tweet_cleaner(raw_tweet) for raw_tweet in text]

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


In [12]:
# Sanity check: one cleaned tweet per input row.
len(cleaned_text)

1600000

In [13]:
# Should equal the number of cleaned tweets above.
len(target)

1600000

## Train the model

Set 20% of the data aside as a validation set for testing the trained model.

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
# NOTE(review): this splits the raw `text` series, not the `cleaned_text` list
# built earlier in the notebook -- confirm that training on raw tweets is intended.
text_train, text_validation, target_train, target_validation = train_test_split(
    text,
    target,
    test_size=0.2,
    random_state=128,
)

Build a pipeline

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Bag of n-grams: counts of 1- to 4-word sequences, capped at 120,000 features.
vectorizer = CountVectorizer(ngram_range=(1, 4), max_features=120000)
# Keep the 5,000 features that score highest on the chi-squared test against the label.
feature_selector = SelectKBest(chi2, k=5000)
# Logistic regression with built-in cross-validation; n_jobs=-1 uses all CPU cores.
classifier = LogisticRegressionCV(n_jobs=-1)
# Alternative classifier, left disabled:
# classifier = MultinomialNB()

This next cell took ~3 minutes to run on my machine

In [32]:
# Chain vectorizer -> feature selector -> classifier into one estimator.
# `steps` is passed as a list, the type documented by scikit-learn, rather
# than a tuple. The step names 'v', 'f', 'c' are unchanged, so lookups by
# name or position keep working.
sentiment_pipeline = Pipeline([
    ('v', vectorizer),
    ('f', feature_selector),
    ('c', classifier),
])
# Fit on the training split only; the fitted pipeline is displayed as the cell output.
sentiment_pipeline.fit(text_train, target_train)



Pipeline(memory=None,
     steps=[('v', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=120000, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [33]:
# Create the output directory first: joblib.dump fails if MODEL_DIR does not
# exist, which is the case on a fresh checkout.
os.makedirs(MODEL_DIR, exist_ok=True)
# Persist the fitted pipeline; the list of written file names is the cell output.
joblib.dump(sentiment_pipeline, os.path.join(MODEL_DIR, 'model.pkl'))

['models\\model.pkl']

## Test the model

In [20]:
# Reload the saved pipeline when it is on disk; otherwise only report that it is missing.
# joblib files are pickle-based, so load only model files you trust.
if not os.path.exists(os.path.join(MODEL_DIR, 'model.pkl')):
    print("Model Not Found")
else:
    sentiment_pipeline = joblib.load(os.path.join(MODEL_DIR, 'model.pkl'))

In [34]:
# Smoke test on a few hand-written phrases; labels follow the -1 / 1 mapping set earlier.
print(sentiment_pipeline.predict(['bad', 'good', "didnt like", "today was a good day", "i hate this product"]))

[-1  1 -1  1 -1]


In [35]:
# Show predicted label, true label and tweet for the first ten validation rows.
# The ten predictions are made in one batched call.
for pred_label, pred_target, pred_text in zip(
    sentiment_pipeline.predict(text_validation[:10]),
    target_validation[:10],
    text_validation[:10],
):
    print(pred_label, pred_target, '\t', pred_text)

1 -1 	 @ayatoshirosan ...later to say that you just couldn't fall asleep.  &lt;3 &lt;3 &lt;3 So I hope you have now managed to do so, Darling, and that..
-1 1 	 i miss u too, dad.. 
-1 -1 	 Earthquakes to Host FC Barcelona on Saturday, August 8 at Candlestick - Unfortunately it likely conflicts with Mustang Tournament 
1 1 	 @TotallyQueer81 Hehe. Well I can't figure anything out so im off to bed. Have a good one and ttyl. 
1 -1 	 freaking out!!!! pray for my grandma  let her be okay I love my vavo &lt;3: freaking out!!!! pray for my grand.. http://bit.ly/gHMhB
-1 -1 	 @timseppala They said &quot;next year&quot; so not this fall. No specific release date yet  Still can't wait though!
1 1 	 It's awesome that all those who participated in my 100 songs meme are doing their own thing. No 2 blog posts have the exact same premise. 
-1 1 	 @aravindkumar yeah sadly yes we have power 
-1 -1 	 finally got the nerve to call her ex shes sooo pathetic her heart was going so fast she was ready to sur

In [36]:
# Score of the fitted pipeline on the held-out validation split
# (presumably mean accuracy, the classifier's default scorer -- confirm).
sentiment_pipeline.score(text_validation, target_validation)

0.796696875

## What did the model learn?

In [37]:
# Recover the n-gram names of the features that survived feature selection.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so prefer the new method and fall back to the old one
# on older installations.
fitted_vectorizer = sentiment_pipeline.steps[0][1]
if hasattr(fitted_vectorizer, 'get_feature_names_out'):
    feature_names = fitted_vectorizer.get_feature_names_out()
else:
    feature_names = fitted_vectorizer.get_feature_names()
# get_support(indices=True) gives the positions of the selected columns,
# in the same order as the classifier's coefficients.
feature_names = [feature_names[i] for i in
                 sentiment_pipeline.steps[1][1].get_support(indices=True)]

def show_most_informative_features(feature_names, clf, n=1000):
    """Print the n lowest and n highest coefficients of a linear model side by side.

    Parameters
    ----------
    feature_names : sequence of str
        Feature names, in the same order as the entries of ``clf.coef_[0]``.
    clf : object
        A fitted model exposing ``coef_``; only its first row is used.
    n : int, default 1000
        Number of rows to print.

    Each printed row shows a low coefficient and its feature on the left
    (ascending from the lowest) and a high coefficient and its feature on the
    right (descending from the highest).
    """
    ranked = list(zip(clf.coef_[0], feature_names))
    ranked.sort()
    lowest = ranked[:n]
    highest = ranked[:-(n + 1):-1]  # last n entries, largest first
    for low_pair, high_pair in zip(lowest, highest):
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (low_pair + high_pair))

In [38]:
# Print the 50 lowest (left) and 50 highest (right) coefficients of the fitted classifier.
show_most_informative_features(feature_names, sentiment_pipeline.steps[2][1], n=50)

	-3.5105	not happy      		2.6210	no problem     
	-3.2820	clean me       		2.5297	no worries     
	-2.9564	passed away    		2.4885	cannot wait    
	-2.9365	inaperfectworld		2.4812	cant wait      
	-2.8835	sad            		2.2500	smiling        
	-2.7266	disappointing  		2.2291	not bad        
	-2.6581	not nice       		2.1047	no prob        
	-2.6330	not cool       		1.9884	congratulations
	-2.6281	no luck        		1.9640	sad sad        
	-2.6106	sadly          		1.7446	welcome        
	-2.6084	gutted         		1.7393	woooo          
	-2.5913	heartbroken    		1.7078	just sayin     
	-2.5290	condolences    		1.6788	smile          
	-2.4938	what wrong     		1.6287	yayyy          
	-2.4843	rip            		1.6099	same to you    
	-2.4540	bummed         		1.6081	thankyou       
	-2.4465	boohoo         		1.6078	proud          
	-2.3986	heartbreaking  		1.6062	blessings      
	-2.3905	saddened       		1.5804	followfriday   
	-2.3898	depressed      		1.5699	my pleasure    
	-2.3844	not fun    