In [1]:
import pandas as pd    
import numpy as np
from scipy.sparse import hstack
from bs4 import BeautifulSoup  
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



In [2]:
# Define function for text preprocessing
def text_cleaning(text):
    """Reduce one raw tweet to ASCII letters and spaces.

    The text goes through three stages, in this order:

    1. HTML markup is dropped by parsing with BeautifulSoup (``lxml``
       parser) and keeping only the extracted text.
    2. ``@`` followed by word characters (a user mention) is deleted.
    3. Every remaining character that is not ``a-z`` / ``A-Z`` is replaced
       by a single space, so digits, punctuation and non-ASCII characters
       all become spaces. Runs of spaces are not collapsed and the case
       of the letters is left untouched.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML and @mentions.

    Returns
    -------
    str
        The cleaned text.
    """
    without_html = BeautifulSoup(text, "lxml").get_text()
    without_mentions = re.sub(r'\@\w+', "", without_html)
    return re.sub("[^a-zA-Z]", " ", without_mentions)

def text_preprocess(X):
    """Apply ``text_cleaning`` to every text in ``X``.

    Parameters
    ----------
    X : pandas.Series or iterable of str
        The raw texts. Iterating over ``X`` must yield the texts
        themselves (true for a Series, list or tuple).

    Returns
    -------
    list of str
        The cleaned texts, in the same order as in ``X``.
    """
    # Iterate over the values directly instead of calling
    # ``Series.iteritems()``: that method was deprecated in pandas 1.5 and
    # removed in pandas 2.0, and the index it yielded was never used here.
    return [text_cleaning(val) for val in X]


# Read the labelled tweets: a headerless, tab-separated file whose two
# columns are named "sentiment" and "tweet" here.
print("Loading data ...")
df = pd.read_csv(
    "sentiment.tsv",
    sep="\t",
    header=None,
    names=["sentiment", "tweet"],
    quoting=2,  # numeric value of csv.QUOTE_NONNUMERIC
)
print("This data set contains %d observations" % len(df))
print()

# Binarize the labels. fit_transform returns a 2-D array, which is then
# flattened to a 1-D vector of length c; the reshape only succeeds when the
# binarized output has a single column (r == 1).
y = preprocessing.LabelBinarizer().fit_transform(df["sentiment"])
c, r = y.shape
yvec = y.reshape(c)

# Raw tweets and their cleaned counterparts
X = df["tweet"]
X_clean = text_preprocess(X)


Loading data ...
This data set contains 2001 observations



In [3]:
# VADER + SVC

# Turn every tweet into a 3-dimensional VADER polarity feature vector
print("****** VADER + SVC ******")
vader = SentimentIntensityAnalyzer()


def vader_polarity(text):
    """Return the VADER polarity scores of ``text`` as a feature vector.

    Parameters
    ----------
    text : str
        The text to score with the module-level ``vader`` analyzer.

    Returns
    -------
    numpy.ndarray
        Array of length 3 holding the ``"neg"``, ``"neu"`` and ``"pos"``
        scores, in that order. Any other entry of the score dict is not
        used.
    """
    score = vader.polarity_scores(text)
    return np.array([score[key] for key in ("neg", "neu", "pos")])


# Note: the features are computed from the raw tweets in X, not from X_clean.
Xvec = list(map(vader_polarity, X))


# Grid search on SVC: 5 values of C x 4 kernels = 20 candidates,
# each scored by ROC AUC with 10-fold cross-validation.
parameters = {
    "C": [0.1, 0.5, 1, 3, 5],
    "kernel": ["rbf", "poly", "linear", "sigmoid"],
}
grid_search = GridSearchCV(
    SVC(),
    parameters,
    cv=10,
    scoring="roc_auc",
    n_jobs=-1,  # use all available cores
    verbose=1,
)
grid_search.fit(Xvec, yvec)

# Report the winning candidate ...
print("Best parameters set :")
print(grid_search.best_params_)
print("Best score: %0.3f" % grid_search.best_score_)

# ... followed by every candidate as "mean (+/- two standard deviations)".
print("Grid scores :")
cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))
print()


****** VADER + SVC ******
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   16.7s


Best parameters set :
{'C': 0.1, 'kernel': 'poly'}
Best score: 0.747
Grid scores :
0.745 (+/-0.072) for {'C': 0.1, 'kernel': 'rbf'}
0.747 (+/-0.076) for {'C': 0.1, 'kernel': 'poly'}
0.745 (+/-0.075) for {'C': 0.1, 'kernel': 'linear'}
0.738 (+/-0.068) for {'C': 0.1, 'kernel': 'sigmoid'}
0.716 (+/-0.093) for {'C': 0.5, 'kernel': 'rbf'}
0.747 (+/-0.075) for {'C': 0.5, 'kernel': 'poly'}
0.676 (+/-0.131) for {'C': 0.5, 'kernel': 'linear'}
0.744 (+/-0.075) for {'C': 0.5, 'kernel': 'sigmoid'}
0.731 (+/-0.077) for {'C': 1, 'kernel': 'rbf'}
0.747 (+/-0.075) for {'C': 1, 'kernel': 'poly'}
0.663 (+/-0.099) for {'C': 1, 'kernel': 'linear'}
0.743 (+/-0.077) for {'C': 1, 'kernel': 'sigmoid'}
0.724 (+/-0.063) for {'C': 3, 'kernel': 'rbf'}
0.742 (+/-0.072) for {'C': 3, 'kernel': 'poly'}
0.671 (+/-0.128) for {'C': 3, 'kernel': 'linear'}
0.742 (+/-0.077) for {'C': 3, 'kernel': 'sigmoid'}
0.725 (+/-0.067) for {'C': 5, 'kernel': 'rbf'}
0.736 (+/-0.073) for {'C': 5, 'kernel': 'poly'}
0.681 (+/-0.131) for {'C': 5, 'kernel': 'linear'}

[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   17.3s finished


In [4]:
# Grid search on CountVectorizer + MultinomialNB

print("****** CountVectorizer + MultinomialNB ******")
clf = Pipeline([
    ("vect", CountVectorizer()),
    ("clf", MultinomialNB()),
])
parameters = {
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    "clf__alpha": [0.1, 1, 3, 10],
    # Further vectorizer candidates, currently left out of the search:
    #   "vect__max_df": (0.5, 0.75, 1.0),
    #   "vect__max_features": (None, 1000, 4000),
    #   "vect__stop_words": (None, "english"),
}
grid_search = GridSearchCV(
    clf,
    parameters,
    cv=10,
    scoring="roc_auc",
    n_jobs=-1,  # use all available cores
    verbose=1,
)
# The pipeline vectorizes the text itself, so it is fitted on the cleaned tweets.
grid_search.fit(X_clean, yvec)

# Report the winning candidate ...
print("Best parameters set :")
print(grid_search.best_params_)
print("Best score: %0.3f" % grid_search.best_score_)

# ... followed by every candidate as "mean (+/- two standard deviations)".
print("Grid scores :")
cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

****** CountVectorizer + MultinomialNB ******
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.4s


Best parameters set :
{'clf__alpha': 3, 'vect__ngram_range': (1, 1)}
Best score: 0.766
Grid scores :
0.731 (+/-0.088) for {'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.721 (+/-0.073) for {'clf__alpha': 0.1, 'vect__ngram_range': (1, 2)}
0.762 (+/-0.073) for {'clf__alpha': 1, 'vect__ngram_range': (1, 1)}
0.754 (+/-0.067) for {'clf__alpha': 1, 'vect__ngram_range': (1, 2)}
0.766 (+/-0.071) for {'clf__alpha': 3, 'vect__ngram_range': (1, 1)}
0.761 (+/-0.068) for {'clf__alpha': 3, 'vect__ngram_range': (1, 2)}
0.760 (+/-0.069) for {'clf__alpha': 10, 'vect__ngram_range': (1, 1)}
0.756 (+/-0.071) for {'clf__alpha': 10, 'vect__ngram_range': (1, 2)}


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    6.5s finished


In [5]:
# Grid search on CountVectorizer + TfidfTransformer + MultinomialNB

print("****** CountVectorizer + TfidfTransformer + MultinomialNB ******")
clf = Pipeline([
    ("vect", CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ("clf", MultinomialNB()),
])
parameters = {
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    "clf__alpha": [0.1, 1, 3, 10],
    # Further vectorizer candidates, currently left out of the search:
    #   "vect__max_df": (0.5, 0.75, 1.0),
    #   "vect__max_features": (None, 1000, 4000),
    #   "vect__stop_words": (None, "english"),
}
grid_search = GridSearchCV(
    clf,
    parameters,
    cv=10,
    scoring="roc_auc",
    n_jobs=-1,  # use all available cores
    verbose=1,
)
# The pipeline vectorizes the text itself, so it is fitted on the cleaned tweets.
grid_search.fit(X_clean, yvec)

# Report the winning candidate ...
print("Best parameters set :")
print(grid_search.best_params_)
print("Best score: %0.3f" % grid_search.best_score_)

# ... followed by every candidate as "mean (+/- two standard deviations)".
print("Grid scores :")
cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

****** CountVectorizer + TfidfTransformer + MultinomialNB ******
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.4s


Best parameters set :
{'clf__alpha': 3, 'vect__ngram_range': (1, 1)}
Best score: 0.779
Grid scores :
0.742 (+/-0.083) for {'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.741 (+/-0.073) for {'clf__alpha': 0.1, 'vect__ngram_range': (1, 2)}
0.775 (+/-0.063) for {'clf__alpha': 1, 'vect__ngram_range': (1, 1)}
0.772 (+/-0.062) for {'clf__alpha': 1, 'vect__ngram_range': (1, 2)}
0.779 (+/-0.057) for {'clf__alpha': 3, 'vect__ngram_range': (1, 1)}
0.776 (+/-0.055) for {'clf__alpha': 3, 'vect__ngram_range': (1, 2)}
0.776 (+/-0.052) for {'clf__alpha': 10, 'vect__ngram_range': (1, 1)}
0.771 (+/-0.051) for {'clf__alpha': 10, 'vect__ngram_range': (1, 2)}


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    6.5s finished
