In [1]:
import pandas as pd    
import numpy as np
from scipy.sparse import hstack
from bs4 import BeautifulSoup  
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV



In [2]:
# Define function for text preprocessing
def text_cleaning(text):
    """Reduce one raw tweet to letters and spaces.

    The steps run in a fixed order: markup is parsed away first, then
    ``@``-prefixed word tokens are deleted, and finally every character
    outside ``a-z`` / ``A-Z`` is replaced by a single space (so digits,
    punctuation and non-ASCII characters all become spaces).

    Parameters
    ----------
    text : str
        Raw tweet text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned text; case is preserved and runs of spaces are not collapsed.
    """
    # Parse with the lxml backend and keep only the text nodes.
    plain = BeautifulSoup(text, "lxml").get_text()
    # Delete @tags outright (no replacement character is inserted).
    without_tags = re.sub(r'\@\w+', "", plain)
    # Anything that is not an ASCII letter becomes a space.
    return re.sub("[^a-zA-Z]", " ", without_tags)

def text_preprocess(X):
    """Apply ``text_cleaning`` to every text in ``X``.

    Parameters
    ----------
    X : pandas.Series or sequence of str
        The raw texts. Objects exposing ``.items()`` (such as a Series) are
        walked through their ``(label, value)`` pairs; anything else is
        iterated directly.

    Returns
    -------
    list
        One cleaned text per input element, in iteration order.
    """
    # Series.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
    # .items() yields the same (label, value) pairs on old and new versions.
    pairs = X.items() if hasattr(X, "items") else enumerate(X)
    return [text_cleaning(val) for _, val in pairs]


# Load the corpus: a header-less, tab-separated file whose two columns are
# named here as the sentiment label and the tweet text.
print("Loading data ...")
df = pd.read_csv(
    "sentiment.tsv",
    sep="\t",
    header=None,
    names=["sentiment", "tweet"],
    quoting=2,  # numeric value of csv.QUOTE_NONNUMERIC
)
print("This data set contains %d observations" % len(df.index))
print()

X = df["tweet"]

# Encode the labels. fit_transform returns a 2-D array; its shape is unpacked
# into (c, r) and the array is flattened to length c, which only succeeds when
# there is a single column.
# NOTE(review): presumably the data has exactly two classes - confirm.
y = preprocessing.LabelBinarizer().fit_transform(df["sentiment"])
c, r = y.shape
yvec = y.reshape((c,))


# Clean every tweet (markup, @tags and non-letters removed)
X_clean = text_preprocess(X)


Loading data ...
This data set contains 2001 observations



In [3]:
# Shared NLTK VADER analyzer; vader_polarity() below reads this module-level
# object to build a 3-dimensional (neg, neu, pos) feature vector per text.
# NOTE(review): presumably the VADER lexicon resource must already be available
# to NLTK for this constructor to succeed - confirm on a fresh environment.
vader = SentimentIntensityAnalyzer()

def vader_polarity(text):
    """Score one text with the module-level ``vader`` analyzer.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    numpy.ndarray
        1-D array of length 3 holding the "neg", "neu" and "pos" scores,
        in that order.
    """
    scores = vader.polarity_scores(text)
    # The "compound" score is intentionally left out of the feature vector.
    return np.array([scores[key] for key in ("neg", "neu", "pos")])

# VADER features: one [neg, neu, pos] array per tweet, computed on the raw
# (uncleaned) text in X rather than on X_clean.
Xvader = list(map(vader_polarity, X))


# Word-level count vectors over the cleaned tweets, limited to 5000 features
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
Xcountvec = vectorizer.fit_transform(X_clean)


# Re-weight the count matrix to its tf-idf representation
# NOTE(review): both the vocabulary and the idf weights are fitted on every row
# before the cross-validation further down splits the data, so held-out folds
# influence the features - consider fitting them inside a Pipeline.
tfidf = TfidfTransformer()
Xtfidf = tfidf.fit_transform(Xcountvec)


# Final design matrix: tf-idf columns followed by the three VADER columns
Xvec = hstack((Xtfidf, Xvader))

In [4]:
# Fixed seed shared by all estimators so that repeated runs of this cell
# report the same cross-validated scores.
RANDOM_STATE = 42


def run_grid_search(title, estimator, parameters, X, y, show_parameters=False):
    """Run a 10-fold, ROC-AUC-scored grid search and print a report.

    Parameters
    ----------
    title : str
        Heading printed before the search starts.
    estimator : estimator object
        Unfitted estimator handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid handed to ``GridSearchCV``.
    X, y :
        Feature matrix and target vector passed to ``fit``.
    show_parameters : bool, default False
        When true, also print the grid itself before fitting.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    print(title)
    if show_parameters:
        print("Performing grid search...")
        print("Parameters:")
        print(parameters)
    search = GridSearchCV(estimator, parameters, cv=10, scoring="roc_auc", n_jobs=-1, verbose=1)
    search.fit(X, y)
    print("Best parameters set :")
    print(search.best_params_)
    print("Best score: %0.3f" % search.best_score_)
    print("Grid scores :")
    results = search.cv_results_
    # The reported spread is two standard deviations of the per-fold scores.
    for mean, std, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
        print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
    return search


parameters = {"C" : [0.1,0.5,1,3,5]}
grid_search = run_grid_search(
    "VADER + CountVectorizer + TfidfTransformer + LinearSVC",
    LinearSVC(random_state=RANDOM_STATE),
    parameters, Xvec, yvec,
)
print()


parameters = {"alpha" : [1e-5,1e-4,1e-3,1e-2,0.1]}
grid_search = run_grid_search(
    "VADER + CountVectorizer + TfidfTransformer + SGDClassifier  ",
    SGDClassifier(random_state=RANDOM_STATE),
    parameters, Xvec, yvec,
)
print()


parameters = {"n_estimators": [100,500,1000],
              "max_depth": [10, None],
              }
grid_search = run_grid_search(
    "VADER + CountVectorizer + TfidfTransformer + RandomForestClassifier *",
    RandomForestClassifier(random_state=RANDOM_STATE),
    parameters, Xvec, yvec,
    show_parameters=True,
)


VADER + CountVectorizer + TfidfTransformer + LinearSVC
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best parameters set :
{'C': 0.1}
Best score: 0.804
Grid scores :
0.804 (+/-0.059) for {'C': 0.1}
0.798 (+/-0.064) for {'C': 0.5}
0.790 (+/-0.066) for {'C': 1}
0.775 (+/-0.069) for {'C': 3}
0.768 (+/-0.070) for {'C': 5}

VADER + CountVectorizer + TfidfTransformer + SGDClassifier  
Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.4s finished


Best parameters set :
{'alpha': 0.001}
Best score: 0.806
Grid scores :
0.771 (+/-0.071) for {'alpha': 1e-05}
0.776 (+/-0.068) for {'alpha': 0.0001}
0.806 (+/-0.062) for {'alpha': 0.001}
0.778 (+/-0.067) for {'alpha': 0.01}
0.766 (+/-0.065) for {'alpha': 0.1}

VADER + CountVectorizer + TfidfTransformer + RandomForestClassifier *
Performing grid search...
Parameters:
{'n_estimators': [100, 500, 1000], 'max_depth': [10, None]}
Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.2min finished


Best parameters set :
{'max_depth': None, 'n_estimators': 1000}
Best score: 0.795
Grid scores :
0.781 (+/-0.053) for {'max_depth': 10, 'n_estimators': 100}
0.791 (+/-0.065) for {'max_depth': 10, 'n_estimators': 500}
0.791 (+/-0.061) for {'max_depth': 10, 'n_estimators': 1000}
0.792 (+/-0.060) for {'max_depth': None, 'n_estimators': 100}
0.795 (+/-0.060) for {'max_depth': None, 'n_estimators': 500}
0.795 (+/-0.062) for {'max_depth': None, 'n_estimators': 1000}
