In [1]:
# from nltk.corpus import stopwords 
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer, SnowballStemmer


# from sklearn.svm import LinearSVC
# from sklearn.linear_model import SGDClassifier
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.ensemble import RandomForestClassifier


# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier


# from sklearn.pipeline import Pipeline
# from sklearn import metrics
# from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
# from sklearn.model_selection import KFold, cross_val_score


In [2]:
import pandas as pd    
import numpy as np
from scipy.sparse import hstack
from bs4 import BeautifulSoup  
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



In [3]:
# Define function for text preprocessing
def text_cleaning(text):
    """Strip markup, @-mentions and non-letter characters from one tweet.

    Parameters
    ----------
    text : str
        Raw tweet text, possibly containing HTML.

    Returns
    -------
    str
        The text with HTML tags dropped, ``@word`` tokens deleted and every
        character outside ``a-z`` / ``A-Z`` replaced by a single space.
        Case is preserved and runs of spaces are not collapsed.

    Notes
    -----
    Hashtag removal, lower-casing, stop-word filtering, stemming and
    lemmatisation were experimented with here and are currently disabled.
    """
    # Parse with lxml and keep only the visible text (drops HTML tags).
    plain = BeautifulSoup(text, "lxml").get_text()
    # Delete @mentions entirely; hashtag words stay (only the '#' is blanked below).
    without_mentions = re.sub(r'\@\w+', "", plain)
    # Anything that is not an ASCII letter becomes a space.
    return re.sub("[^a-zA-Z]", " ", without_mentions)

def text_preprocess(X):
    """Apply ``text_cleaning`` to every value of a pandas Series.

    Parameters
    ----------
    X : pandas.Series
        Series of raw tweet strings (any mapping-like object exposing
        ``.items()`` yielding ``(key, value)`` pairs also works).

    Returns
    -------
    list of str
        Cleaned texts, in the same order as the values of ``X``.
    """
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (index, value) pairs and is available on old and new pandas alike.
    return [text_cleaning(val) for _, val in X.items()]

In [4]:
# Read the tab-separated corpus; the file has no header row, so column names
# are supplied here (sentiment label first, tweet text second).
print("Loading data ...")
df = pd.read_csv(
    "sentiment.tsv",
    sep="\t",
    header=None,
    names=["sentiment", "tweet"],
    quoting=2,  # 2 == csv.QUOTE_NONNUMERIC
)
print("This data set contains %d observations" % len(df))
print()

# Features are the raw tweets; the target is the binarized sentiment label.
X = df["tweet"]
y = preprocessing.LabelBinarizer().fit_transform(df["sentiment"])

# The binarizer output is 2-D (unpacked as rows, columns); flatten it to the
# 1-D label vector used for model fitting.
c, r = y.shape
yvec = y.reshape((c,))


# text preprocessing
X_clean = text_preprocess(X)



Loading data ...
This data set contains 2001 observations



In [5]:
# create 3-dim feature vector using NLTK VADER Sentiment Intensity Analyzer
# The analyzer needs the "vader_lexicon" NLTK resource. On a fresh environment
# the constructor raises LookupError, so fetch the lexicon once and retry
# instead of failing the whole notebook run.
try:
    vader = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    vader = SentimentIntensityAnalyzer()


def vader_polarity(text):
    """Return the VADER negative / neutral / positive scores for ``text``.

    Parameters
    ----------
    text : str
        Text to score with the module-level ``vader`` analyzer.

    Returns
    -------
    numpy.ndarray
        1-D array of length 3 holding ``[neg, neu, pos]`` in that order.
        The ``compound`` score is not included.
    """
    score = vader.polarity_scores(text)
    return np.array([score["neg"], score["neu"], score["pos"]])

# VADER scores are computed on the raw tweets (X), not on the cleaned text.
Xvader = list(map(vader_polarity, X))


# Bag-of-words counts over the cleaned tweets, vocabulary capped at 5000 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
Xcountvec = vectorizer.fit_transform(X_clean)


# Re-weight the raw counts into a tf-idf representation.
tfidf = TfidfTransformer()
Xtfidf = tfidf.fit_transform(Xcountvec)


# Final design matrix: tf-idf columns followed by the three VADER columns.
Xvec = hstack((Xtfidf, Xvader))

In [9]:
# Exhaustive search over SVC regularisation strength and kernel type, scored by
# ROC AUC under 10-fold cross validation, using all available cores.
print("Training SVC and finding the best parameter set ... ")
parameters = {
    "C": [0.1, 0.5, 1, 3, 5],
    "kernel": ["rbf", "poly", "linear", "sigmoid"],
}
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    scoring="roc_auc",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(Xvec, yvec)

# Report the winning configuration first ...
print()
print("Best parameters set :")
print(grid_search.best_params_)
print()
print("Best score: %0.3f" % grid_search.best_score_)

# ... then one line per candidate: mean test score and twice its std across folds.
print("Grid scores :")
print()
cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for params, mean, std in zip(cv_results['params'], means, stds):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
print()

Training SVC and finding the best parameter set ... 
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.1min finished



Best parameters set :
{'C': 0.5, 'kernel': 'linear'}

Best score: 0.806
Grid scores :

0.768 (+/-0.065) for {'C': 0.1, 'kernel': 'rbf'}
0.765 (+/-0.061) for {'C': 0.1, 'kernel': 'poly'}
0.791 (+/-0.070) for {'C': 0.1, 'kernel': 'linear'}
0.768 (+/-0.065) for {'C': 0.1, 'kernel': 'sigmoid'}
0.768 (+/-0.066) for {'C': 0.5, 'kernel': 'rbf'}
0.765 (+/-0.061) for {'C': 0.5, 'kernel': 'poly'}
0.806 (+/-0.062) for {'C': 0.5, 'kernel': 'linear'}
0.768 (+/-0.066) for {'C': 0.5, 'kernel': 'sigmoid'}
0.768 (+/-0.065) for {'C': 1, 'kernel': 'rbf'}
0.765 (+/-0.061) for {'C': 1, 'kernel': 'poly'}
0.800 (+/-0.070) for {'C': 1, 'kernel': 'linear'}
0.768 (+/-0.066) for {'C': 1, 'kernel': 'sigmoid'}
0.768 (+/-0.065) for {'C': 3, 'kernel': 'rbf'}
0.765 (+/-0.061) for {'C': 3, 'kernel': 'poly'}
0.776 (+/-0.068) for {'C': 3, 'kernel': 'linear'}
0.768 (+/-0.065) for {'C': 3, 'kernel': 'sigmoid'}
0.768 (+/-0.065) for {'C': 5, 'kernel': 'rbf'}
0.765 (+/-0.061) for {'C': 5, 'kernel': 'poly'}
0.765 (+/-0.066) 

In [11]:
# Sanity check of the combined feature matrix: (n_samples, n_tfidf_terms + 3 VADER scores)
Xvec.shape

(2001, 4864)