In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup  
import re
import nltk
import nltk.data
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from gensim.models import word2vec
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

In [2]:
# Define a function for text preprocessing
def text_cleaning(text, remove_stopwords=False):
    """Convert raw text into a list of lowercase word tokens.

    The text is passed through BeautifulSoup to strip markup, @mentions are
    deleted, every non-letter character is replaced by a space, and the
    result is lowercased and split on whitespace.

    Args:
        text: Raw text (for example a tweet or a single sentence).
        remove_stopwords: When True, tokens found in NLTK's English
            stop-word list are dropped.

    Returns:
        list of str: The cleaned tokens, in their original order.
    """
    plain = BeautifulSoup(text, "lxml").get_text()
    without_mentions = re.sub(r'\@\w+', "", plain)
    letters_only = re.sub("[^a-zA-Z]", " ", without_mentions)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    # A set makes the per-token membership test cheap.
    stop_words = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_words]


# Define a function to split a tweet into parsed sentences using NLTK's punkt tokenizer
# note: Word2Vec takes a list of sentences as input
def tweet_to_sentences(tweet, tokenizer, remove_stopwords=False):
    """Split a tweet into sentences and clean each one into a word list.

    Args:
        tweet: Raw tweet text; surrounding whitespace is stripped before
            sentence splitting.
        tokenizer: Object whose ``tokenize(str)`` method returns the
            sentence strings of the text.
        remove_stopwords: Forwarded to ``text_cleaning``.

    Returns:
        list of list of str: One token list per non-empty sentence.
    """
    return [
        text_cleaning(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(tweet.strip())
        if len(chunk) > 0
    ]


# Define functions to compute average feature vector
def makeFeatureVec(tweet, model, num_features):
    """Average the Word2Vec vectors of a tweet's in-vocabulary words.

    Args:
        tweet: Iterable of word tokens.
        model: Trained Word2Vec model. Its ``wv`` attribute supplies both the
            vocabulary (``wv.index2word``) and the vectors (``wv[word]``).
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray of shape ``(num_features,)``: the element-wise mean of
        the vectors of all tokens found in the vocabulary. If no token is in
        the vocabulary the result is all-NaN. This is the same value the
        former 0/0 division produced, so callers that impute NaN rows keep
        working, but it is now returned explicitly instead of through a
        divide-by-zero RuntimeWarning.
    """
    # index2word is the vocabulary list of the Word2Vec model; a set gives
    # constant-time membership tests.
    vocabulary = set(model.wv.index2word)
    known_words = [word for word in tweet if word in vocabulary]

    if not known_words:
        return np.full((num_features,), np.nan, dtype="float32")

    featureVec = np.zeros((num_features,), dtype="float32")
    for word in known_words:
        # Look vectors up through model.wv, consistent with the vocabulary
        # lookup above; indexing the model object directly is deprecated
        # in gensim.
        featureVec = np.add(featureVec, model.wv[word])
    return np.divide(featureVec, float(len(known_words)))

def getAvgFeatureVecs(tweets, model, num_features):
    """Build the matrix of average feature vectors, one row per tweet.

    Args:
        tweets: Sized sequence of tokenized tweets (each an iterable of words).
        model: Trained Word2Vec model, passed through to ``makeFeatureVec``.
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray of shape ``(len(tweets), num_features)``, dtype float32;
        row ``i`` holds ``makeFeatureVec(tweets[i], model, num_features)``.
    """
    featureVecs = np.zeros((len(tweets), num_features), dtype="float32")
    # enumerate yields integer row indices; the previous float counter
    # (0., 1., ...) is not a valid NumPy index and raises IndexError on
    # current NumPy versions.
    for row, tweet in enumerate(tweets):
        featureVecs[row] = makeFeatureVec(tweet, model, num_features)
    return featureVecs


In [10]:
# ---- Load data ----
print("Loading data ...")
df = pd.read_csv(
    "sentiment.tsv",
    header=None,
    names=["sentiment", "tweet"],
    delimiter="\t",
    quoting=2,
)
print("This data set contains %d observations" % df.shape[0])
print()

X = df["tweet"]
y = df["sentiment"]
# Binarize the labels; fit_transform returns a 2-D array, which is flattened
# into the 1-D vector the classifier is later fitted on.
y = preprocessing.LabelBinarizer().fit_transform(y)
c, r = y.shape
yvec = y.reshape(c,)


# ---- Parse tweets into sentences ----
print("Parsing sentences ...")
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = []
for tweet in X:
    # Each tweet contributes one token list per sentence.
    sentences.extend(tweet_to_sentences(tweet, tokenizer))
print()


# ---- Train Word2Vec ----
num_features = 100      # passed as `size`
min_word_count = 10     # passed as `min_count`
num_workers = 4         # passed as `workers`
context = 10            # passed as `window`
downsampling = 1e-3     # passed as `sample`
print("Training Word2Vec model ...\n")
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
model.init_sims(replace=True)
# NOTE(review): the file name advertises 300 features / 40 min words, but the
# settings above are 100 features / 10 min words. Name kept unchanged so any
# existing loaders still find the file.
model.save("300features_40minwords_10context")
# To reuse a previously trained model instead of retraining:
# model = Word2Vec.load("300features_40minwords_10context")
print()


# ---- Compute average feature vectors ----
print("Creating average feature vectors ...\n")
clean_X = [text_cleaning(tweet, remove_stopwords=True) for tweet in X]
Xvec = getAvgFeatureVecs(clean_X, model, num_features)
print()


# ---- Check for NaN / non-finite feature values ----
print("Any nan or infinite values of feature vectors?")
nan_mask = np.isnan(Xvec)
print(np.any(nan_mask))             # True when at least one entry is NaN
print(np.all(np.isfinite(Xvec)))    # True only when every entry is finite


# ---- Impute NaN values ----
# Every NaN entry is replaced with the median of all non-NaN entries of the
# whole matrix. NOTE(review): infinite values, if any, are left untouched.
Xvec[nan_mask] = np.median(Xvec[~nan_mask])

Loading data ...
This data set contains 2001 observations

Parsing sentences ...


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup



Training Word2Vec model ...


Creating average feature vectors ...


Any nan or infinite values of feature vectors? 

True
False




In [8]:
# Grid search on Random Forest
print("Training Random Forest and finding the best parameter set ... ")

# Fixed seed so the forest, and therefore the reported scores, are the same
# on every run.
RANDOM_STATE = 42

# Hyper-parameter grid: every combination below is evaluated.
parameters_RF = {
    "n_estimators": [10, 100],
    "max_depth": [10, None],
}

print("Performing grid search...")
print()
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE),
    parameters_RF,
    cv=10,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(Xvec, yvec)

print("Best parameters set :")
print(grid_search.best_params_)
print("Best score: %0.3f" % grid_search.best_score_)

# Per-candidate summary: mean test score with a +/- two-standard-deviation band.
print("Grid scores :")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))

Training Random Forest and finding the best parameter set ... 
Performing grid search...

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   11.2s finished


Best parameters set :
{'max_depth': 10, 'n_estimators': 100}
Best score: 0.661
Grid scores :
0.597 (+/-0.095) for {'max_depth': 10, 'n_estimators': 10}
0.661 (+/-0.122) for {'max_depth': 10, 'n_estimators': 100}
0.584 (+/-0.070) for {'max_depth': None, 'n_estimators': 10}
0.648 (+/-0.111) for {'max_depth': None, 'n_estimators': 100}
