In [1]:
import json
import re
import nltk

In [2]:
nltk.download('words')
nltk.download('omw-1.4')
nltk.download('sentiwordnet')
nltk.download('word2vec_sample')
nltk.download('opinion_lexicon')

[nltk_data] Error loading words: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading omw-1.4: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading sentiwordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading word2vec_sample: <urlopen error [Errno
[nltk_data]     11001] getaddrinfo failed>
[nltk_data] Error loading opinion_lexicon: <urlopen error [Errno
[nltk_data]     11001] getaddrinfo failed>


False

In [3]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [4]:
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
dictionary = set(nltk.corpus.words.words()) #To be used for MaxMatch

#Function to lemmatize word | Used during maxmatch
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

#Function to implement the maxmatch algorithm for multi-word hashtags
def maxmatch(word,dictionary):
    if not word:
        return []
    for i in range(len(word),1,-1):
        first = word[0:i]
        rem = word[i:]
        if lemmatize(first).lower() in dictionary: #Important to lowercase lemmatized words before comparing in dictionary. 
            return [first] + maxmatch(rem,dictionary)
    first = word[0:1]
    rem = word[1:]
    return [first] + maxmatch(rem,dictionary)

#Function to preprocess a single tweet
def preprocess(tweet):
    
    tweet = re.sub("@\w+","",tweet).strip()
    tweet = re.sub("http\S+","",tweet).strip()
    hashtags = re.findall("#\w+",tweet)
    
    tweet = tweet.lower()
    tweet = re.sub("#\w+","",tweet).strip() 
    
    hashtag_tokens = [] #Separate list for hashtags
    
    for hashtag in hashtags:
        hashtag_tokens.append(maxmatch(hashtag[1:],dictionary))        
    
    segmenter = nltk.data.load('tokenizers/punkt/english.pickle')
    segmented_sentences = segmenter.tokenize(tweet)
    
    #General tokenization
    processed_tweet = []
    
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    for sentence in segmented_sentences:
        tokenized_sentence = word_tokenizer.tokenize(sentence.strip())
        processed_tweet.append(tokenized_sentence)
    
    #Processing the hashtags only when they exist in a tweet
    if hashtag_tokens:
        for tag_token in hashtag_tokens:
            processed_tweet.append(tag_token)
    
    return processed_tweet

    
#Custom function that takes in a file, and passes each tweet to the preprocessor
def preprocess_file(filename):
    tweets = []
    labels = []
    f = open(filename)
    for line in f:
        tweet_dict = json.loads(line)
        tweets.append(preprocess(tweet_dict["text"]))
        labels.append(int(tweet_dict["label"]))
    return tweets,labels

In [5]:
maxmatch('wecan',dictionary)

['we', 'can']

In [6]:
maxmatch('casestudy',dictionary)

['cases', 'tu', 'd', 'y']

In [7]:
#Running the basic preprocessing module and capturing the data (maybe shift to the next block)
train_data = preprocess_file('training.json')
train_tweets = train_data[0]
train_labels = train_data[1]

In [8]:
print(train_tweets[:2])

[[['dear', 'the', 'newooffice', 'for', 'mac', 'is', 'great', 'and', 'all', ',', 'but', 'no', 'lync', 'update', '?'], ['c', "'", 'mon', '.']], [['how', 'about', 'you', 'make', 'a', 'system', 'that', 'doesn', "'", 't', 'eat', 'my', 'friggin', 'discs', '.'], ['this', 'is', 'the', '2nd', 'time', 'this', 'has', 'happened', 'and', 'i', 'am', 'so', 'sick', 'of', 'it', '!']]]


In [9]:
#Printing examples of multi-word hashtags (Doesn't work for multi sentence tweets)
f = open('training.json')
count = 1
for index,line in enumerate(f):
    if count >5:
        break
    original_tweet = json.loads(line)["text"]
    hashtags = re.findall("#\w+",original_tweet)
    if hashtags:
        for hashtag in hashtags:
            if len(maxmatch(hashtag[1:],dictionary)) > 1:
                #If the length of the array returned by the maxmatch function is greater than 1,
                #it means that the algorithm has detected a hashtag with more than 1 word inside. 
                print(str(count) + ". Original Tweet: " + original_tweet + "\nProcessed tweet: " + str(train_tweets[index]) + "\n")
                count += 1
                break

1. Original Tweet: If I make a game as a #windows10 Universal App. Will #xboxone owners be able to download and play it in November? @majornelson @Microsoft
Processed tweet: [['if', 'i', 'make', 'a', 'game', 'as', 'a', 'universal', 'app', '.'], ['will', 'owners', 'be', 'able', 'to', 'download', 'and', 'play', 'it', 'in', 'november', '?'], ['windows', '1', '0'], ['x', 'box', 'one']]

2. Original Tweet: Microsoft, I may not prefer your gaming branch of business. But, you do make a damn fine operating system. #Windows10 @Microsoft
Processed tweet: [['microsoft', ',', 'i', 'may', 'not', 'prefer', 'your', 'gaming', 'branch', 'of', 'business', '.'], ['but', ',', 'you', 'do', 'make', 'a', 'damn', 'fine', 'operating', 'system', '.'], ['Window', 's', '1', '0']]

3. Original Tweet: @MikeWolf1980 @Microsoft I will be downgrading and let #Windows10 be out for almost the 1st yr b4 trying it again. #Windows10fail
Processed tweet: [['i', 'will', 'be', 'downgrading', 'and', 'let', 'be', 'out', 'for', 

In [10]:
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))

#To identify words appearing less than n times, we're creating a dictionary for the whole training set

total_train_bow = {}

for tweet in train_tweets:
    for segment in tweet:
        for token in segment:
            total_train_bow[token] = total_train_bow.get(token,0) + 1

#Function to convert pre_processed tweets to bag of words feature dictionaries
#Allows for options to remove stopwords, and also to remove words occuring less than n times in the whole training set.            
def convert_to_feature_dicts(tweets,remove_stop_words,n): 
    feature_dicts = []
    for tweet in tweets:
        # build feature dictionary for tweet
        feature_dict = {}
        if remove_stop_words:
            for segment in tweet:
                for token in segment:
                    if token not in stopwords and (n<=0 or total_train_bow[token]>=n):
                        feature_dict[token] = feature_dict.get(token,0) + 1
        else:
            for segment in tweet:
                for token in segment:
                    if n<=0 or total_train_bow[token]>=n:
                        feature_dict[token] = feature_dict.get(token,0) + 1
        feature_dicts.append(feature_dict)
    return feature_dicts

In [11]:
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer()

#Conversion to feature dictionaries
train_set = convert_to_feature_dicts(train_tweets,True,2)

dev_data = preprocess_file('develop.json')

dev_set = convert_to_feature_dicts(dev_data[0],False,0)

#Conversion to sparse representations
training_data = vectorizer.fit_transform(train_set)

development_data = vectorizer.transform(dev_set)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

#Grid used to test the combinations of parameters
tree_param_grid = [
    {'criterion':['gini','entropy'], 'min_samples_leaf': [75,100,125,150,175], 'max_features':['sqrt','log2',None],
    }
]

tree_clf = GridSearchCV(DecisionTreeClassifier(),tree_param_grid,cv=10,scoring='accuracy')

tree_clf.fit(training_data,train_data[1])

print("Optimal parameters for DT: " + str(tree_clf.best_params_)) #To print out the best discovered combination of the parameters

tree_predictions = tree_clf.predict(development_data)

print("\nDecision Tree Accuracy: " + str(accuracy_score(dev_data[1],tree_predictions)))


Optimal parameters for DT: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 75}

Decision Tree Accuracy: 0.48004373974849646


In [13]:
from sklearn.dummy import DummyClassifier

#The dummy classifier below always predicts the most frequent class, as specified in the strategy. 
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(development_data,dev_data[1])
dummy_predictions = dummy_clf.predict(development_data)

print("\nMost common class baseline accuracy: " + str(accuracy_score(dev_data[1],dummy_predictions)))



Most common class baseline accuracy: 0.42044833242208857


In [14]:
from sklearn.linear_model import LogisticRegression

log_param_grid = [
    {'C':[0.012,0.0125,0.130,0.135,0.14],
     'solver':['lbfgs'],'multi_class':['multinomial']
    }
]

log_clf = GridSearchCV(LogisticRegression(max_iter=400),log_param_grid,cv=10,scoring='accuracy')

log_clf.fit(training_data,train_data[1])

log_predictions = log_clf.predict(development_data)

print("Optimal parameters for LR: " + str(log_clf.best_params_))

print("Logistic Regression Accuracy: " + str(accuracy_score(dev_data[1],log_predictions)))


Optimal parameters for LR: {'C': 0.012, 'multi_class': 'multinomial', 'solver': 'lbfgs'}
Logistic Regression Accuracy: 0.4931656642974303


In [17]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import random

swn_positive = []

swn_negative = []

#Function supplied with the assignment, not described below.
def get_polarity_type(synset_name):
    swn_synset =  swn.senti_synset(synset_name)
    if not swn_synset:
        return None
    elif swn_synset.pos_score() > swn_synset.neg_score() and swn_synset.pos_score() > swn_synset.obj_score():
        return 1
    elif swn_synset.neg_score() > swn_synset.pos_score() and swn_synset.neg_score() > swn_synset.obj_score():
        return -1
    else:
        return 0


for synset in wn.all_synsets():      
    
    # count synset polarity for each lemma
    pos_count = 0
    neg_count = 0
    neutral_count = 0
    
    for lemma in synset.lemma_names():
        for syns in wn.synsets(lemma):
            if get_polarity_type(syns.name())==1:
                pos_count+=1
            elif get_polarity_type(syns.name())==-1:
                neg_count+=1
            else:
                neutral_count+=1
    
    if pos_count > neg_count and pos_count >= neutral_count: #>=neutral as words that are more positive than negative, 
                                                                #despite being equally neutral might belong to positive list (explain)
        swn_positive.append(synset.lemma_names()[0])
    elif neg_count > pos_count and neg_count >= neutral_count:
        swn_negative.append(synset.lemma_names()[0])       

swn_positive = list(set(swn_positive))
swn_negative = list(set(swn_negative))
            
            
print('Positive words: ' + str(random.sample(swn_positive,5)))

print('Negative Words: ' + str(random.sample(swn_negative,5)))

Positive words: ['backstairs', 'sanctimoniousness', 'utilizer', 'gentlemanlike', 'wellbeing']
Negative Words: ['crabbiness', 'inexcusable', 'ulalgia', 'frightful', 'brown_rot']


In [18]:
import gensim
from nltk.data import find
import random

positive_seeds = ["good","nice","excellent","positive","fortunate","correct","superior","great"]
negative_seeds = ["bad","nasty","poor","negative","unfortunate","wrong","inferior","awful"]

word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample,binary=False)

wv_positive = []
wv_negative = []

for word in model.index_to_key:
    try:
        word=word.lower()
    
        pos_score = 0.0
        neg_score = 0.0
    
        for seed in positive_seeds:
            pos_score = pos_score + model.similarity(word,seed)
    
        for seed in negative_seeds:
            neg_score = neg_score + model.similarity(word,seed)
        
        avg = (pos_score - neg_score)/16 #Total number of seeds is 16
    
        if avg>0.03:
            wv_positive.append(word)
        elif avg<-0.03:
            wv_negative.append(word)
    except:
        pass
print('Positive words: ' + str(random.sample(wv_positive,5)))

print('Negative Words: ' + str(random.sample(wv_negative,5)))

Positive words: ['basked', 'mineralogy', 'enable', 'expand', 'glimpses']
Negative Words: ['dirty', 'paralyze', 'tobacco', 'dealt', 'sack']


In [19]:
from nltk.corpus import opinion_lexicon
import math

positive_words = opinion_lexicon.positive()
negative_words = opinion_lexicon.negative()

#Calculate the percentage of words in the manually annotated lexicon set, that also appear in an automatic lexicon.
def get_perc_manual(manual_pos,manual_neg,auto_pos,auto_neg):
    return len(set(manual_pos+manual_neg).intersection(set(auto_pos+auto_neg)))/len(manual_pos+manual_neg)*100

print("% of words in manual lexicons, also present in the automatic lexicon")
print("First automatic lexicon: "+ str(get_perc_manual(positive_words,negative_words,swn_positive,swn_negative)))
print("Second automatic lexicon: "+ str(get_perc_manual(positive_words,negative_words,wv_positive,wv_negative)))

#Calculate the accuracy of words in the automatic lexicon. Assuming that the manual lexicons are accurate, it calculates the percentage of words that occur in both positive and negative (respectively) lists of automatic and manual lexicons.
def get_lexicon_accuracy(manual_pos,manual_neg,auto_pos,auto_neg):
    common_words = set(manual_pos+manual_neg).intersection(set(auto_pos+auto_neg))-set(negative_seeds)-set(positive_seeds)
    return (len(set(manual_pos) & set(auto_pos) & common_words)+len(set(manual_neg) & set(auto_neg) & common_words))/len(common_words)*100

print("\nAccuracy of lexicons: ")
print("First automatic lexicon: "+ str(get_lexicon_accuracy(positive_words,negative_words,swn_positive,swn_negative)))
print("Second automatic lexicon: "+ str(get_lexicon_accuracy(positive_words,negative_words,wv_positive,wv_negative)))


% of words in manual lexicons, also present in the automatic lexicon
First automatic lexicon: 13.610251878038001
Second automatic lexicon: 37.796435410222415

Accuracy of lexicons: 
First automatic lexicon: 84.46389496717724
Second automatic lexicon: 98.94159153273226


In [20]:
#All lexicons are converted to sets for faster preprocessing.
manual_pos_set = set(positive_words)
manual_neg_set = set(negative_words)

syn_pos_set = set(swn_positive)
syn_neg_set = set(swn_negative)

wordvec_pos_set = set(wv_positive)
wordvec_neg_set = set(wv_negative)

#Function to calculate the polarity score of a sentence based on the frequency of positive or negative words. 
def get_polarity_score(sentence,pos_lexicon,neg_lexicon):
    pos_count = 0
    neg_count = 0
    for word in sentence:
        if word in pos_lexicon:
            pos_count+=1
        if word in neg_lexicon:
            neg_count+=1
    if pos_count>neg_count:
        return 1
    elif neg_count>pos_count:
        return -1
    else:
        return 0
    

#Function to calculate the score for each tweet, and compare it against the actual labels of the dataset and calculate/count the accuracy score. 
def data_polarity_accuracy(dataset,datalabels,pos_lexicon,neg_lexicon):
    accuracy_count = 0
    for index,tweet in enumerate(dataset):
        if datalabels[index]==get_polarity_score([word for sentence in tweet for word in sentence],pos_lexicon,neg_lexicon):
            accuracy_count+=1
    return (accuracy_count/len(dataset))*100
        
print("Manual lexicon accuracy: "+str(data_polarity_accuracy(dev_data[0],dev_data[1],manual_pos_set,manual_neg_set))      )
print("First auto lexicon accuracy: "+str(data_polarity_accuracy(dev_data[0],dev_data[1],syn_pos_set,syn_neg_set))      )
print("Second auto lexicon accuracy: "+str(data_polarity_accuracy(dev_data[0],dev_data[1],wordvec_pos_set,wordvec_neg_set)))


Manual lexicon accuracy: 45.2159650082012
First auto lexicon accuracy: 42.3728813559322
Second auto lexicon accuracy: 45.16129032258064


In [21]:
def convert_to_feature_dicts_v2(tweets,manual,first,second,remove_stop_words,n): 
    feature_dicts = []
    for tweet in tweets:
        # build feature dictionary for tweet
        feature_dict = {}
        if remove_stop_words:
            for segment in tweet:
                for token in segment:
                    if token not in stopwords and (n<=0 or total_train_bow[token]>=n):
                        feature_dict[token] = feature_dict.get(token,0) + 1
        else:
            for segment in tweet:
                for token in segment:
                    if n<=0 or total_train_bow[token]>=n:
                        feature_dict[token] = feature_dict.get(token,0) + 1
        if manual == True:
            feature_dict['manual_polarity'] = get_polarity_score([word for sentence in tweet for word in sentence],manual_pos_set,manual_neg_set)
        if first == True:
            feature_dict['synset_polarity'] = get_polarity_score([word for sentence in tweet for word in sentence],syn_pos_set,syn_neg_set)
        if second == True:
            feature_dict['wordvec_polarity'] = get_polarity_score([word for sentence in tweet for word in sentence],wordvec_pos_set,wordvec_neg_set)
    
        feature_dicts.append(feature_dict)      
    return feature_dicts

In [22]:
training_set_v2 = convert_to_feature_dicts_v2(train_tweets,True,False,True,True,2)

training_data_v2 = vectorizer.fit_transform(training_set_v2)


In [23]:
dev_set_v2 = convert_to_feature_dicts_v2(dev_data[0],True,False,True,False,0)

development_data_v2 = vectorizer.transform(dev_set_v2)

log_clf_v2 = LogisticRegression(C=0.012,solver='lbfgs',multi_class='multinomial')

log_clf_v2.fit(training_data_v2,train_data[1])

log_predictions_v2 = log_clf_v2.predict(development_data_v2)

print("Logistic Regression V2 (with polarity scores) Accuracy: " + str(accuracy_score(dev_data[1],log_predictions_v2)))


Logistic Regression V2 (with polarity scores) Accuracy: 0.5079278294149808


Though minimal, there was some improvement indeed in the classifier by integrating the polarity data.

This concludes our project of building a very basic 3-way polarity classifier for tweets.