Import the required packages / modules.

In [22]:
import nltk
import re
import pickle
nltk.download("stopwords")
nltk.download("twitter_samples")
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peiyuns/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/peiyuns/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


In [3]:
# List the file ids available in the twitter_samples corpus (shown as the cell output).
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [4]:
# Load the tokenised positive and negative tweet corpora (one token list per tweet)
positive_tweets, negative_tweets = (
    twitter_samples.tokenized(corpus_file)
    for corpus_file in ("positive_tweets.json", "negative_tweets.json")
)

In [5]:
# English stop words as a set; process_tweets tests tokens for membership in it.
stop_words = set(stopwords.words('english'))
vec = DictVectorizer()  # Shared vectorizer: fitted on the training dicts, reused for dev/test

In [65]:
def _normalize_hashtag(word):
    """Collapse the leading run of '#' in a hashtag token to a single '#'.

    Returns None when nothing is left after the leading '#' characters
    (e.g. "#" or "##") or when another '#' appears later in the token.
    """
    tag = word.lstrip("#")
    if not tag or "#" in tag:
        return None
    return "#" + tag


def process_tweets(tweets, random_state=None):
    """Clean tokenised tweets and split them into train / dev / test sets.

    Cleaning rules, applied per token:
      * hashtags are kept, normalised to exactly one leading '#';
        hashtags that are empty or contain an inner '#' are dropped;
      * any other token is kept only if it is not in ``stop_words`` and
        consists solely of ASCII letters.

    Parameters
    ----------
    tweets : iterable of list of str
        One token list per tweet.
    random_state : int or None, optional
        Forwarded to both ``train_test_split`` calls. The default (None)
        keeps the original unseeded behaviour; pass an int to make the
        splits reproducible across runs.

    Returns
    -------
    tuple of (training, development, testing)
        Lists of cleaned token lists in an 80% / 10% / 10% split.
    """
    new_tweets = []
    for tweet in tweets:
        new_tweet = []
        for word in tweet:
            if not word:
                # Guard: an empty token would fail on word[0] below.
                continue

            if word[0] == "#":
                # lstrip-based normalisation also handles tokens made only of
                # '#' (e.g. "##"), which previously raised IndexError.
                tag = _normalize_hashtag(word)
                if tag is not None:
                    new_tweet.append(tag)
                # A hashtag can never pass the alphabetic-only test below.
                continue

            # Keep plain words: not a stop word and purely alphabetic.
            if word not in stop_words and not re.search("[^a-zA-Z]", word):
                new_tweet.append(word)
        new_tweets.append(new_tweet)

    # split into training (80%), development (10%), and testing (10%)
    training, develop_test = train_test_split(
        new_tweets, test_size=0.2, random_state=random_state
    )
    development, testing = train_test_split(
        develop_test, test_size=0.5, random_state=random_state
    )

    return (training, development, testing)

In [66]:
# Clean each corpus and split it into training / development / test sets.
# process_tweets drops stop words and tokens containing non-alphabetic characters;
# hashtag tokens are kept (normalised to a single leading '#').
(positive_training, positive_develop, positive_test)  = process_tweets(positive_tweets)
(negative_training, negative_develop, negative_test)  = process_tweets(negative_tweets)

In [67]:
# Turn each tokenised tweet into a {token: occurrence count} dictionary
def getDict(tweets):
    """Return one token-frequency dict per tweet, in the same order as ``tweets``."""
    dict_arr = []
    for tokens in tweets:
        counts = {}
        for token in tokens:
            # .get supplies 0 the first time a token is seen
            counts[token] = counts.get(token, 0) + 1
        dict_arr.append(counts)
    return dict_arr

In [68]:
# Feature matrices and label lists for the three splits.
# Positive tweets come first in every split, so labels are 1s followed by 0s.
# The vectorizer is fitted on the training split only and reused for dev/test.
y_train = [1] * len(positive_training) + [0] * len(negative_training)
X_train = vec.fit_transform(getDict(positive_training + negative_training))

y_dev = [1] * len(positive_develop) + [0] * len(negative_develop)
X_dev = vec.transform(getDict(positive_develop + negative_develop))

y_test = [1] * len(positive_test) + [0] * len(negative_test)
X_test = vec.transform(getDict(positive_test + negative_test))

In [69]:
# Hyperparameter sets
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
penaltys = ['l1', 'l2']
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
kernels = ["linear", "poly", "rbf", "sigmoid"]  # only used by the disabled SVC search below

# Init best hyperparameters (selected by accuracy on the development set)
best_bayes_params = {}
best_bayes_score = 0
best_logistic_params = {}
best_logistic_score = 0

print("Naive Bayes hyperparameters:")
# Tune naive bayes hyperparameters
for alpha in alphas:
    bayes = MultinomialNB(alpha = alpha) # Create classifier
    bayes.fit(X_train, y_train) # Train model
    score = bayes.score(X_dev, y_dev) # Calculate score
    print("alpha = %7.3f, Score = %.4f" % (alpha, score)) # Print result

    # Strict '>' keeps the first-tried value when several alphas tie
    if score > best_bayes_score:
        best_bayes_params = {'alpha':alpha}
        best_bayes_score = score

print("\nLogistic Regression hyperparameters")
# Tune logistic regression hyperparameters
for C in Cs:
    for penalty in penaltys:
        # solver='liblinear' is set explicitly because it supports both 'l1' and
        # 'l2'; the current default solver ('lbfgs') rejects penalty='l1'.
        logistic = LogisticRegression(C = C, penalty = penalty, solver = 'liblinear') # Create classifier
        logistic.fit(X_train, y_train) # Train model
        score = logistic.score(X_dev, y_dev) # Calculate score
        print("penalty = %s, C = %8.3f, Score = %.4f" % (penalty, C, score))

        # Check if better
        if score > best_logistic_score:
            best_logistic_params = {'C':C, 'penalty':penalty}
            best_logistic_score = score

# Disabled SVC search. Before re-enabling, initialise
# best_svc_params = {} and best_svc_score = 0 next to the other "best" variables.
# print("\nSVC hyperparameters")
# # Tune SVC hyperparameters
# for C in Cs:
#     for kernel in kernels:
#         svc = SVC(C = C, kernel = kernel) # Create classifier
#         svc.fit(X_train, y_train) # Train model
#         score = svc.score(X_dev, y_dev) # Calculate score
#         print("kernel = %s, C = %8.3f, Score = %.4f" % (kernel, C, score))
#
#         # Check if better
#         if score > best_svc_score:
#             best_svc_params = {'C':C, 'kernel':kernel}
#             best_svc_score = score

print("\nNaive Bayes: best parameter: %s, score = %f" % (str(best_bayes_params), best_bayes_score))
print("Logistic Regression: best parameter: %s, score = %f" % (str(best_logistic_params), best_logistic_score))
# print("SVC: best parameter: %s, score = %f" % (str(best_svc_params), best_svc_score))

Naive Bayes hyperparameters:
alpha =   0.001, Score = 0.7240
alpha =   0.010, Score = 0.7260
alpha =   0.100, Score = 0.7330
alpha =   1.000, Score = 0.7370
alpha =  10.000, Score = 0.7360
alpha = 100.000, Score = 0.7180

Logistic Regression hyperparameters
penalty = l1, C =    0.001, Score = 0.5000
penalty = l2, C =    0.001, Score = 0.6010
penalty = l1, C =    0.010, Score = 0.5500
penalty = l2, C =    0.010, Score = 0.6890




penalty = l1, C =    0.100, Score = 0.6670
penalty = l2, C =    0.100, Score = 0.7330
penalty = l1, C =    1.000, Score = 0.7510
penalty = l2, C =    1.000, Score = 0.7460
penalty = l1, C =   10.000, Score = 0.7480
penalty = l2, C =   10.000, Score = 0.7500
penalty = l1, C =  100.000, Score = 0.7310
penalty = l2, C =  100.000, Score = 0.7300
penalty = l1, C = 1000.000, Score = 0.7110
penalty = l2, C = 1000.000, Score = 0.7210

Naive Bayes: best parameter: {'alpha': 1}, score = 0.737000
Logistic Regression: best parameter: {'C': 1, 'penalty': 'l1'}, score = 0.751000


In [70]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Classifiers, rebuilt with the best hyperparameters found on the development set
bayes_clf = MultinomialNB(alpha = best_bayes_params['alpha'])
# solver='liblinear' supports both 'l1' and 'l2'; the current default solver
# ('lbfgs') raises on penalty='l1'.
logistic_clf = LogisticRegression(C = best_logistic_params['C'], penalty = best_logistic_params['penalty'], solver = 'liblinear')
# svc_clf = SVC(C = best_svc_params["C"], kernel = best_svc_params["kernel"])


# Train classifiers
bayes_clf.fit(X_train, y_train)
logistic_clf.fit(X_train, y_train)
# svc_clf.fit(X_train, y_train)

# Report macro F1 and accuracy on the held-out test split.
for clf_name, clf in (("Naive Bayes", bayes_clf), ("Logistic Regression", logistic_clf)):
    y_pred = clf.predict(X_test)  # predict once and reuse for both metrics
    # scikit-learn metrics take (y_true, y_pred) in that order
    print("%s: f-score = %.4f, accuracy = %.4f" % (clf_name, f1_score(y_test, y_pred, average = 'macro'), accuracy_score(y_test, y_pred)))
# print ("SVC: f-score = %.4f, accuracy = %.4f" % (f1_score(y_test, svc_clf.predict(X_test), average = 'macro'), accuracy_score(y_test, svc_clf.predict(X_test))))


Naive Bayes: f-score = 0.7615, accuracy = 0.7620
Logistic Regression: f-score = 0.7230, accuracy = 0.7250




In [71]:
from sklearn.tree import DecisionTreeClassifier

In [75]:
criterions = ["gini", "entropy"]

# Fit one decision tree per split criterion and score it on the test split.
# Result lines are printed in the same order as `criterions`.
for criterion in criterions:
    dt_clf = DecisionTreeClassifier(criterion = criterion)
    dt_clf.fit(X_train, y_train)
    dt_pred = dt_clf.predict(X_test)  # predict once and reuse for both metrics
    # scikit-learn metrics take (y_true, y_pred) in that order
    print("Decision Tree: f-score = %.4f, accuracy = %.4f" % (f1_score(y_test, dt_pred, average = 'macro'), accuracy_score(y_test, dt_pred)))

Decision Tree: f-score = 0.6817, accuracy = 0.6830
Decision Tree: f-score = 0.6975, accuracy = 0.6990


In [43]:
# save the classifier and vectorizer
# NOTE: the file names are fixed literals; they are not derived from
# best_logistic_params. `with` closes (and flushes) each file once dumped.
with open("Logistic_{C0.1}_{l1}.pk1", "wb") as model_file:
    pickle.dump(logistic_clf, model_file)
with open("DictVectorizer.pk1", "wb") as vec_file:
    pickle.dump(vec, vec_file)

In [44]:
# load the classifier and vectorizer
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were written by this notebook or another trusted source.
with open("Logistic_{C0.1}_{l1}.pk1", "rb") as model_file:
    logistic_clf = pickle.load(model_file)
with open("DictVectorizer.pk1", "rb") as vec_file:
    vec = pickle.load(vec_file)

In [41]:
# Display the classifier's repr to inspect its hyperparameters.
logistic_clf

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [42]:
# Display the vectorizer's repr to inspect its settings.
vec

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)