In [87]:
from nltk.sentiment.util import *

from nltk.corpus import opinion_lexicon

from nltk.corpus import TwitterCorpusReader
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import random
import pickle

In [64]:
#tweets: list of strings ; lemmatizer: nltk Lemmatizer ; stemmer : nltk Stemmer
#remove stopword and tokenize strings. lemmatize/stemming if lemmatizer/stemmer not None
def preprocessString(tweets, lemmatizer, stemmer):
    
    #stopword
    tweets = [w for w in tweets if w.lower() not in stopwords.words('english')]
    #tokenize
    tokenizer = TweetTokenizer()
    tweets = [tokenizer.tokenize(tweet) for tweet in tweets]
    #lower
    for i in range(len(tweets)):
        tweets[i] = [t.lower() for t in tweets[i]]
    
    if lemmatizer != None:
        for i in range(len(tweets)):
            #lemmatization 
            tweets[i] = [lemmatizer.lemmatize(t) for t in tweets[i]]
    if stemmer != None:
        for i in range(len(tweets)):
            #stemming 
            tweets[i] = [stemmer.stem(t) for t in tweets[i]]
    
    #Collocations, Bigrams, Trigrams
    #Chunking
            
    return tweets

#tweets: list of objects, tag: tag in string 
def add_tags(tweets, tag):
    return [[tweet, tag] for tweet in tweets]
        
    

In [65]:
#tweet data source (please follow https://www.nltk.org/data.html to download data)
from nltk.corpus import twitter_samples

#load
neg_tweet = twitter_samples.strings(fileids = 'negative_tweets.json')
pos_tweet = twitter_samples.strings(fileids = 'positive_tweets.json')

In [66]:
#lemmatization
wnl = nltk.WordNetLemmatizer()
#stemming
pstemmer = nltk.PorterStemmer()

#process tweets
neg_tweetPro = preprocessString(neg_tweet, wnl, pstemmer)
pos_tweetPro = preprocessString(pos_tweet, wnl, pstemmer)

In [67]:
#add tag
neg_tweetTag = add_tags(neg_tweetPro, 0)
pos_tweetTag = add_tags(pos_tweetPro, 1)
docs = neg_tweetTag + pos_tweetTag
random.shuffle(docs)

In [68]:
#train/test split
numTrainTweets = int(len(docs)*0.9)
train_docs = docs[:numTrainTweets]
test_docs = docs[numTrainTweets:]
train_data = [data[0] for data in train_docs]
train_label = [data[1] for data in train_docs]
test_data = [data[0] for data in test_docs]
test_label = [data[1] for data in test_docs]

In [94]:
#vectorizer
def identity(x):
    return x
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
trainX = vectorizer.fit_transform(train_data)

#feature selection?

In [107]:
#save vectorizer?
with open("tfidf_veczr.pkl","wb") as file:
    pickle.dump(vectorizer,file)

In [104]:
#classifier NB
from sklearn.naive_bayes import MultinomialNB
classifierNB = MultinomialNB()
classifierNB.fit(trainX,train_label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [105]:
#test
testX = vectorizer.transform(test_data)
classifierNB.score(testX,test_label)

0.94699999999999995

In [106]:
#save model?
pickle.dump(classifierNB,open("NB_clf.pkl","wb"))


In [108]:
#classifier DecisionTree
from sklearn import tree
classifierTree = tree.DecisionTreeClassifier()
classifierTree.fit(trainX,train_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [109]:
#test
testX = vectorizer.transform(test_data)
classifierTree.score(testX,test_label)

0.998

In [110]:
#save model?
pickle.dump(classifierTree,open("DT_clf.pkl","wb"))


In [121]:
#visualize?
import graphviz
dot_data = tree.export_graphviz(classifierTree, out_file=None)
graph = graphviz.Source(dot_data) 
graph

ImportError: No module named 'graphviz'

In [112]:
#classifier log reg
from sklearn.linear_model import LogisticRegression
classifierLog = LogisticRegression()
classifierLog.fit(trainX,train_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [113]:
#test
testX = vectorizer.transform(test_data)
classifierLog.score(testX,test_label)

1.0

In [114]:
#save model?
pickle.dump(classifierLog,open("Log_clf.pkl","wb"))


In [118]:
#classifier svm
from sklearn import svm
classifierSVM = svm.LinearSVC()
classifierSVM.fit(trainX,train_label)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [119]:
#test
testX = vectorizer.transform(test_data)
classifierSVM.score(testX,test_label)

1.0

In [120]:
#save model?
pickle.dump(classifierSVM,open("SVM_clf.pkl","wb"))
