In [1]:
import re
import csv
from sys import argv

#script, file_ = argv

def processRow(row):
	
	
	tweet = row[2]
	#Lower case
	tweet.lower()
	#convert any url to URL
	tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
	#Convert any @Username to "AT_USER"
	tweet = re.sub('@[^\s]+','AT_USER',tweet)
	#Remove additional white spaces
	tweet = re.sub('[\s]+', ' ', tweet)
	tweet = re.sub('[\n]+', ' ', tweet)
	#Remove not alphanumeric symbols white spaces
	tweet = re.sub(r'[^\w]', ' ', tweet)
	#Replace #word with word
	tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
	#Remove :( or :)
	tweet = tweet.replace(':)','')
	tweet = tweet.replace(':(','')
	#trim
	tweet = tweet.strip('\'"')

	row[2] = tweet

	return row

# end



In [2]:
#start replaceTwoOrMore
def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

def getStopWordList(stopWordFile):
	stopwords = []
	stopwords.append("AT_USER")
	stopwords.append("URL")

	with open(stopWordFile, 'r') as f:
		reader = csv.reader(f)
		for w in reader:

			stopwords.append(w[0])

	return stopwords
#end


def getFeatureVector(tweet, stopWordFile):
	features = []

	stop_words = getStopWordList(stopWordFile)

	words = tweet.split()
	for w in words:

		w = replaceTwoOrMore(w)

		#strip digits
		w = w.strip('0123456789')

		#strip punctuation
		w = w.strip('\'"!?,.')

		if (w == ""):
			continue
		elif(w in stop_words):
			#print w
			continue
		else:
			features.append(w.lower())

	return features
#end

In [3]:
import nltk
from nltk.metrics import *


import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
import nltk.metrics
import collections



stop_words_file_name ='stopwords.txt'

word_features = []

def read_tweets(tweet_file):

	features = []
	tweets = []

	with open(tweet_file,'r') as csv_file:
		csv_reader = csv.reader(csv_file)

		for l in csv_reader:
				
				if l[0] == "TweetID":
					continue

				new_row = processRow(l)
				
				if len(new_row) < 4:
					print ("Malformed Data")
					continue
				
				features = getFeatureVector(new_row[2], stop_words_file_name)

				tweets.append(
			 		( [f.strip("\'") for f in features],
			 		 new_row[3]
			 		))

	return tweets
#end

def extract_features(document):
	document_words = set(document)
	features = {}
	for w in word_features:
		features['contains %s'% w] = (w in document_words)
	return features

if __name__ == '__main__':
	
	neg_tweets = read_tweets('dataset/iphone6-negative.csv')
	pos_tweets = read_tweets('dataset/iphone6-positive.csv')

	cutoff = 0

	print(len(neg_tweets))
	print(len(pos_tweets))
	if(len(neg_tweets) > len(pos_tweets)):
		cutoff = len(pos_tweets)*4/5
	else:
		cutoff = len(neg_tweets)*4/5
 
	tweets = neg_tweets[:int(cutoff)] + pos_tweets[:int(cutoff)]
	test_tweets = neg_tweets[int(cutoff):] + pos_tweets[int(cutoff):]
	
	print ('train on %d instances, test on %d instances' % (len(tweets), len(test_tweets)))
 
	all_words = []
	words_frequency = []

	#Get all the words
	for (words, sentiment) in tweets:
		all_words.extend(words)

	#extract the features
	wordlist = nltk.FreqDist(all_words)
	word_features = wordlist.keys()

	training_set = nltk.classify.apply_features(extract_features, tweets)

    
	#train the classifier
	classifier = NaiveBayesClassifier.train(training_set)

	refsets  = { 'positive': set([]), 'negative':set([])}
	testsets = { 'positive': set([]), 'negative':set([])}

   
	classifier.show_most_informative_features()
    
	for i, (feats, label) in enumerate(test_tweets):   
		#print(feats,label,classifier.classify(extract_features(feats)))
		refsets[label].add(i)
		testsets[classifier.classify(extract_features(feats))].add(i)

    
	from nltk.metrics import precision,recall,f_measure
    
	print ('pos precision:',precision(refsets['positive'], testsets['positive']))
	print ('pos recall:', recall(refsets['positive'], testsets['positive']))
	print ('pos F-measure:',f_measure(refsets['positive'], testsets['positive']))
	print ('neg precision:',precision(refsets['negative'], testsets['negative']))
	print ('neg recall:', recall(refsets['negative'], testsets['negative']))
	print ('neg F-measure:',f_measure(refsets['negative'], testsets['negative']))


#end

501
501
train on 800 instances, test on 202 instances
Most Informative Features
           contains free = True           positi : negati =    200.3 : 1.0
          contains check = True           positi : negati =    200.3 : 1.0
             contains bi = True           positi : negati =    199.7 : 1.0
            contains thx = True           positi : negati =    199.7 : 1.0
             contains do = True           positi : negati =    125.7 : 1.0
          contains hurry = True           positi : negati =     81.7 : 1.0
             contains hi = True           positi : negati =     39.9 : 1.0
          contains wanna = True           positi : negati =     20.8 : 1.0
            contains ios = True           negati : positi =     12.2 : 1.0
            contains don = True           negati : positi =     11.7 : 1.0
pos precision: 1.0
pos recall: 0.693069306930693
pos F-measure: 0.8187134502923976
neg precision: 0.7651515151515151
neg recall: 1.0
neg F-measure: 0.8669527896995707
