-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment.py
133 lines (112 loc) · 4.05 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.classify import SklearnClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
import cPickle as pickle
import time
"""READING IN FROM FILES"""
def read_train(source):
# open data file
# discard header line
fdata = open(source, 'r')
fdata.readline()
phrases = []
phrase_to_id = {}
for data in fdata:
data = data.split('\t')
phrases.append(([token.lower().rstrip() for token in data[2].split(' ')],int(data[3].rstrip())))
phrase_to_id[data[2].lower().rstrip()] = data[0]
return phrases, id_to_phrase
def read_data(source='', which='train'):
    """Read a data split and return (phrases, phrase_to_id).

    source -- path to the TSV file for the split.
    which  -- 'train' is the only implemented split; 'dev' and 'test'
              raise NotImplementedError (TODO).
    """
    # BUG FIX: the original mapped 'dev'/'test' to plain strings and then
    # called them (TypeError), and had an unreachable `fdata.close()` after
    # the return that referenced an undefined name.
    if which != 'train':
        raise NotImplementedError('reader for %r is not implemented yet' % which)
    return read_train(source)
"""WORD AND FEATURE EXTRACTION"""
def get_phrase_list(sent_tups, with_label=False):
phrases = []
for s_tup in sent_tups:
if not with_label:
phrases.append(' '.join(s_tup[0]))
else:
phrases.append((' '.join(s_tup[0]), s_tup[1]))
return phrases
def get_all_words(sent_tups):
    """Flatten the token lists of every (tokens, label) tuple into one list,
    preserving order and duplicates."""
    return [word for tokens, _ in sent_tups for word in tokens]
def get_features(words):
    # Build the feature vocabulary: the distinct words seen in training.
    # nltk.FreqDist counts token frequencies; only its keys are used here,
    # the counts themselves are discarded.
    # NOTE(review): key ordering depends on the nltk version (older versions
    # sort by descending frequency) -- feature indices derived from this
    # ordering are only stable within one run; confirm if persistence across
    # runs is needed.
    words = nltk.FreqDist(words)
    features = words.keys()
    return features
# Module-level state shared between main() (which populates these) and
# extract_features() (which reads them).
global_features = {}
# BUG FIX: the original line was the bare expression `adjectives`, which
# raised NameError at import time; main() appends to this list.
adjectives = []
def extract_features(doc):
    """Build a binary feature dict for *doc* (an iterable of tokens).

    Feature i (0 <= i < len(global_features)) is 1 iff the i-th vocabulary
    word occurs in doc; the following len(adjectives) features mark which
    adjectives occur in doc.  Reads module globals `global_features` and
    `adjectives`, which main() must populate first.
    """
    words = set(doc)
    features = {}
    for i, word in enumerate(global_features):
        features[i] = int(word in words)
        #features['contains(%s)' % word] = (word in words)
    # BUG FIX: the original iterated `enumerate(adjective)` (undefined name)
    # and tested `adj in words` where `adj` was also undefined -- both
    # NameErrors.  The intended test is whether each adjective occurs in doc.
    offset = len(global_features)
    for j, adjective in enumerate(adjectives):
        features[offset + j] = int(adjective in words)
    return features
def main(which='NB'):
print 'reading training data'
training_data, phrase_to_id = read_data(source='dat/train.tsv')
print 'getting features'
global global_features
global adjectives
global_features = get_features(get_all_words(training_data))
with open('adj', 'r') as adj_file:
for adj in adj_file:
adjectives.append(adj.lower().rstrip())
print 'entering switch'
if which == 'NB':
training_set = nltk.classify.util.apply_features(extract_features, get_phrase_list(training_data, True))
print 'moving to classifier creation'
start = time.clock()
classifier = nltk.NaiveBayesClassifier.train(training_set)
print 'classfier total time: ', str(time.clock() - start)
#classifier = SklearnClassifier(BernoulliNB()).train(training_set)
pickle.dump(classifier, open('classifier.pickle', 'w'))
text = raw_input('Next test (q to quit):')
while text != 'q':
print classifier.classify(extract_features(text.split()))
text = raw_input('Next test (q to quit):')
elif which == 'SGD':
print 'extracting features'
training_set = nltk.classify.util.apply_features(extract_features, get_phrase_list(training_data))
training_list = []
for d in training_set:
sample = []
for k, v in d.iteritems():
sample.append(v)
training_list.append(sample)
label_set = [int(tup[1]) for tup in training_data]
print 'moving to classifier creation'
clf = SGDClassifier(loss="hinge", penalty="l2")
print 'moving to training'
clf.fit(training_list, label_set)
pickle.dump(clf, open('sgd_sent.pickle', 'w'))
print 'moving to prediction'
pred =[]
pred.append('i hate everyhing')
pred.append('i love everything')
pred_set = nltk.classify.util.apply_features(extract_features, pred)
pred_list = []
print pred_set
for d in pred_set:
inst_list = []
for k, v in d.iteritems():
inst_list.append(v)
pred_list.append(inst_list)
print pred_list
print clf.predict(pred_list)
main(which='SGD')