In [1]:
import os
import sys
import nltk
import pickle
from bootstrapping import Bootstrapping
from pos import SequentialTagger
from hp_classifiers import HpObj, HpSubj
from polarity import PolarityClassifier  
from replacer import RepeatReplacer
from terminal_colors import Tcolors

# Module-wide verbosity switch: it is passed as the ``debug`` flag to the
# classifiers built in Sentiment.__init__ and guards the diagnostic prints
# inside Sentiment.analyze.
DEBUG = False

class Sentiment:
    """
        Sentiment: Analyses the global sentiment of given text regions  
        that are decomposed to sentences, using bootstrapping methods for 
        subjectivity and polarity classification. All sub modules except 
        from POS tagging are learning by experience.
    """
    
    def __init__(self):
        self.pos_tagger = SequentialTagger()
        self.hp_obj = HpObj(debug=DEBUG)
        self.hp_subj = HpSubj(debug=DEBUG)
        self.lexicon = self.hp_obj.lexicon
        self.bootstrapping = Bootstrapping(self.hp_obj, self.hp_subj, self.pos_tagger, debug=DEBUG) 
        self.sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self.total_sentences = ["good","bad"]
        self.total_sentiments = ["positive","negative"]
        
    def analyze(self, clean_text_areas):
        """
            Analysis of text regions using the following order: Each sentence per
            region is passed from the subjectivity classification using bootstrapping
            method and then if it turns out to be subjective it is passed 
            from the polarity classification using bootstrapping method also.
            Finally, it results to a decision for the sentiment of the sentence
            and the overall sentiment of the regions. 
        """ 
        if len(clean_text_areas) > 0:  
            for clean_text in clean_text_areas:
                # Sentence detection
                clean_text = self.normalize(clean_text)
                try:
                    sentences = self.sentence_tokenizer.tokenize(clean_text)
                except:
                    return {}
                sentiments = [] 
                scores = []
                nscores = []
                results = {'positive':{'count' : 0, 'score' : 0, 'nscore' : 0},
                           'neutral':{'count' : 0, 'score' : 0, 'nscore' : 0},
                           'negative':{'count' : 0, 'score' : 0, 'nscore' : 0}}
                
                print
                print Tcolors.ACT + " Checking block of text:"
                for i, sentence in enumerate(sentences):
                    print "[" + str(i+1) + "] " + sentence
                for i, sentence in enumerate(sentences):
                    # Proceed to subjectivity classification (bootstrapping procedure).
                    # (This step could be skipped in case you deal with subjective sentences only.)
                    sentiment = ""
                    previous = ""
                    next = ""
                    score = 0
                    nscore = 0
                    if i == 0 and i + 1 < len(sentences): 
                        next = sentences[i+1] 
                    elif i != 0 and i < len(sentences):
                        if i + 1 != len(sentences):
                            next = sentences[i+1]
                        previous = sentences[i-1] 
                     
                    if DEBUG: print Tcolors.ACT + " Analyzing subjectivity..." 
                    result = self.bootstrapping.classify(sentence, previous, next) 
                    if result is None:
                        res = 'Not found!'
                    else:
                        res = result
                    if DEBUG:
                        print Tcolors.RES + Tcolors.OKGREEN + " " + res + Tcolors.ENDC
                        print
                    
                    # If sentence is subjective 
                    if result == 'subjective' or result is None:
                        # Proceed to polarity classification
                        if DEBUG: print Tcolors.ACT + " Analyzing sentiment..."
                        polarity_classifier = PolarityClassifier(self.pos_tagger, self.lexicon, debug=DEBUG)
                        sentiment, score, nscore = polarity_classifier.classify(sentence)
                        if DEBUG: print Tcolors.RES + Tcolors.OKGREEN + " " + sentiment + Tcolors.ENDC
                    # If sentence is objective
                    elif result == 'objective':
                        sentiment = 'neutral'  
                    
                    # Collect high-confidence training instances for SVM classifier.
                    # After the training, SVM can be used to classify new sentences.
                    #if sentiment != "neutral" and sentiment != "": 
                        #if sentiment != "neutral" and abs(nscore) >= 0.4:
                        #   self.total_sentences.append(sentence)
                        #   self.total_sentiments.append(sentiment)
                        
                    # Store results to memory
                    sentiments.append(sentiment)
                    scores.append(score)
                    nscores.append(nscore)
                    
                    # Update score
                    if results.has_key(sentiment):
                        results[sentiment]['nscore'] += nscore
                        results[sentiment]['score'] += score
                        results[sentiment]['count'] += 1 
                          
                print       
                print Tcolors.ACT + " Overall sentiment analysis:"
                print Tcolors.BGH
                print " Parts: ", len(sentences)
                print " Sentiments: ", sentiments
                print " Scores: ", scores 
                print " Results: ", "},\n\t    ".join((str)(results).split("}, "))
                print Tcolors.C

                pcount = results['positive']['count']
                ncount = results['negative']['count'] 
                total = len(sentences)
                print Tcolors.BG
                print " subjective".ljust(16,"-") + "> %.2f" % ((float)(pcount + ncount)*100 / total) + "%"
                print " objective".ljust(16,"-") + "> %.2f" % (100 - ((float)(pcount + ncount)*100 / total)) + "%"
                print Tcolors.C
                print Tcolors.BGGRAY
                for sense in results.keys():
                    count = results[sense]['count']
                    percentage = (float)(count) * 100 / (len(sentences))
                    print " " +sense.ljust(15,"-")+"> %.2f" % (percentage) + "%"
                  
                print Tcolors.C 
                ssum = sum(scores)
                confidence = " (%.2f, %.2f)" % (ssum,sum(nscores))
                final_sent = ""
                pos = True
                if results["negative"]["count"] > len(sentences)*1.0/3:
                    pos = False

                # Print total sentiment score and normalized sentiment score
                if ssum > 0 and pos:
                    print Tcolors.RES + Tcolors.OKGREEN + " positive" + confidence + Tcolors.C
                    final_sent = "positive"
                elif ssum == 0:
                    print Tcolors.RES + Tcolors.OKGREEN +  " neutral" + confidence + Tcolors.C
                    final_sent = "neutral"
                else:
                    print Tcolors.RES + Tcolors.OKGREEN +  " negative" + confidence + Tcolors.C
                    final_sent = "negative"
                print Tcolors.C
                
                # Store results
                total_result_hash = {'sentences' : sentences,
                                     'sentiments': sentiments,
                                     'scores'    : scores,
                                     'nscores'   : nscores,
                                     'results'   : results,
                                      'final' : {final_sent:{'score':ssum,'nscore':sum(nscores)}}} 
        # Train SVM classifier
        # self.train_svm()
        return total_result_hash
    
    def normalize(self, text):
        """
            Make some word improvements before feeding to the sentence tokenizer.
        """  
        rr = RepeatReplacer(self.lexicon)
        normalized_text = []
        final = None
        try:
            for word in text.split():
                normal = rr.replace(word.lower()) 
                if word[0].isupper(): 
                    normal = normal[0].upper() + normal[1:]
                
                normalized_text.append(normal)
                final = " ".join(normalized_text)
        except:
                final = text
    
        return final
                
    def train_svm(self):
        """
            Train SVM and store data with pickle.
        """
        self.svm.train(self.total_sentences, self.total_sentiments)
        t_output = open(self.svm_train_filename,'wb')
        l_output = open(self.svm_label_filename,'wb')
        pickle.dump(self.total_sentences,t_output)
        pickle.dump(self.total_sentiments,l_output)
        t_output.close()
        l_output.close()


if __name__ == '__main__':
    # Analyse the text given as first command line argument, or a built-in
    # sample comment when no argument is supplied.
    sentiment = Sentiment()
    if len(sys.argv) <= 1:
        sentiment.analyze([u"I was blown away by some of the comments here posted by people who is either uneducated, ignorant, self-righteous or all-of-the-above...I'm irritated and saddened as I read these \"finger-pointing\" or \"I'm right and you're wrong\" type of posts! Grow up folks! You're not in grade school...learn to embrace what is positive and move forward to do what is right... I have to give much love and respect to Ronny...your work is AMAZING!!! You cannot fathom how good I feel after I watched this video...regardless of history, politics, or whatever forces that makes what the mid-east today...for what you did and many of the followers in Iran and Palestine ...I BELIEVE TOMORROW WILL BE BETTER!!!!!! My name is Christopher Lee, I'm a nurse in Los Angeles and I {HEART} YOU ALL (especially to all of you beautiful and sweet ladies across the way)!!!!!"])
    else:
        region = sys.argv[1]
        sentiment.analyze([region])

ImportError: No module named bootstrapping

In [5]:
import re
import sys  
import pickle 
from terminal_colors import Tcolors
from pb_classifiers import PbSubj

class Bootstrapping:
    """
        Bootstrapping: Class performing the bootstrapping process for 
        subjectivity and objectivity classification of  sentences. The 
        method learns linguistically rich extraction patterns for subjective 
        (opinionated) expressions from unannotated data. The learned
        patterns are used to identify more subjective sentences that simple 
        high precision classifiers can't recall.
        Related paper:
        E. Riloff and J. Wiebe. Learning extraction patterns for subjective 
        expressions. In Proceedings of the 2003 conference on Empirical methods 
        in natural language processing, EMNLP '03, pages 105--112, 2003. ACL.
        
        Learned patterns structure
        e.g. {"<subj> was killed" : {'type' : 'subj',
                                     'display' : 'was killed',
                                     'subj_freq' : 10,
                                     'freq' : 20,
                                     'prob' : 0.5}}
         
    """
    
    def __init__(self, hp_obj, hp_subj, tagger, debug=False):
        # Syntactic forms for pattern extraction
        self.syntactic_forms = {"subj" : [["BE","VBN*|VBD*"],
                                          ["HAVE","BE","VB*"],
                                          ["VB*"],
                                          ["VB*","*","NN*|NP*|NC*"], 
                                          ["VB*","TO","VB*"],
                                          ["HAVE","TO","BE"],
                                          ["HAVE","NN*"]],
                                "dobj" : [["VB*"],
                                          ["TO","VB*"],
                                          ["VB*","TO","VB*"]],  
                                "np"   : [["NN","IN"],
                                          ["VB*","NN","IN"],
                                          ["BE","VBN","IN"],
                                          ["TO","VB","TO"]]
                                }
        self.filename = "stored/learned_patterns"
        try:
            self.learned_patterns = pickle.load(open(self.filename))
            print Tcolors.ADD + Tcolors.OKBLUE + " Loaded existing pattern knowledge!" + Tcolors.ENDC 
        except:
            print Tcolors.ACT + Tcolors.RED + " Existing pattern knowledge not found." + Tcolors.ENDC
            self.learned_patterns = {}
             
        # Part Of Speech Sequential Tagger (Unigram->Bigram->Trigram) 
        self.tagger = tagger
        # Sentence to be classified
        self.subjective = False
        self.objective = False
        # High precision objective classifier
        self.hp_obj = hp_obj
        # High precision subjective classifier
        self.hp_subj = hp_subj
        # Pattern-Based Subjective Classifier
        self.pb_subj = PbSubj(self.tagger, debug=debug)
        # Learned patterns
        self.patterns = {}
        self.debug = debug
            
    def classify(self, sentence, previous="", next=""):
        """
            Subjectivity classification using boostrapping method.
        """
        # STEP 1: Classify sentence with HP Subjective classifier
        self.subjective = self.hp_subj.classify(sentence) 
        # STEP 1: Get help from learned patterns
        if not self.subjective:
            if self.debug: print Tcolors.ACT + " Training pattern based classifier...\n"
            self.pb_subj.train(self.learned_patterns)
            found, self.subjective, obj = self.pb_subj.classify(sentence)
        
        if not self.subjective and not self.objective:
            # STEP 2: Classify sentence with HP Objective classifier
            self.objective = self.hp_obj.classify(sentence, previous, next)
        
        if self.subjective or self.objective:
            # STEP 3: Learn 
            self.learn_patterns_from(sentence) 
        else:
            # STEP 4: Classify based on learned patterns
            found, self.subjective, self.objective = self.pb_subj.classify(sentence)
            # Uncomment the two following to bootstrap further the subjective
            # sentences detected from the pattern-based classifier.
            # if self.subjective:
            #    self.learn_patterns_from(sentence)
        if self.subjective:
            return 'subjective'
        elif self.objective:
            return 'objective'
        else:
            return None
    
    def learn_patterns_from(self, sentence):
        """
            Learns extraction patterns associated with subjectivity
            from a given sentence.
        """
        tagged_sentence = self.tagger.tag(sentence)
        tags = []
        words = []
        if self.debug:
            print Tcolors.ACT + " Performing part of speech (POS) tagging..." + Tcolors.WARNING 
            print tagged_sentence
            print Tcolors.ENDC
        for (w,tag) in tagged_sentence:
            if tag is None:
                tag = ""
            tags.append(tag)
            words.append(w)

        self.trigger_patterns(tags, words)
                
    def match_until_next_nn(self, i, tags, words, form, key):
        """
            The hard job for triggering the syntactic forms :-)
        """
        LIMITER = 4
        BE = ['was','were','be','being','am','been','are','is']
        HAVE = ['has','have','had']
        matched = 0
        prev_matched = 0 
        positions_matched = []
        learned_pattern = []
        star = False 

        for j,ctag in enumerate(form):
            next = i + j + 1
            inner = 0 
            found = False
            while(not found and next < len(tags)):
                next += inner
                if next < len(words) and ctag == "VB*" and words[next] in HAVE:
                    next += 1
                    if next < len(words) and ctag == "VB*" and words[next] in BE:
                        next += 1
                elif next < len(words) and ctag == "VB*" and words[next] in BE:
                    next += 1
                if ctag == "*":
                    star = True  
                elif ctag.find("*") > -1:
                    ortags = ctag.split("|")
                    for ortag in ortags:
                        if next < len(tags) and tags[next].find(ortag.replace("*","")) > -1\
                           and next not in positions_matched: 
                            if star and inner < 2: 
                                matched += 1
                            matched += 1 
                            positions_matched.append(next) 
                            found = True
                elif ctag == "BE":  
                    if next < len(tags) and (tags[next].find("VB") > -1 or tags[next].find("BE") > -1) \
                       and words[next] in BE and next not in positions_matched: 
                        matched += 1 
                        positions_matched.append(next)
                        found = True
                elif ctag == "HAVE":
                    if next < len(tags) and (tags[next].find("VB") > -1 or tags[next].find("HV") > -1)\
                       and words[next] in HAVE and next not in positions_matched:
                            matched += 1 
                            positions_matched.append(next)  
                            found = True                 
                elif next < len(tags) and tags[next].find(ctag) > -1\
                     and next not in positions_matched: 
                    matched += 1
                    positions_matched.append(next)
                    found = True
                else:
                    found = True
                inner += 1
                
        if key == "subj":
            learned_pattern = ["<subj>"] 
        for pos in positions_matched:
            learned_pattern.append(words[pos])
        if key != "subj":
            learned_pattern.append("<" + key +">")
        
        learned_pattern = " ".join(learned_pattern)    
         
        if matched == len(form):
            if self.debug:
                print Tcolors.ACT + Tcolors.RED + " Form triggered: ", form, Tcolors.ENDC
                print "Pattern learned:", learned_pattern
            return True, learned_pattern
        else:
            return False, None
                
    def proccess_learned_pattern(self, pattern):
        """
            Add pattern to learned patterns if it doesn't exist else
            update its probability.
        """ 
        if pattern.find("subj") > -1:
            key = "subj"
        elif pattern.find("dobj") > -1:
            key = "dobj"
        else:
            key = "np"
        cur_subj_freq = 0
        if self.subjective:
            cur_subj_freq = 1 
        pkey = pattern
        pkey = re.sub(r"<subj> | <np>| <dobj>","",pkey) 
        if self.learned_patterns.has_key(pattern):
            subj_freq = self.learned_patterns[pattern]['subj_freq'] + cur_subj_freq
            freq = self.learned_patterns[pattern]['freq'] + 1
            prob = (float)(subj_freq)/(float)(freq)
            self.learned_patterns[pattern]['prob'] = prob
            self.learned_patterns[pattern]['subj_freq'] = subj_freq
            self.learned_patterns[pattern]['freq'] = freq
            if self.debug: print Tcolors.ADD + Tcolors.HEADER + " Updating pattern:", pattern, Tcolors.ENDC  
        else:
            subj_freq = 0
            freq = 1
            subj_freq += cur_subj_freq
            prob = (float)(subj_freq)/(float)(freq)
            self.learned_patterns[pattern] = {'type': key,
                                           'display': pkey,
                                           'freq' : freq,
                                           'subj_freq' : subj_freq,
                                           'prob' : prob}  
            if self.debug: print Tcolors.ADD + Tcolors.CYAN + " Learning pattern:", pattern, Tcolors.ENDC  
            
    def store_knowledge(self): 
        """
            Stored learned patterns for future usage.
        """
        output = open(self.filename, 'wb')
        pickle.dump(self.learned_patterns, output)
        
        
    def trigger_patterns(self, tags, words):
        """
            Method that triggers syntactic forms and returns the learned 
            patterns from the triggering.
        """   
        patterns = []
        if self.debug: print Tcolors.ACT + " Triggering subjective syntactic forms..." 
        for key in self.syntactic_forms.keys():
            syntactic_forms = self.syntactic_forms[key]
            if self.debug: print Tcolors.PROC + Tcolors.GRAY + " Checking form group " + key + "..." + Tcolors.ENDC
            
            for form in syntactic_forms:  
                for i,tag in enumerate(tags): 
                    if tag.find("NN") > -1 or tag.find("NP") > -1 \
                       or tag.find("PR") > -1: 
                        triggered, pattern = self.match_until_next_nn(i, tags, words, form, key) 
                        if pattern is not None and pattern not in patterns:
                            if self.debug: print Tcolors.ACT + Tcolors.RED + " Form triggered: ", form, Tcolors.ENDC
                            patterns.append(pattern)
        for pattern in patterns:
            self.proccess_learned_pattern(pattern)
        if self.debug:
            print Tcolors.OKBLUE
            print self.learned_patterns    
            print Tcolors.ENDC
        self.store_knowledge()

    def train(self, data):
        """
            Method to train the pattern-based classifier
        """
        for sentence in data:
            self.classify(sentence)  
    
    def clear_learned_data(self):
        self.learned_patterns = {}

                  
if __name__ == "__main__":
    from hp_classifiers import HpObj, HpSubj
    from pos import SequentialTagger
    hp_obj = HpObj()
    hp_subj = HpSubj()  
    tagger = SequentialTagger()
    bootstrapping = Bootstrapping(hp_obj, hp_subj, tagger)
    if self.debug:
        print bootstrapping.classify(sys.argv[1])

ImportError: No module named terminal_colors

In [4]:
class Tcolors:
    """
        ANSI escape sequences used to colour terminal output, plus a few
        ready-made bracketed status tags built from them.
    """
    # Plain colour / attribute codes
    HEADER = '\033[1;95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[1;92m'
    WARNING = '\033[1;93m'
    W = '\033[1;37m'
    GRAY = W
    BGGRAY = '\033[1;37;40m'
    BG = '\033[1;30;47m'
    BGH = '\033[1;40;41m'
    FAIL = '\033[91m'
    RED = '\033[1;91m'
    ENDC = '\033[0m'
    CYAN = '\033[1;36m'
    INF = '\033[1;90m'
    C = ENDC

    # Status tags: a white bracket pair around a coloured marker, followed
    # by a reset.
    ACT = "%s[%s*%s%s]%s" % (W, RED, ENDC, W, C)
    PROC = "%s[%s*%s%s]%s" % (W, OKBLUE, ENDC, W, C)
    ADD = "%s[%s+%s%s]%s" % (W, WARNING, ENDC, W, C)
    RES = "%s[%sx%s%s]%s" % (W, OKGREEN, ENDC, W, C)
    INFO = "%s[%sINFO:%s%s]%s" % (W, OKBLUE, ENDC, W, C)
    OK = "%s[ %sOK%s%s ]%s" % (W, OKGREEN, ENDC, W, C)

    def disable(self):
        """
            Blank out the basic colour codes on this instance. The class
            attributes and the pre-built status tags are left untouched.
        """
        for code in ('HEADER', 'OKBLUE', 'OKGREEN', 'WARNING', 'FAIL', 'ENDC'):
            setattr(self, code, '')

In [6]:
import nltk
import sys
import os
import pickle
import numpy as np
from PyML import svm, ker, featsel
from PyML.containers.vectorDatasets import SparseDataSet, VectorDataSet
from PyML.classifiers.composite import Chain, FeatureSelect 
from scrapy.conf import settings
from terminal_colors import Tcolors
from PyML.classifiers.svm import loadSVM

class SvmClassifier:
    """
    SVM classifier: Performing training and prediction of sentiment class.
    """
    def __init__(self, lexicon, C=1, num_features=100):
        self.training_set = None
        self.classes = None 
        self.test_set = None
        self.results = None
        self.kernel = ker.Linear()
        self.C = C  
        self.feature_data = PATH + "/learning/stored/feature.data"
        self.label_data = PATH + "/learning/stored/svm_label.data"
        self.lexicon = lexicon
        self.num_features = len(self.lexicon.words.keys())
        try:
            print "Loading existing SVM..."
            features = pickle.load(open(self.feature_data))
            labels = pickle.load(open(self.label_data))
            sparsedata = SparseDataSet(features, L=labels) 
            self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier",sparsedata)
        except Exception as e:
            print e
            print "Existing SVM not found!"
            self.svm_classifier = svm.SVM(self.kernel)
        self.accuracy = None
        self.predicted_labels = None
        score = featsel.FeatureScore('golub')
        self.filter = featsel.Filter(score)
        self.feature_selector = FeatureSelect(self.svm_classifier, self.filter)
        self.chain = Chain([self.feature_selector, self.svm_classifier])
        
    def classify(self, sentences, labels):
        self.test_set = self.compute_features(sentences)
        print
        print Tcolors.ACT + " Classifying instance with SVM: " + Tcolors.RED + sentences[0] + Tcolors.C
        print Tcolors.HEADER
        test_data = SparseDataSet(self.test_set, L=labels)
        self.results = self.svm_classifier.test(test_data)
        print Tcolors.C 
        return self.results
    
    def compute_features(self, sentences): 
        features = [] 
        for i,sent in enumerate(sentences):
            sent = sent.lower()
            words = nltk.word_tokenize(sent)     
            feature = np.zeros(self.num_features) 
            for word in words:
                if word.lower() in self.lexicon.words.keys():
                    feature[self.lexicon.words.keys().index(word)] = 1
            features.append(feature) 
        return features  
    
    def initialize_lexicon(self):
        pass
    
    def print_stats(self): 
        print "[*] SVM Classifier ACCURACY: ", self.accuracy
        print "[*] SVM Classifier PREDICTED_LABEL: ", self.predicted_labels[0]
    
    def stats(self):
        self.accuracy = self.results.getSuccessRate()
        self.predicted_labels = self.results.getPredictedLabels()        
    
    def save(self,data,features,labels):
        output = open(self.feature_data ,'wb')
        pickle.dump(features,output)
        output.close()
        output = open(self.label_data,'wb')
        pickle.dump(labels,output)
        output.close()
        self.svm_classifier.save(PATH + "/learning/stored/svm.classifier")
        
    def train(self, training_set, labels):
        print Tcolors.ACT + " Training SVM with chaining..."
        features = self.compute_features(training_set) 
        data = SparseDataSet(features, L=labels) 
        print Tcolors.CYAN
        self.training_set = data 
        self.svm_classifier.train(data)     
        self.save(data,features,labels)
        print Tcolors.C

ImportError: No module named PyML

In [7]:
import nltk
import sys
import os
import pickle
import numpy as np
from PyML import svm, ker, featsel
from PyML.containers.vectorDatasets import SparseDataSet, VectorDataSet
from PyML.classifiers.composite import Chain, FeatureSelect 
from scrapy.conf import settings
from terminal_colors import Tcolors
from PyML.classifiers.svm import loadSVM

class SvmClassifier:
    """
    SVM classifier: Performing training and prediction of sentiment class.
    """
    def __init__(self, lexicon, C=1, num_features=100):
        self.training_set = None
        self.classes = None 
        self.test_set = None
        self.results = None
        self.kernel = ker.Linear()
        self.C = C  
        self.feature_data = PATH + "/learning/stored/feature.data"
        self.label_data = PATH + "/learning/stored/svm_label.data"
        self.lexicon = lexicon
        self.num_features = len(self.lexicon.words.keys())
        try:
            print "Loading existing SVM..."
            features = pickle.load(open(self.feature_data))
            labels = pickle.load(open(self.label_data))
            sparsedata = SparseDataSet(features, L=labels) 
            self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier",sparsedata)
        except Exception as e:
            print e
            print "Existing SVM not found!"
            self.svm_classifier = svm.SVM(self.kernel)
        self.accuracy = None
        self.predicted_labels = None
        score = featsel.FeatureScore('golub')
        self.filter = featsel.Filter(score)
        self.feature_selector = FeatureSelect(self.svm_classifier, self.filter)
        self.chain = Chain([self.feature_selector, self.svm_classifier])
        
    def classify(self, sentences, labels):
        self.test_set = self.compute_features(sentences)
        print
        print Tcolors.ACT + " Classifying instance with SVM: " + Tcolors.RED + sentences[0] + Tcolors.C
        print Tcolors.HEADER
        test_data = SparseDataSet(self.test_set, L=labels)
        self.results = self.svm_classifier.test(test_data)
        print Tcolors.C 
        return self.results
    
    def compute_features(self, sentences): 
        features = [] 
        for i,sent in enumerate(sentences):
            sent = sent.lower()
            words = nltk.word_tokenize(sent)     
            feature = np.zeros(self.num_features) 
            for word in words:
                if word.lower() in self.lexicon.words.keys():
                    feature[self.lexicon.words.keys().index(word)] = 1
            features.append(feature) 
        return features  
    
    def initialize_lexicon(self):
        pass
    
    def print_stats(self): 
        print "[*] SVM Classifier ACCURACY: ", self.accuracy
        print "[*] SVM Classifier PREDICTED_LABEL: ", self.predicted_labels[0]
    
    def stats(self):
        self.accuracy = self.results.getSuccessRate()
        self.predicted_labels = self.results.getPredictedLabels()        
    
    def save(self,data,features,labels):
        output = open(self.feature_data ,'wb')
        pickle.dump(features,output)
        output.close()
        output = open(self.label_data,'wb')
        pickle.dump(labels,output)
        output.close()
        self.svm_classifier.save(PATH + "/learning/stored/svm.classifier")
        
    def train(self, training_set, labels):
        print Tcolors.ACT + " Training SVM with chaining..."
        features = self.compute_features(training_set) 
        data = SparseDataSet(features, L=labels) 
        print Tcolors.CYAN
        self.training_set = data 
        self.svm_classifier.train(data)     
        self.save(data,features,labels)
        print Tcolors.C

ImportError: No module named PyML

In [8]:
import sys
import nltk
from lexicon import Lexicon    
from stemming.porter2 import stem

class HpSubj:
    """
        High precision subjective sentence classifier which uses an annotated
        lexicon of words as features. It classifies a sentence as subjective
        if it contains two or more of the strong subjective clues.
    """

    def __init__(self, debug=False):
        """
            debug: stored on the instance; not used by classify().
        """
        self.dictionary = Lexicon().words
        self.debug = debug

    def classify(self, sentence):
        """
            Return True when at least two tokens of the sentence are strong
            subjective clues, False otherwise.

            A token is a clue when its lower-cased form or its stem is
            listed in the lexicon with type 'strongsubj'.
        """
        wdict = self.dictionary
        strong_subjective_words_count = 0
        for word in nltk.word_tokenize(sentence):
            word = word.lower()
            # Each token counts at most once. Counting the surface form and
            # the stem separately let a single clue whose stem equals itself
            # reach the threshold of two on its own.
            for w in (word, stem(word)):
                if w in wdict and wdict[w]['type'] == 'strongsubj':
                    strong_subjective_words_count += 1
                    break
            if strong_subjective_words_count >= 2:
                # Threshold reached; no need to scan the rest.
                return True
        return False
    

class HpObj:
    """
        High precision objective sentence classifier which uses an annotated 
        lexicon as training data. It classifies a sentence as objective if it
        doesn't contain along with its previous and next sentence, not even 
        one strong subjective clue and at most one weak subjective clue.
    """
    
    def __init__(self, debug=False): 
        """
            debug -- when true, classify() prints the sentences it receives.
        """
        self.lexicon = Lexicon()
        self.dictionary = self.lexicon.words
        self.debug = debug
    
    def classify(self, current, previous="", next=""):
        """
            Returns True when ``current``, taken together with ``previous``
            and ``next``, holds no strong subjective clue and at most one
            weak subjective clue; returns False otherwise.

            current  -- sentence being classified.
            previous -- sentence before it (empty string when absent).
            next     -- sentence after it (empty string when absent).

            Every token is lower-cased and looked up in the lexicon both as
            written and in its stemmed form. A token counts as one clue at
            most, so a single weak clue whose stem is also listed (or equals
            the word itself) is not counted twice.
        """
        if self.debug:
            # Single-argument prints emit the same text on Python 2 and 3.
            print("")
            print("current: %s" % (current,))
            print("previous: %s" % (previous,))
            print("next: %s" % (next,))
            print("")
        wdict = self.dictionary
        words = nltk.word_tokenize(current)
        words += nltk.word_tokenize(previous) + nltk.word_tokenize(next)
        weak_subjective_words_count = 0
        
        for word in words: 
            word = word.lower()
            clue_types = [wdict[w]['type']
                          for w in (word, stem(word)) if w in wdict]
            if 'strongsubj' in clue_types:
                # One strong clue is enough to rule out objectivity.
                return False
            if 'weaksubj' in clue_types:
                weak_subjective_words_count += 1
                if weak_subjective_words_count > 1:
                    return False
        return True
 

if __name__ == '__main__': 
    # Command line entry point: classifies the sentence given as the first
    # argument with both high precision classifiers and prints the verdicts.
    objective_classifier = HpObj()
    subjective_classifier = HpSubj()
    sentence = sys.argv[1]
    print("Objective: " + str(objective_classifier.classify(sentence)))
    print("Subjective: " + str(subjective_classifier.classify(sentence)))

ImportError: No module named lexicon

In [9]:
import pickle 
import os
import sys 
from datasets.emoticons_patch import patch_emoticons 

class Lexicon:
    """
        Lexicon class loads an annotated dataset of words
        that have strong/weak subjectivity and is used to 
        train the high precision objective and subjective 
        classifiers.
    """
       
    def __init__(self, filename="stored/lexicon",
                 corpus="datasets/subjclueslen1-HLTEMNLP05.tff"):
        """
            Loads the word dictionary from the pickled cache; when the cache
            cannot be read the dictionary is rebuilt from the annotated
            corpus and the cache is written again.

            filename -- path of the pickled cache.
            corpus   -- path of the annotated corpus read by load().

            An error raised while rebuilding or while writing the cache is
            propagated to the caller.
        """
        self.filename = filename
        self.corpus = corpus
        try:
            # NOTE: pickle executes whatever the file describes, so the cache
            # must be a trusted, locally produced file. It is written in
            # binary mode below and therefore read in binary mode here.
            with open(self.filename, 'rb') as cached:
                self.words = pickle.load(cached)
        except Exception:
            # Missing or unreadable cache: rebuild it. KeyboardInterrupt and
            # SystemExit are deliberately not swallowed.
            self.words = {}
            self.load()
            with open(self.filename, 'wb') as output:
                pickle.dump(self.words, output)

    @staticmethod
    def _parse_line(line):
        """
            Splits one corpus line of whitespace separated ``key=value``
            tokens into ``(word, attributes)``.

            ``word`` is the value of the first ``word1`` token, or None when
            the line has none. ``attributes`` maps every other key to its
            value; the value of ``pos1`` is wrapped in a list. Tokens without
            an ``=`` are ignored.
        """
        word = None
        attributes = {}
        for token in line.split():
            pieces = token.split("=")
            if len(pieces) < 2:
                continue
            key, value = pieces[0], pieces[1]
            if word is None and key == "word1":
                word = value
            elif key == "pos1":
                attributes[key] = [value]
            else:
                attributes[key] = value
        return word, attributes

    def _register(self, word, attributes):
        """
            Stores the attributes of a word. The first occurrence of a word
            defines its entry; later occurrences only contribute their part
            of speech to the ``pos1`` list of that entry.
        """
        entry = self.words.get(word)
        if entry is None:
            self.words[word] = attributes
        elif "pos1" in attributes:
            entry.setdefault("pos1", []).extend(attributes["pos1"])
        
    def load(self):
        """
            The method loads the annotated corpus and extracts the structure
            with easy access for the classifiers.

            Lines without a ``word1`` token are skipped. Afterwards the
            result of patch_emoticons() is merged in; entries read from the
            corpus take precedence over patched ones with the same key.
        """
        with open(self.corpus, "r") as dictionary_file:
            for line in dictionary_file:
                word, attributes = self._parse_line(line)
                if word is not None:
                    self._register(word, attributes)

        merged = dict(patch_emoticons())
        merged.update(self.words)
        self.words = merged

ImportError: No module named datasets.emoticons_patch