In [1]:
import pickle
import random

import numpy as np

# The star import is kept for backward compatibility. It used to be the only
# thing bringing the `nltk` name into scope, so `nltk` is now imported explicitly.
from nltk.sentiment.util import *
import nltk
from nltk.corpus import TwitterCorpusReader
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from gaft import GAEngine
from gaft.components import BinaryIndividual
from gaft.components import Population
from gaft.operators import FlipBitMutation
from gaft.operators import TournamentSelection
from gaft.operators import UniformCrossover

import language_check
# Shared US-English LanguageTool instance; the fitness function calls
# tool.check() on each candidate sentence and counts the returned errors.
tool = language_check.LanguageTool('en-US')

In [2]:
# Analysis plugin base class.
from gaft.plugin_interfaces.analysis import OnTheFlyAnalysis

# Built-in best fitness analysis.
from gaft.analysis.fitness_store import FitnessStore

In [3]:
#=========================================================================

In [4]:
# Training data: NLTK's bundled tweet samples.
# The corpus has to be downloaded first; see https://www.nltk.org/data.html.
from nltk.corpus import twitter_samples

# Raw tweet strings for each polarity file.
neg_tweet, pos_tweet = (
    twitter_samples.strings(fileids=corpus_file)
    for corpus_file in ('negative_tweets.json', 'positive_tweets.json')
)

In [5]:
def preprocessString(tweets, lemmatizer=None, stemmer=None):
    """Tokenize tweets, lower-case them and drop stopwords / non-alphabetic tokens.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    lemmatizer : object with a ``lemmatize(token)`` method, or None
        Applied to every surviving token when given.
    stemmer : object with a ``stem(token)`` method, or None
        Applied (after lemmatization) to every surviving token when given.

    Returns
    -------
    list of list of str
        One token list per input tweet, in input order.
    """
    tokenizer = TweetTokenizer()
    # Build the stopword set once instead of calling stopwords.words() and
    # scanning the resulting list for every single token.
    stop_words = set(stopwords.words('english'))

    processed = []
    for tweet in tweets:
        # The stopword test uses the lower-cased token so that capitalised
        # stopwords ("The", "I") are removed as well; isalpha() is evaluated
        # on the raw token, as before.
        tokens = [
            w.lower()
            for w in tokenizer.tokenize(tweet)
            if w.isalpha() and w.lower() not in stop_words
        ]
        if lemmatizer is not None:
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
        if stemmer is not None:
            tokens = [stemmer.stem(t) for t in tokens]
        processed.append(tokens)

    return processed

In [6]:
# Lemmatizer and stemmer handed to preprocessString for every token.
wnl = nltk.WordNetLemmatizer()
pstemmer = nltk.PorterStemmer()

# Pre-process each polarity separately, then pool them into one corpus.
neg_tweetPro = preprocessString(neg_tweet, wnl, pstemmer)
pos_tweetPro = preprocessString(pos_tweet, wnl, pstemmer)
docs = neg_tweetPro + pos_tweetPro

# Shuffle with a dedicated, seeded generator so that Restart & Run All yields
# the same document order every time, without touching the global RNG state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(docs)

In [7]:
# Bag-of-words vectorizer over the already-tokenized documents.
from sklearn.feature_extraction.text import CountVectorizer


def identity(x):
    """Return the argument unchanged (pass-through hook for CountVectorizer)."""
    return x


# Each document is already a token list, so both the preprocessing and the
# tokenizing hooks are pass-throughs.
vectorizer = CountVectorizer(tokenizer=identity, preprocessor=identity)
docsX = vectorizer.fit_transform(docs).toarray()

In [8]:
# Display the dense document-term count matrix built by the vectorizer.
docsX

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
# Vocabulary in column order of docsX.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# API when it exists and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

In [10]:
#=========================================================================

In [11]:
# Define population.
# One gene per vocabulary word for the weights plus one for the biases,
# repeated for every output slot; each gene ranges over [0, 1].
numWords = len(words)
outputWordLimit = 5
genesPerSlot = numWords * 2  # weight and bias
rngs = [(0, 1)] * (genesPerSlot * outputWordLimit)
indv_template = BinaryIndividual(ranges=rngs, eps=0.01)
population = Population(indv_template=indv_template, size=100).init()


In [12]:
# Create genetic operators.
# NOTE(review): pc / pe / pm are presumably the crossover, gene-exchange and
# mutation probabilities -- confirm against the gaft operator documentation.
selection = TournamentSelection(tournament_size=20)
crossover = UniformCrossover(pc=0.8, pe=0.5)
mutation = FlipBitMutation(pm=0.1)

In [13]:
# Create genetic algorithm engine.
# Wires the population and the three operators together; FitnessStore is
# passed as a class (not an instance) in the analysis list.
engine = GAEngine(population=population, selection=selection,
                  crossover=crossover, mutation=mutation,
                  analysis=[FitnessStore])

In [14]:
@engine.analysis_register
class ConsoleOutputAnalysis(OnTheFlyAnalysis):
    """Analysis plugin that logs the best fitness per step and the final optimum."""

    interval = 1
    master_only = True

    def register_step(self, g, population, engine):
        """Log generation number ``g`` together with the engine's ``ori_fmax``."""
        # The result is not used; the lookup is kept so the call sequence
        # matches the previous implementation exactly.
        population.best_indv(engine.fitness)
        self.logger.info(
            'Generation: {}, best fitness: {:.3f}'.format(g, engine.ori_fmax)
        )

    def finalize(self, population, engine):
        """Log the best individual's solution together with the engine's ``ori_fmax``."""
        champion = population.best_indv(engine.fitness)
        self.logger.info(
            'Optimal solution: ({}, {})'.format(champion.solution, engine.ori_fmax)
        )

In [17]:
@engine.fitness_register
def fitness(indv):
    """Score an individual by how grammatical its generated sentence is.

    The chromosome (``indv.solution``) is read as ``outputWordLimit``
    consecutive groups, each holding ``numWords`` weights followed by
    ``numWords`` biases. For every output slot the chosen vocabulary index is
    the argmax of ``bag_of_words * weights + biases``; the chosen words are
    joined with spaces and passed to the grammar checker.

    NOTE(review): only the LAST row of ``docsX`` influences the score. The
    previous implementation iterated over every document but overwrote the
    word choices on each pass, so all rows except the last were wasted work.
    The return value is unchanged here; confirm whether a per-document
    aggregate was actually intended.

    Returns
    -------
    float
        ``1.0`` when the checker reports no errors, minus ``0.1`` per error,
        floored at ``0.0`` for ten or more errors.
    """
    x = indv.solution

    # Index 0 for every slot mirrors the previous result when docsX is empty.
    wordChoice = [0] * outputWordLimit

    if len(docsX) > 0:
        input_bag = docsX[-1]
        for o in range(outputWordLimit):
            p_start = numWords * o * 2
            paramsw = np.array(x[p_start:p_start + numWords])
            paramsb = np.array(x[p_start + numWords:p_start + numWords * 2])
            wordChoice[o] = int(np.argmax(input_bag * paramsw + paramsb))

    sent = " ".join(words[c] for c in wordChoice)
    errors = tool.check(sent)
    # 10.0 forces true division even under Python 2 integer semantics.
    return (10 - min(10, len(errors))) / 10.0

In [None]:
# Run the genetic algorithm with ng=100; the registered fitness function and
# analysis plugins are invoked by the engine.
engine.run(ng=100)

In [None]:
#import language_check
#dir(language_check)
