In [25]:
from twitterscraper import query_tweets
import codecs, json, csv
import pandas as pd
import random, re

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.externals import joblib
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory



In [26]:
def json_to_csv():
    """Convert the scraped tweet JSON into a two-column CSV.

    Reads ``input/twitter_test.json`` (a JSON list of tweet objects), keeps
    only each tweet's ``'user'`` and ``'text'`` fields, and writes them to
    ``output/twitter_test.csv`` under the column names ``name`` and ``text``.
    pandas also writes its row index as the first, unnamed column.

    Raises:
        KeyError: if a tweet object lacks the ``'user'`` or ``'text'`` key.
    """
    # json.load() no longer accepts an ``encoding`` argument (removed in
    # Python 3.9); the decoding is done by the file object instead.
    with open("input/twitter_test.json", 'r', encoding='utf-8') as f:
        tweets = json.load(f)

    # Only these two fields are ever written, so build the rows directly.
    # An empty tweet list simply yields a header-only CSV.
    rows = [[tweet['user'], tweet['text']] for tweet in tweets]

    df = pd.DataFrame(rows, columns=['name', 'text'])
    df.to_csv("output/twitter_test.csv")

In [48]:
def load_csv(data_size):
    """Draw a random sample of (name, text) rows from the tweet CSV.

    Args:
        data_size: Number of rows to draw from ``output/twitter_test.csv``.
            ``random.sample`` raises ``ValueError`` if this exceeds the
            number of rows available.

    Returns:
        A ``zip`` iterator that yields two tuples when unpacked: the sampled
        names, then the matching tweet texts, in the same order.
    """
    frame = pd.read_csv("output/twitter_test.csv")
    rows = list(zip(frame['name'], frame['text']))
    # Sample whole rows so every name stays aligned with its own text.
    chosen = random.sample(rows, data_size)
    return zip(*chosen)

In [28]:
def load_pos_tag():
    """Return the object deserialized from ``model/pos_tagger.joblib``."""
    # NOTE(review): joblib files are pickle-based; only load trusted model files.
    return joblib.load('model/pos_tagger.joblib')

In [29]:
def load_dict_vectorizer():
    """Return the object deserialized from ``model/dict_vectorizer.joblib``."""
    # NOTE(review): joblib files are pickle-based; only load trusted model files.
    return joblib.load('model/dict_vectorizer.joblib')

In [30]:
def load_classifier():
    """Return the object deserialized from ``model/classifier.joblib``."""
    # NOTE(review): joblib files are pickle-based; only load trusted model files.
    return joblib.load('model/classifier.joblib')

In [31]:
def load_count_vectorizer():
    """Return the object deserialized from ``model/count_vectorizer.joblib``."""
    # NOTE(review): joblib files are pickle-based; only load trusted model files.
    return joblib.load('model/count_vectorizer.joblib')

In [32]:
def preprocess_twitter(raw):
    """Strip links, @mentions and punctuation from a raw tweet.

    Args:
        raw: The tweet text as a string.

    Returns:
        The text with URLs and ``@name`` mentions removed and every character
        other than ASCII letters, digits, spaces and tabs deleted (so ``#tag``
        becomes ``tag``). Whitespace around removed items is kept, which can
        leave consecutive spaces.
    """
    # Remove http/https links up to the next whitespace only, so the words
    # that follow a link are kept. The earlier pattern ("http?" followed by
    # ".*") erased the whole rest of the line after an http:// link while
    # https:// links only lost the URL itself.
    without_links = re.sub(r"https?://\S+", '', raw)

    # Raw string avoids invalid-escape warnings. A single pass is sufficient:
    # afterwards only [0-9A-Za-z \t] remain, so nothing is left to match.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)")
    return noise.sub('', without_links)

In [33]:
def tokenizer(w):
    """Split the text ``w`` into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(w)


In [34]:
def preprocess_pos_tag(X):
    """Clean and tokenize every tweet in ``X``.

    Args:
        X: Sequence of raw tweet strings.

    Returns:
        A list with one token list per tweet, in input order. Each tweet is
        first passed through ``preprocess_twitter`` and then ``tokenizer``.
    """
    cleaned = [preprocess_twitter(X[pos]) for pos in range(len(X))]
    return [tokenizer(text) for text in cleaned]
# Quick check: run the cleaning + tokenizing step on one sample tweet and
# display the resulting token lists as the cell output.
X = ["'Tolong di Kalimantan Selatan di bantu ekonomi kerakyatan pak serta swasembada pangan Thanks"]
preprocess_pos_tag(X)

[['Tolong',
  'di',
  'Kalimantan',
  'Selatan',
  'di',
  'bantu',
  'ekonomi',
  'kerakyatan',
  'pak',
  'serta',
  'swasembada',
  'pangan',
  'Thanks']]

In [35]:
def neighbor_word(sentence, i):
    """Return the tokens adjacent to position ``i`` in ``sentence``.

    Args:
        sentence: Sequence of tokens.
        i: Index of the current token.

    Returns:
        ``{'prev_word': ..., 'next_word': ...}``, using an empty string where
        the token is first (no previous word) or last (no next word).
    """
    last_index = len(sentence) - 1
    before = sentence[i - 1] if i != 0 else ''
    after = sentence[i + 1] if i != last_index else ''
    return {'prev_word' : before, 'next_word' : after}

def morphems(word):
    """Return the leading and trailing character n-grams of ``word``.

    Args:
        word: The token as a string.

    Returns:
        A dict with ``prefix_1`` .. ``prefix_4`` (first 1-4 characters) and
        ``suffix_1`` .. ``suffix_3`` (last 1-3 characters). Words shorter than
        the requested length yield the whole word; an empty word yields empty
        strings for every key.
    """
    features = {}
    # Slices instead of word[0] / word[-1]: identical for non-empty words,
    # but an empty token no longer raises IndexError.
    for size in (1, 2, 3, 4):
        features['prefix_%d' % size] = word[:size]
    for size in (1, 2, 3):
        features['suffix_%d' % size] = word[-size:]
    return features

def has_hyphen(word):
    """Return ``{'has_hyphen': bool}`` telling whether ``word`` contains '-'."""
    contains_dash = '-' in word
    return dict(has_hyphen=contains_dash)

def is_digit(word):
    """Return ``{'is_digit': bool}`` from ``word.isdigit()``."""
    only_digits = word.isdigit()
    return dict(is_digit=only_digits)

def word_case(word):
    """Report whether the first character of ``word`` is unchanged by upper().

    Args:
        word: The token as a string.

    Returns:
        ``{'is_capitalized': bool}``. The flag is True when the first
        character equals its upper-cased form, which includes digits and
        punctuation as well as capital letters. An empty word gives False.
    """
    # Slice instead of word[0] so an empty token does not raise IndexError.
    first = word[:1]
    is_capitalized = first != '' and first.upper() == first
    return {'is_capitalized' : is_capitalized}


def word_position(sentence, index):
    """Encode where a token sits in its sentence as three small integers.

    Args:
        sentence: Sequence of tokens.
        index: Index of the current token.

    Returns:
        ``{'prev_pos': ..., 'pos': ..., 'next_pos': ...}`` where the values
        are -1, 0, 1 or 2. The checks run in this order and the first match
        wins: index 0, index == len(sentence), index 1, index ==
        len(sentence) - 1, anything else.
    """
    length = len(sentence)

    # Each tuple is (prev_pos, pos, next_pos).
    if index == 0:
        codes = (-1, 0, 1)
    elif index == length:
        codes = (1, 2, -1)
    elif index == 1:
        codes = (0, 1, 2)
    elif index == length - 1:
        codes = (1, 2, -1)
    else:
        codes = (1, 1, 1)

    return dict(zip(('prev_pos', 'pos', 'next_pos'), codes))

In [36]:
def feature_extractor(sentences):
    """Build one feature dict per token across all sentences.

    Args:
        sentences: Iterable of token sequences.

    Returns:
        A flat list of dicts, one per token in reading order. Each dict holds
        the token itself under ``'value'`` plus the keys produced by
        ``neighbor_word``, ``word_position``, ``morphems``, ``word_case``,
        ``has_hyphen`` and ``is_digit``.
    """
    rows = []
    for sentence in sentences:
        for idx in range(len(sentence)):
            token = sentence[idx]
            token_features = {'value': token}
            # Merge the partial feature dicts in a fixed order.
            for part in (
                neighbor_word(sentence, idx),
                word_position(sentence, idx),
                morphems(token),
                word_case(token),
                has_hyphen(token),
                is_digit(token),
            ):
                token_features.update(part)
            rows.append(token_features)
    return rows

In [37]:
def vectorize_features(features):
    """Transform feature dicts with the saved dict vectorizer.

    Loads the vectorizer via ``load_dict_vectorizer()`` on every call and
    returns the result of its ``transform(features)``.
    """
    return load_dict_vectorizer().transform(features)

In [38]:
def pairing_tag(tweets, tags):
    """Attach each predicted tag to its token, regrouped per tweet.

    Args:
        tweets: List of token sequences.
        tags: Flat indexable sequence of tags, one per token, in the same
            order as the tokens appear when the tweets are concatenated.

    Returns:
        A list with one entry per tweet, each a list of ``(token, tag)``
        tuples.
    """
    grouped = []
    offset = 0  # position in the flat ``tags`` sequence of the tweet's first token
    for tokens in tweets:
        count = len(tokens)
        grouped.append([(tokens[k], tags[offset + k]) for k in range(count)])
        offset += count
    return grouped

In [39]:
# Preprocessing Classifier

In [40]:
def stemWord(words):
    """Stem every word and join the results into one string.

    Args:
        words: Iterable of word strings.

    Returns:
        The stemmed words, each followed by a single space (so a non-empty
        result ends with a trailing space); an empty string for no words.
    """
    stemmer = StemmerFactory().create_stemmer()
    return ''.join(str(stemmer.stem(token)) + ' ' for token in words)

In [41]:
def preprocess_classifier(X):
    """Apply ``stemWord`` to each element of ``X`` and return the results as a list."""
    return [stemWord(X[pos]) for pos in range(len(X))]

In [42]:
def word_vectorizer(X):
    """Transform the texts in ``X`` with the saved count vectorizer.

    Loads the vectorizer via ``load_count_vectorizer()`` on every call and
    returns the result of its ``transform(X)``.
    """
    return load_count_vectorizer().transform(X)

In [43]:
def pairing_class(username, tweets, y):
    """Combine usernames, tweets and labels position by position.

    Args:
        username: Indexable sequence of user names.
        tweets: Iterable of tweets; it determines how many triples are built.
        y: Indexable sequence of labels.

    Returns:
        A list of ``(username[k], tweet, y[k])`` tuples in tweet order.
    """
    return [(username[k], tweet, y[k]) for k, tweet in enumerate(tweets)]

In [44]:
def pairing_all(pengaduan, paired_tweets):
    """Pair each item of ``pengaduan`` with the same-position item of ``paired_tweets``.

    Args:
        pengaduan: Iterable of items; it determines how many pairs are built.
        paired_tweets: Indexable sequence with at least as many items.

    Returns:
        A list of ``(item, paired_tweets[k])`` tuples.
    """
    return [(item, paired_tweets[k]) for k, item in enumerate(pengaduan)]

In [60]:
# Convert the scraped JSON into output/twitter_test.csv.
json_to_csv()

# Draw 10 rows at random (random.sample, unseeded), so the selected tweets
# differ between runs; then clean and tokenize each sampled tweet.
username, tweets = load_csv(10)
pre_tweets = preprocess_pos_tag(tweets)

In [61]:
# username = ['robby', 'kevin12', 'patz']
# tweets = ['@Pak Jokowi, mohon tutup tambang emas di Hutan Lindung Tumpang Pitu. http://forbanyuwangi.org/?p=60Â  #Banyuwangi #SaveTumpangPitu', 
#           'Apa tanggapan bpk @jokowi tentang video tsb ? Tolong di tindak pak....',
#           '@km_itb pilih saya menjadi presiden km berikutnya, akan saya turunkan beban sks kailan']

In [62]:
# Stem each tweet's tokens and rebuild one space-separated string per tweet.
prep_tweets = preprocess_classifier(pre_tweets)


In [63]:
# Vectorize the stemmed tweets and predict one label per tweet.
vector_tweets = word_vectorizer(prep_tweets)

classifier = load_classifier()

y = classifier.predict(vector_tweets)
paired_class = pairing_class(username, tweets, y)


# Keep the (username, tweet, label) triples whose label is 1; these are the
# tweets handled as complaints ("pengaduan") below.
pengaduan_list = []
pengaduan_tweet = []

for each in paired_class:
    if each[2] == 1:
        pengaduan_list.append(each)
        pengaduan_tweet.append(each[1])

if len(pengaduan_list) == 0:
    print("Tidak terdeteksi pengaduan")
    for each in paired_class:
        print(each)
else:
    # POS-tag the complaint tweets (on the raw text, not the stemmed text).
    pre_tweets = preprocess_pos_tag(pengaduan_tweet)
    featured_tweets = feature_extractor(pre_tweets)

    X = vectorize_features(featured_tweets)
    posTagger = load_pos_tag()

    tags = posTagger.predict(X)
    paired_tweets = pairing_tag(pre_tweets, tags)

    # Each item is ((username, tweet, label), [(token, tag), ...]).
    pair = pairing_all(pengaduan_list, paired_tweets)

    aduan = []
    for each in pair:
        Noun = []
        Verb = []
        Angka = []
        tagged = each[1]
        for i in range(len(tagged)):
            token, tag = tagged[i]
            if tag in ('NN', 'NNP', 'NND'):
                Noun.append(token)
            elif tag == 'VB':
                Verb.append(token)
            elif tag in ('CD', 'OD'):
                # A number is stored together with the token that follows it.
                # When the number is the last token there is nothing to
                # attach, so keep it alone instead of indexing past the end.
                if i + 1 < len(tagged):
                    Angka.append(token + ' ' + tagged[i + 1][0])
                else:
                    Angka.append(token)
        # Dict view of the same record; it is not used further in this cell.
        detection = ({ 'username' : each[0][0],
                     'Tindakan':Verb, 'Nominal': Angka,
                     'Keterangan' : Noun, 'tweet_lengkap':each[0][1]})
        aduan.append((each[0][0], Verb,  Angka, Noun, each[0][1]))

    # newline='' is what the csv module expects for files it writes (avoids
    # blank rows on Windows); utf-8 matches the encoding of the input data.
    with open('output/pengaduan.csv', 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(['username', 'Tindakan', 'Nominal', 'Keterangan', 'tweet'])
        for each in aduan:
            # The list-valued fields are written as their Python list repr.
            wr.writerow([each[0], each[1], each[2], each[3], each[4]])

aduan {'username': 'joinsputra', 'Tindakan': ['memiliki', 'menindak'], 'Nominal': [], 'Keterangan': ['bayar', 'pajak', 'badan', 'hukum', 'pt', 'mohon', 'kebijakan', 'pak', 'presiden', 'joko', 'widodo', 'utk', 'perbuatan'], 'tweet_lengkap': 'Selama ini bayar pajak & memiliki badan hukum pt, kami mohon atas kebijakan pak presiden joko widodo utk menindak tegas perbuatan itu'}
aduan {'username': 'Yayad6874', 'Tindakan': ['ada', 'siap', 'menjalankan'], 'Nominal': ['dua desa'], 'Keterangan': ['Kec', 'Tempat', 'tugas', 'desa', 'yg', 'sistim', 'padat', 'karya', 'cash', 'PD', 'mohon', 'petunjuk', 'teknis', 'petunjuk', 'pelaksaanx', 'pak'], 'tweet_lengkap': 'Di Kec. Tempat sy tugas ada dua desa yg sistim padat karya cash debagai PD siap menjalankan mohon petunjuk teknis dan petunjuk pelaksaanx pak'}
aduan {'username': 'cahmeger', 'Tindakan': [], 'Nominal': [], 'Keterangan': ['RI', 'Yth', 'Bapak', 'Presiden', 'Tolong', 'selamatkan', 'anak', 'bangsa', 'pengaruh', 'globalisasi', 'informasi'], 'twe