# **Twitter NLP Draft - ODIN**

Data dikumpulkan menggunakan package *tweepy* dengan kata kunci *matematika*. Total tweet yang terkumpul sebanyak **2.500 tweet**.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("../input/iykra-odin/tweet_matematika.csv")
df.head()

In [None]:
# Work on an explicit copy: new columns are assigned to df_tweet further down,
# and assigning to a slice of df triggers pandas' SettingWithCopy behaviour.
df_tweet = df[['text']].copy()

In [None]:
df_tweet

In [None]:
df_tweet['length'] = df_tweet['text'].apply(len)
df_tweet.head()

In [None]:
df_tweet['length'].plot(bins=100, kind='hist') 

In [None]:
df_tweet.describe()

In [None]:
df_tweet[df_tweet['length'] == 31]['text'].iloc[0]

# Buang hashtag, link, emoji, dan gambar

In [None]:
!pip install tweet-preprocessor

In [None]:
import preprocessor as p

In [None]:
# Sebelum diclean
df_tweet['text'][0]

In [None]:
#Setelah clean
p.clean(df_tweet['text'][0])

In [None]:
p.clean(df_tweet['text'][0]).replace('/', ' ')

In [None]:
# Replace every newline inside a tweet with a single space.
df_tweet['text'] = df_tweet['text'].map(lambda tweet: tweet.replace('\n', ' '))

In [None]:
# Run tweet-preprocessor's clean() over each tweet, then turn every "/" into
# a space so the words on either side stay separate tokens.
cleaned_tweets = df_tweet['text'].apply(p.clean)
df_tweet['tweet_pre'] = cleaned_tweets.map(lambda tweet: tweet.replace('/', ' '))

In [None]:
df_tweet['length_pre'] = df_tweet['tweet_pre'].apply(len)

In [None]:
df_tweet.head(10)

In [None]:
# Value at row position 122, column position 0 (the raw 'text' column).
# A single iloc[row, col] lookup avoids integer-indexing a string-labelled
# row Series, which relies on a deprecated positional fallback.
df_tweet.iloc[122, 0]

In [None]:
# Value at row position 122, column position 2 (expected to be 'tweet_pre'
# given the order in which the columns were added). A single iloc[row, col]
# lookup avoids the deprecated positional fallback of Series[int].
df_tweet.iloc[122, 2]

## Ganti kata slang

Gunakan regex dengan *word boundary* (`\b`) agar hanya kata utuh yang diganti: https://www.regular-expressions.info/wordboundaries.html

In [None]:
import re

# Load the colloquial-Indonesian lexicon and turn it into a
# {regex pattern: formal word} mapping for Series.replace(..., regex=True).
slang = pd.read_csv("../input/iykra-odin/colloquial-indonesian-lexicon.csv")
slang = dict(zip(slang['slang'], slang['formal']))
# Escape each slang term so characters such as "?", "+" or "(" are matched
# literally instead of being parsed as regex syntax; the surrounding \b...\b
# restricts matches to whole words. str() keeps non-string keys formattable,
# as str.format did before.
slang = {r"\b{}\b".format(re.escape(str(k))): v for k, v in slang.items()}
slang

In [None]:
# Substitute slang with its formal form: with regex=True each key of `slang`
# is used as a regular expression and its value as the replacement.
# NOTE(review): matching is case-sensitive and 'tweet_pre' has not been
# lower-cased in the cells above, so capitalised slang is presumably left
# untouched; confirm whether that is intended.
df_tweet['tweet_form'] = df_tweet['tweet_pre'].replace(slang, regex=True)

In [None]:
df_tweet.head(10)

In [None]:
# Flatten every tweet into lower-cased, whitespace-separated tokens: once for
# the text before slang correction and once for the text after it. Iterating
# the column values directly avoids label lookups by integer position.
words_lst_before = [
    word.lower()
    for tweet in df_tweet['tweet_pre']
    for word in tweet.split()
]
words_lst_after = [
    word.lower()
    for tweet in df_tweet['tweet_form']
    for word in tweet.split()
]

# A set keeps one entry per distinct token, so its size is the vocabulary size.
print("Total unique vocab sebelum koreksi slang:", len(set(words_lst_before)))
print("Total unique vocab setelah koreksi slang:", len(set(words_lst_after)))

## Buang tanda baca dan stopwords

In [None]:
import string
string.punctuation

In [None]:
!pip install Sastrawi

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()
print(stopwords)

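Catatan: jika tanda baca langsung dihapus, kata-kata di sekitarnya ikut menyatu. Karena itu tanda baca diganti dengan spasi, bukan dihapus:

- cita-cita -> citacita, seharusnya cita cita
- bisa-bisanya -> bisabisanya, seharusnya bisa bisanya
- Saya mau pergi ke pasar.Tapi -> pasartapi, seharusnya pasar Tapi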

In [None]:
message = 'bisa-bisanya'
# Swap each punctuation character for a space rather than dropping it, so the
# two halves of a hyphenated word remain separate tokens.
blk = ''.join(" " if char in string.punctuation else char for char in message)
print(blk)

In [None]:
# Translation table mapping every ASCII punctuation character to a space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def message_cleaning(message, stop_words=None):
    """Replace punctuation with spaces and drop stop words from *message*.

    Punctuation is turned into a space (not deleted) so that text such as
    "bisa-bisanya" or "pasar.Tapi" splits into separate words instead of
    fusing into one token.

    Args:
        message: Text to clean.
        stop_words: Collection of lower-case words to remove. Defaults to the
            notebook-level ``stopwords`` list when omitted.

    Returns:
        The remaining words, in their original casing, joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords
    without_punct = message.translate(_PUNCT_TO_SPACE)
    # Words are compared lower-cased, but the original casing is kept.
    kept_words = [
        word for word in without_punct.split() if word.lower() not in stop_words
    ]
    return " ".join(kept_words)

In [None]:
df_tweet['tweet_clean'] = df_tweet['tweet_form'].apply(message_cleaning)

In [None]:
df_tweet.head()

In [None]:
df_tweet['length_clean'] = df_tweet['tweet_clean'].apply(len)

In [None]:
df_tweet['length_clean'].plot(bins=100, kind='hist') 

In [None]:
df_tweet['text'][124]

In [None]:
df_tweet['tweet_pre'][124]

In [None]:
df_tweet['tweet_form'][124]

In [None]:
df_tweet['tweet_clean'][124]

In [None]:
# Lower-cased tokens of every cleaned tweet; iterating the column values
# directly avoids label lookups by integer position.
words_lst_clean = [
    word.lower()
    for tweet in df_tweet['tweet_clean']
    for word in tweet.split()
]

# A set keeps one entry per distinct token, so its size is the vocabulary size.
print("Total unique vocab sebelum koreksi slang:", len(set(words_lst_before)))
print("Total unique vocab setelah koreksi slang:", len(set(words_lst_after)))
print("Total unique vocab setelah buang punctuation dan stopwords:", len(set(words_lst_clean)))

In [None]:
# Collapse any run of the same letter into a single letter (e.g. "bangeeet"
# -> "banget"). regex=True is passed explicitly: since pandas 2.0
# Series.str.replace defaults to regex=False, which would treat the pattern
# as a literal string and leave the text unchanged.
# NOTE(review): this also shortens legitimate double letters (e.g. "maaf" ->
# "maf"); confirm that this is acceptable.
df_tweet['tweet_regex'] = df_tweet['tweet_clean'].astype(str).str.replace(
    r'([a-zA-Z])\1+', r'\1', regex=True
)

# Lower-cased tokens of every tweet after collapsing repeated letters.
words_lst_regex = [
    word.lower()
    for tweet in df_tweet['tweet_regex']
    for word in tweet.split()
]

# A set keeps one entry per distinct token, so its size is the vocabulary size.
print("Total unique vocab sebelum koreksi slang:", len(set(words_lst_before)))
print("Total unique vocab setelah koreksi slang:", len(set(words_lst_after)))
print("Total unique vocab setelah buang punctuation dan stopwords:", len(set(words_lst_clean)))
print("Total unique regex:", len(set(words_lst_regex)))

# Count Vectorizer (Bag of Words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
corpus_1 = [x.lower() for x in df_tweet['tweet_regex']]

In [None]:
# The standard-library re module is enough for this pattern; importing the
# third-party `regex` package under the name `re` would shadow it.
import re

# Remove every ASCII digit from each document before vectorising.
pattern = r'[0-9]'
new_corpus = [re.sub(pattern, '', document) for document in corpus_1]

In [None]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(new_corpus)
print(bow_model)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

In [None]:
# Column labels are the vocabulary terms. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back only on
# releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    bow_columns = vectorizer.get_feature_names_out()
else:
    bow_columns = vectorizer.get_feature_names()

# Dense document-term matrix: one row per tweet, one column per term.
bow_model_df = pd.DataFrame(bow_model.toarray(), columns=bow_columns)
bow_model_df

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(new_corpus)
print(tfidf_model)

In [None]:
print(tfidf_model.toarray())

In [None]:
# Column labels are the vocabulary terms. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back only on
# releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_columns = vectorizer.get_feature_names_out()
else:
    tfidf_columns = vectorizer.get_feature_names()

# Dense TF-IDF matrix: one row per tweet, one column per term.
tfidf_model_df = pd.DataFrame(tfidf_model.toarray(), columns=tfidf_columns)
# Show a small window of the (mostly zero) matrix.
tfidf_model_df.iloc[190:200,600:650]

# POS Tagging

In [None]:
!pip install flair

In [None]:
# Load the UD_INDONESIAN corpus through flair's NLPTaskDataFetcher.
# NOTE(review): flair.data_fetcher looks like a legacy entry point that newer
# flair releases replace with flair.datasets; confirm against the installed
# flair version.
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_INDONESIAN)

In [None]:
# Build the tag dictionary for the chosen tag type from the loaded corpus.
tag_type = 'upos' # could also be 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [None]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
embedding_types: List[TokenEmbeddings] = [
 WordEmbeddings('id-crawl'),
 WordEmbeddings('id'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
from flair.models import SequenceTagger
tagger: SequenceTagger = SequenceTagger(hidden_size=128,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [None]:
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-universal-pos',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10)

In [None]:
from flair.data import Sentence
sentence = Sentence('saya dan dia kemarin pegi ke pasar bersama untuk membeli jeru')
tag_pos = SequenceTagger.load('resources/taggers/example-universal-pos/best-model.pt')
tag_pos.predict(sentence)
print(sentence.to_tagged_string())

In [None]:
sentence = Sentence(df_tweet['tweet_clean'][0])
tag_pos.predict(sentence)
print(sentence.to_tagged_string())

In [None]:
sentence = Sentence(df_tweet['tweet_clean'][590])
tag_pos.predict(sentence)
print(sentence.to_tagged_string())

# NER

https://yudanta.github.io/posts/train-an-indonesian-ner-from-a-blank-spacy-model/

In [None]:
import pickle
import spacy
import random
from spacy.util import minibatch, compounding
from spacy import load, displacy

In [None]:
# Load the pickled NER training examples. The cells below unpack each item as
# a (text, annotations) pair and read annotations.get("entities").
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load this file if its source is trusted.
with open('../input/iykra-odin/ner_spacy_fmt_datasets.pickle', 'rb') as f:
    ner_spacy_fmt_datasets = pickle.load(f)

In [None]:
# Start from a blank Indonesian ("id") pipeline, add an NER component to it,
# and initialise training.
# NOTE(review): add_pipe(create_pipe(...)) and begin_training() look like the
# spaCy v2 API; confirm against the installed spaCy version.
nlp=spacy.blank("id")
nlp.add_pipe(nlp.create_pipe('ner'))
nlp.begin_training()

In [None]:
# Handle to the NER component, used below to register entity labels.
ner=nlp.get_pipe("ner")
# Pipeline components that must stay enabled while training.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
# Every other component; the training cell passes these to nlp.disable_pipes.
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [None]:
# Register every entity label found in the training data with the NER
# component; the label is the element at index 2 of each entity.
# All entities of an example are visited (no early break), otherwise a label
# that never appears as the first entity of any example would not be added.
for _, annotations in ner_spacy_fmt_datasets:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [None]:
# Train the NER component with every other pipeline component disabled.
with nlp.disable_pipes(*unaffected_pipes):
    # Five passes over the training data.
    for iteration in range(5):
        # Reshuffle in place so each pass sees the examples in a new order.
        random.shuffle(ner_spacy_fmt_datasets)
        losses = {}

        # Split the examples into batches using spaCy's minibatch helper,
        # with batch sizes produced by compounding(4.0, 32.0, 1.001).
        batches = minibatch(
            ner_spacy_fmt_datasets,
            size=compounding(4.0, 32.0, 1.001),
        )
        for batch in batches:
            texts, annotations = zip(*batch)
            # drop=0.5 applies dropout, making it harder to memorise the data.
            nlp.update(texts, annotations, drop=0.5, losses=losses)

        print("Losses at iteration {}".format(iteration), losses)

In [None]:
doc = nlp(df_tweet['tweet_clean'][120])

print(doc.ents)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Run the NER pipeline over tweets 955..984 and print only those in which at
# least one entity was found.
for i in range(955, 985):
    doc = nlp(df_tweet['tweet_clean'][i])
    if not doc.ents:
        continue
    print("Isi Tweet:", df_tweet['tweet_clean'][i])
    print(doc.ents)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print(" ")