In [1]:
import torch
import torchtext.data as ttd
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import random
gpu = ('cuda:0')

In [2]:
data = {
    "label" : [0,1,1],
    "data" : [
        "I Like Eggs and Ham.",
        "Eggs I Like!",
        "Ham and Eggs or Just Ham?"
    ]
}

In [3]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,label,data
0,0,I Like Eggs and Ham.
1,1,Eggs I Like!
2,1,Ham and Eggs or Just Ham?


In [4]:
df.to_csv('test.csv', index=False)

In [55]:
TEXT = ttd.Field(
    sequential=True
    ,batch_first=True
    ,lower=True
    ,tokenize='spacy'
    ,pad_first=True
)

LABEL = ttd.Field(
    sequential=False
    ,use_vocab=False
    ,is_target=True
)

In [6]:
dataset = ttd.TabularDataset(
    path='test.csv'
    ,format='csv'
    ,skip_header=True
    ,fields=[('label', LABEL), ('data', TEXT)]
)

In [7]:
ex = dataset.examples[1]

In [8]:
ex.data

['eggs', 'i', 'like', '!']

In [9]:
ex.label

'1'

In [10]:
train_dataset, test_dataset = dataset.split(0.66, random_state = random.seed(14))

In [11]:
TEXT.build_vocab(train_dataset)

In [12]:
vocab = TEXT.vocab

In [13]:
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f84bcf25820>>,
            {'<unk>': 0,
             '<pad>': 1,
             'ham': 2,
             'and': 3,
             'eggs': 4,
             '.': 5,
             '?': 6,
             'i': 7,
             'just': 8,
             'like': 9,
             'or': 10})

In [14]:
vocab.itos

['<unk>', '<pad>', 'ham', 'and', 'eggs', '.', '?', 'i', 'just', 'like', 'or']

In [15]:
len(vocab)

11

In [19]:
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset), sort_key=lambda x: len(x.data),
    batch_size=2, device=gpu
)

In [20]:
for inputs, targets in train_iter:
    print('inputs:', inputs, 'shape:', inputs.shape)
    print('targets:', targets, 'shape:', targets.shape)
    break

inputs: tensor([[ 1,  7,  9,  4,  3,  2,  5],
        [ 2,  3,  4, 10,  8,  2,  6]]) shape: torch.Size([2, 7])
targets: tensor([0, 1]) shape: torch.Size([2])


In [21]:
for inputs, targets in test_iter:
    print('inputs:', inputs, 'shape:', inputs.shape)
    print('targets:', targets, 'shape:', targets.shape)
    break

inputs: tensor([[4, 7, 9, 0]]) shape: torch.Size([1, 4])
targets: tensor([1]) shape: torch.Size([1])


In [22]:
batch_train = next(iter(train_iter))
doc_train, label_train = batch_train
batch_test = next(iter(test_iter))
doc_test, label_test = batch_test

In [23]:
doc_1 = doc_train[0] # first train doc
doc_2 = doc_train[1] # second train doc
doc_3 = doc_test[0] # first test doc
print(doc_1)
print(doc_2)
print(doc_3)

tensor([ 2,  3,  4, 10,  8,  2,  6])
tensor([1, 7, 9, 4, 3, 2, 5])
tensor([4, 7, 9, 0])


In [24]:
docs = []
for n in range (1,4):
    docs_ = []
    for i in range (len(globals()['doc_%s' % n])):
        x = globals()['doc_%s' % n][i].item()
        doc = str(TEXT.vocab.itos[x])
        docs_.append(doc)
    print(docs_)
    docs.append(docs_)

['ham', 'and', 'eggs', 'or', 'just', 'ham', '?']
['<pad>', 'i', 'like', 'eggs', 'and', 'ham', '.']
['eggs', 'i', 'like', '<unk>']


In [25]:
sentence_1 = docs[0]
sentence_1 = ' '.join(sentence_1)
sentence_2 = docs[1]
sentence_2 = ' '.join(sentence_2)
sentence_3 = docs[2]
sentence_3 = ' '.join(sentence_3)
print(sentence_1)
print(sentence_2)
print(sentence_3)

ham and eggs or just ham ?
<pad> i like eggs and ham .
eggs i like <unk>


In [133]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
from nltk import FreqDist
nltk.download('averaged_perceptron_tagger') # for pos_tag
nltk.download('maxent_ne_chunker') # for ner
nltk.download('words') # for ner

[nltk_data] Downloading package punkt to /home/rafifauzan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rafifauzan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/rafifauzan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/rafifauzan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/rafifauzan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/rafifauzan/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [110]:
print(stopword)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [186]:
lower = [i.lower() for i in data['data']]
print(lower)
print('')

tokenize = [nltk.word_tokenize(i) for i in lower]
print(tokenize)
print('')

without_stopwords = [[j for j in tokenize[i] if j not in stopword] for i in range(len(tokenize))]
# for i in range(len(tokenize)):
#     _without_stopwords = [j for j in tokenize[i] if j not in stopword]
#     without_stopwords.append(_without_stopwords)
print(without_stopwords)
print('')

lemmatize = [[lemmatizer.lemmatize(j) for j in without_stopwords[i]] for i in range(len(without_stopwords))]
print(lemmatize)
print('')

# character_freq = [[FreqDist(j) for j in lemmatize[i]] for i in range(len(lemmatize))]
# print(character_freq)
word_freq = [FreqDist(i) for i in lemmatize]
print(word_freq)
print('')

pos_tag = [nltk.pos_tag(i) for i in lemmatize]
print(pos_tag)
print('')
# CC, A COORDINATING CONJUNCTION; RB, AN ADVERBS; IN, A PREPOSITION; NN, A NOUN; AND JJ, AN ADJECTIVE.

test = ('who is Joko Widodo, Joko Widodo is the president of Republic Indonesia, what about Susilo Bambang Yudhoyono, he is the Republic Indonesia president before Jokowi')
tok = nltk.word_tokenize(test)
pos = nltk.pos_tag(tok) 
ner = nltk.ne_chunk(pos)
NE = [" ".join(w for w, t in ele) for ele in ner if isinstance(ele, nltk.Tree)]
print (NE)
NE2 = ["_".join(w for w, t in ele) for ele in ner if isinstance(ele, nltk.Tree)]
print (NE2)

['i like eggs and ham.', 'eggs i like!', 'ham and eggs or just ham?']

[['i', 'like', 'eggs', 'and', 'ham', '.'], ['eggs', 'i', 'like', '!'], ['ham', 'and', 'eggs', 'or', 'just', 'ham', '?']]

[['like', 'eggs', 'ham', '.'], ['eggs', 'like', '!'], ['ham', 'eggs', 'ham', '?']]

[['like', 'egg', 'ham', '.'], ['egg', 'like', '!'], ['ham', 'egg', 'ham', '?']]

[FreqDist({'like': 1, 'egg': 1, 'ham': 1, '.': 1}), FreqDist({'egg': 1, 'like': 1, '!': 1}), FreqDist({'ham': 2, 'egg': 1, '?': 1})]

[[('like', 'IN'), ('egg', 'NN'), ('ham', 'NN'), ('.', '.')], [('egg', 'NN'), ('like', 'IN'), ('!', '.')], [('ham', 'NN'), ('egg', 'NN'), ('ham', 'NN'), ('?', '.')]]

['Joko Widodo', 'Joko Widodo', 'Republic', 'Indonesia', 'Susilo Bambang Yudhoyono', 'Republic', 'Indonesia', 'Jokowi']
['Joko_Widodo', 'Joko_Widodo', 'Republic', 'Indonesia', 'Susilo_Bambang_Yudhoyono', 'Republic', 'Indonesia', 'Jokowi']


In [188]:
t = test.replace(NE[4], NE2[4])
t

'who is Joko Widodo, Joko Widodo is the president of Republic Indonesia, what about Susilo_Bambang_Yudhoyono, he is the Republic Indonesia president before Jokowi'

In [231]:
b = []
for i, j in enumerate(NE):
    a = b.append(f"(NE[{i}], NE2[{i}])")
print(b)

['(NE[0], NE2[0])', '(NE[1], NE2[1])', '(NE[2], NE2[2])', '(NE[3], NE2[3])', '(NE[4], NE2[4])', '(NE[5], NE2[5])', '(NE[6], NE2[6])', '(NE[7], NE2[7])']


In [253]:
c = []
for i in range(len(NE)):
    a = f"eval(b[{i}])"
    c.append(a)
print(c)


['eval(b[0])', 'eval(b[1])', 'eval(b[2])', 'eval(b[3])', 'eval(b[4])', 'eval(b[5])', 'eval(b[6])', 'eval(b[7])']


In [248]:
s = test
for r in (eval(b[0]), eval(b[4])):
    s = s.replace(*r)
print(s)

who is Joko_Widodo, Joko_Widodo is the president of Republic Indonesia, what about Susilo_Bambang_Yudhoyono, he is the Republic Indonesia president before Jokowi


In [216]:
import re
string_a = re.sub(NE[0]|NE[1]), NE2[0]|NE2[1], test)
print(string_a)

SyntaxError: unmatched ')' (<ipython-input-216-dd9ee159a183>, line 2)

In [219]:
string_a = re.sub(r'(cat|dog)', 'pet', 'PET', "Mark owns a dog and Mary owns a cat.")
print(string_a)

TypeError: 'str' object cannot be interpreted as an integer

In [209]:
a = [] 
for i in range(len(NE)):
    b = i
    a.append(b)
print(a)

[0, 1, 2, 3, 4, 5, 6, 7]


In [214]:
a[len(a)]

IndexError: list index out of range

In [180]:
test

'who is Joko Widodo, Joko Widodo is the president of Republic Indonesia'

In [46]:
corpus_test = data['data'][1]
corpus_test

'Eggs I Like!'

In [48]:
tokenize = nltk.sent_tokenize(corpus_test)
tokenize

['Eggs I Like!']

In [50]:
tokenize_word = nltk.word_tokenize(corpus_test)
tokenize_word

['Eggs', 'I', 'Like', '!']

In [1]:
import pandas as pd

In [69]:
abc ={
    'Review' : [
        'this is good stuff love it',
        'this is so bad why did i bought it',
        'man this is awfull'
        ],
    'Label' : [1,0,0]
    }
data = pd.DataFrame(abc)
data.head()

Unnamed: 0,Review,Label
0,this is good stuff love it,1
1,this is so bad why did i bought it,0
2,man this is awfull,0


In [83]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()
BOW = bow.fit_transform(data['Review'])
print(BOW.toarray())

[[0 0 0 0 1 1 1 1 0 0 1 1 0]
 [0 1 1 1 0 1 1 0 0 1 0 1 1]
 [1 0 0 0 0 1 0 0 1 0 0 1 0]]


In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
TFIDF = tfidf.fit_transform(data['Review'])
print(TFIDF.toarray())

[[0.         0.         0.         0.         0.48359121 0.28561676
  0.36778358 0.48359121 0.         0.         0.48359121 0.28561676
  0.        ]
 [0.         0.39916886 0.39916886 0.39916886 0.         0.23575556
  0.30357821 0.         0.         0.39916886 0.         0.23575556
  0.39916886]
 [0.6088451  0.         0.         0.         0.         0.35959372
  0.         0.         0.6088451  0.         0.         0.35959372
  0.        ]]


In [75]:
tfidf.idf_

array([1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.        , 1.28768207, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.        , 1.69314718])

In [82]:
from numpy import array

from numpy.linalg import norm

a=tfidf.idf_

l2 = norm(a,1)

print(l2)

20.219153878051237


In [23]:
class animal:
    def __init__(self, breed):
        self.breed =  breed
    
    def sound(self):
        return print('Bark')

In [24]:
dog = animal('dog')

In [25]:
dog.sound()

Bark


In [26]:
class Dog(animal):
    def __init__(self, breed, name):
        super(Dog, self).__init__(breed)
        self.name = name
    
    def bark(self):
        return print('gug')
        

In [27]:
dog_1 = Dog('bulldog', 'dogy')

In [28]:
dog_1.breed

'bulldog'

In [71]:
class cube:
    def __init__(self, l, w, h, t):
        self.length = l
        self.wide = w
        self.height = h
        self.type = t
    
    @classmethod
    def test(cls):
        return print('this is a cube')
    
    def whatisthis(self):
        return cube.test()
    
    def luas(self):
        return self.length * self.wide * self.height

In [72]:
abc = cube(2, 3, 4, 'Cube')

In [73]:
abc.whatisthis()

this is a cube


In [102]:
class abc(cube):
    def __init__(self, l, w, h, t):
        super(abc, self).__init__(l, w, h, t)
    
    def contoh_1(self):
        return cube.test()

    @staticmethod
    def contoh_2():
        return print('this is static')

    def contoh(self):
        return print('contoh')

In [103]:
con = abc(4,5,6,'contoh')

In [104]:
con.contoh_1()

this is a cube


In [133]:
a = [1, 3, 4]
b = [4, 6, 7]

c = list(map(lambda x,y: x + y, a, b))

In [136]:
print(c)

[5, 9, 11]


In [132]:
d = ["sat", "sun", "fri"]

e = map(list, d)
print(list(e))

[['s', 'a', 't'], ['s', 'u', 'n'], ['f', 'r', 'i']]


In [140]:
a = [print(i) for i in range(5)]

0
1
2
3
4


In [146]:
a = 821
print('even' if a%2 == 0 else 'odd')

odd
