In [1]:
def to_lowercase(text):
    return text.lower()

# Contoh penggunaan
sample_text = "This is an EXAMPLE Sentence."
print(to_lowercase(sample_text))


this is an example sentence.


In [2]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Contoh penggunaan
sample_text = "Hello, World!"
print(remove_punctuation(sample_text))


Hello World


In [3]:
import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Contoh penggunaan
sample_text = "There are 2 apples and 10 oranges."
print(remove_numbers(sample_text))


There are  apples and  oranges.


In [None]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt_tab')

def tokenize(text):
    return word_tokenize(text)

# Contoh penggunaan
sample_text = "This is an example sentence."
print(tokenize(sample_text))


['This', 'is', 'an', 'example', 'sentence', '.']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [71]:

from nltk.corpus import stopwords

nltk.download('stopwords')



def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

# Contoh penggunaan
sample_text = "This is an example showing stopwords removal."
tokenized_text = (sample_text)

print(remove_stopwords(tokenized_text))


['T', 'h', ' ', ' ', 'n', ' ', 'e', 'x', 'p', 'l', 'e', ' ', 'h', 'w', 'n', 'g', ' ', 'p', 'w', 'r', ' ', 'r', 'e', 'v', 'l', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.stem import PorterStemmer

def stem_words(words):
    ps = PorterStemmer()
    return [ps.stem(word) for word in words]

# Contoh penggunaan
sample_text = "This is an example showing stemming of words."
tokenized_text = tokenize(sample_text)
filtered_words = remove_stopwords(tokenized_text)
print(stem_words(filtered_words))


['thi', 'exampl', 'show', 'stem', 'word', '.']


In [6]:
sentence1 = "I love football"
sentence2 = "Messi is a great football player"
sentence3 = "Messi has won seven Ballon d’Or awards "

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
docs = [sentence1, sentence2, sentence3]
print(docs)

['I love football', 'Messi is a great football player', 'Messi has won seven Ballon d’Or awards ']


In [8]:
#Mendefinisikan dan menyesuaikan count vectorizer pada dokumen.

vec = CountVectorizer()
X = vec.fit_transform(docs)
#Mengonversi vektor pada DataFrame menggunakan pandas

df = pd.DataFrame(X.toarray(),
    columns=vec.get_feature_names_out())
df.head()

Unnamed: 0,awards,ballon,football,great,has,is,love,messi,or,player,seven,won
0,0,0,1,0,0,0,1,0,0,0,0,0
1,0,0,1,1,0,1,0,1,0,1,0,0
2,1,1,0,0,1,0,0,1,1,0,1,1


In [9]:
import numpy as np
from collections import Counter
from math import log

# Tiga dokumen dalam korpus
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy dog sleeps in the sun"
    ]

# Preprocessing: Lowercasing and tokenizing
tokenized_documents = [doc.lower().split() for doc in documents]

# Menghitung TF
def compute_tf(tokenized_doc):
    tf_dict = {}
    term_count = Counter(tokenized_doc)
    total_terms = len(tokenized_doc)
    for term, count in term_count.items():
        tf_dict[term] = count / total_terms
    return tf_dict

tf_list = [compute_tf(doc) for doc in tokenized_documents]

print("Term Frequency (TF):")
for idx, tf in enumerate(tf_list):
    print(f"Document {idx + 1} TF:")
    for term, score in tf.items():
        print(f"    {term}: {score:.4f}")


Term Frequency (TF):
Document 1 TF:
    the: 0.2222
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.1111
    dog: 0.1111
Document 2 TF:
    the: 0.2857
    lazy: 0.1429
    dog: 0.1429
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [10]:
# Menghitung IDF
def compute_idf(tokenized_docs):
    idf_dict = {}
    total_docs = len(tokenized_docs)
    all_terms = set(term for doc in tokenized_docs for term in doc)
    for term in all_terms:
        doc_containing_term = sum(1 for doc in tokenized_docs if term in doc)
        idf_dict[term] = log(total_docs / (1 + doc_containing_term)) + 1
    return idf_dict

idf_dict = compute_idf(tokenized_documents)

print("\nInverse Document Frequency (IDF):")
for term, score in idf_dict.items():
    print(f"    {term}: {score:.4f}")



Inverse Document Frequency (IDF):
    the: 0.5945
    brown: 1.0000
    in: 1.0000
    sun: 1.0000
    jumps: 1.0000
    over: 1.0000
    dog: 0.5945
    quick: 1.0000
    fox: 1.0000
    lazy: 0.5945
    sleeps: 1.0000


In [11]:
# Menghitung TF-IDF
def compute_tfidf(tf_list, idf_dict):
    tfidf_list = []
    for tf in tf_list:
        tfidf_dict = {}
        for term, tf_value in tf.items():
            tfidf_dict[term] = tf_value * idf_dict.get(term, 0)
        tfidf_list.append(tfidf_dict)
    return tfidf_list

tfidf_list = compute_tfidf(tf_list, idf_dict)

print("\nTF-IDF:")
for idx, tfidf in enumerate(tfidf_list):
    print(f"Document {idx + 1} TF-IDF:")
    for term, score in tfidf.items():
        print(f"    {term}: {score:.4f}")


TF-IDF:
Document 1 TF-IDF:
    the: 0.1321
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.0661
    dog: 0.0661
Document 2 TF-IDF:
    the: 0.1699
    lazy: 0.0849
    dog: 0.0849
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [75]:
pip install --force-reinstall --no-cache-dir gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Downloading gensim-4.3.3-cp3

In [4]:
from gensim.models import Word2Vec
import numpy as np

corpus = [
    'Iam the bones of my sword.',
    'Steel is my body and fire is my blood.',
    'I created over thousand blade.',
    'Unknown to live and unknown to death',
    'So as i pray unlimited blade works'
]

sentences = [doc.split() for doc in corpus]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def document_vector(doc):
    return np.mean([model.wv[word] for word in doc.split() if word in model.wv], axis=0)

doc_vectors = [document_vector(doc) for doc in corpus]
print(doc_vectors)


[array([ 1.5627211e-03, -1.1602900e-03, -5.1071402e-06,  6.7668105e-03,
       -2.5523729e-03, -4.8593534e-04,  4.6453779e-03,  1.1680244e-04,
       -3.4880992e-03, -9.8174333e-04,  2.3950818e-03, -1.7119929e-03,
        2.7487848e-03,  3.3323476e-03, -2.1294125e-03,  2.7878706e-03,
        4.0636286e-03, -1.1773941e-03, -1.9629945e-03, -6.6884183e-03,
        3.1574790e-03,  3.8410861e-03,  4.5802752e-03, -1.0814277e-03,
        2.5968598e-03, -5.8973831e-04, -3.1036965e-03,  1.3388920e-03,
       -2.1771721e-03, -1.2336334e-03,  2.8191637e-03,  2.3639072e-03,
        4.5969966e-03, -1.1092502e-04, -4.3496639e-03,  1.7287437e-03,
       -7.2378200e-05, -8.7980129e-04, -2.4487821e-03, -3.1367058e-03,
        5.1215367e-05,  2.6666916e-03, -1.7266670e-04, -3.6295217e-03,
        2.2928494e-03,  1.4156488e-03, -5.1053343e-03,  1.7396617e-04,
        1.7904291e-03,  3.3844991e-03, -4.0396047e-03,  4.3493346e-04,
       -1.9491910e-03, -2.6039252e-04,  1.0006939e-03, -3.2436412e-03,
     

In [None]:
!pip install numpy pandas scikit-learn




Klasifikasi teks dengan Machine Learning.
This dataset is a collection newsgroup documents. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# 1. Mengumpulkan data
newsgroups = fetch_20newsgroups(subset='all')

# 2. Preprocessing data
# Tidak perlu preprocessing khusus karena kita akan menggunakan TfidfVectorizer

# 3. Membagi data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# 4. Melatih model
# Membuat pipeline yang mencakup TfidfVectorizer dan MultinomialNB
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Melatih model menggunakan training set
model.fit(X_train, y_train)

# 5. Mengevaluasi model
# Prediksi pada testing set
y_pred = model.predict(X_test)

# Evaluasi kinerja model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names=newsgroups.target_names))

# Confusion Matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_pred))


Accuracy: 0.8425297113752123
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       198
           comp.graphics       0.86      0.79      0.82       245
 comp.os.ms-windows.misc       0.88      0.83      0.85       242
comp.sys.ibm.pc.hardware       0.66      0.86      0.75       238
   comp.sys.mac.hardware       0.95      0.84      0.89       250
          comp.windows.x       0.96      0.80      0.87       260
            misc.forsale       0.96      0.66      0.78       241
               rec.autos       0.89      0.93      0.91       244
         rec.motorcycles       0.91      0.95      0.93       219
      rec.sport.baseball       0.96      0.94      0.95       261
        rec.sport.hockey       0.90      0.98      0.94       245
               sci.crypt       0.78      0.98      0.87       251
         sci.electronics       0.92      0.80      0.86       249
                 sci.me