In [5]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Contoh penggunaan
sample_text = "selamat pagi"
print(remove_punctuation(sample_text))


selamat pagi


In [6]:
import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Contoh penggunaan
sample_text = "There are 2 apples and 10 oranges."
print(remove_numbers(sample_text))


There are  apples and  oranges.


In [7]:
def to_lowercase(text):
    return text.lower()

# Contoh penggunaan
sample_text = "selamat pagi."
print(to_lowercase(sample_text))


selamat pagi.


In [None]:
from 'nltk'.tokenize import word_tokenize
import nltk

nltk.download('punkt_tab')

def tokenize(text):
    return word_tokenize(text)

# Contoh penggunaan
sample_text = "This is an example sentence."
print(tokenize(sample_text))


ModuleNotFoundError: No module named 'nltk'

In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

# Contoh penggunaan
sample_text = "This is an example showing stopwords removal."
tokenized_text = tokenize(sample_text)
print(remove_stopwords(tokenized_text))


['This', 'example', 'showing', 'stopwords', 'removal', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.stem import PorterStemmer

def stem_words(words):
    ps = PorterStemmer()
    return [ps.stem(word) for word in words]

# Contoh penggunaan
sample_text = "This is an example showing stemming of words."
tokenized_text = tokenize(sample_text)
filtered_words = remove_stopwords(tokenized_text)
print(stem_words(filtered_words))


['thi', 'exampl', 'show', 'stem', 'word', '.']


In [None]:
sentence1 = "I love football"
sentence2 = "Messi is a great football player"
sentence3 = "Messi has won seven Ballon d’Or awards "

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
docs = [sentence1, sentence2, sentence3]
print(docs)

['I love football', 'Messi is a great football player', 'Messi has won seven Ballon d’Or awards ']


In [None]:
#Mendefinisikan dan menyesuaikan count vectorizer pada dokumen.

vec = CountVectorizer()
X = vec.fit_transform(docs)
#Mengonversi vektor pada DataFrame menggunakan pandas

df = pd.DataFrame(X.toarray(),
    columns=vec.get_feature_names_out())
df.head()

Unnamed: 0,awards,ballon,football,great,has,is,love,messi,or,player,seven,won
0,0,0,1,0,0,0,1,0,0,0,0,0
1,0,0,1,1,0,1,0,1,0,1,0,0
2,1,1,0,0,1,0,0,1,1,0,1,1


In [None]:
import numpy as np
from collections import Counter
from math import log

# Tiga dokumen dalam korpus
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy dog sleeps in the sun"
    ]

# Preprocessing: Lowercasing and tokenizing
tokenized_documents = [doc.lower().split() for doc in documents]

# Menghitung TF
def compute_tf(tokenized_doc):
    tf_dict = {}
    term_count = Counter(tokenized_doc)
    total_terms = len(tokenized_doc)
    for term, count in term_count.items():
        tf_dict[term] = count / total_terms
    return tf_dict

tf_list = [compute_tf(doc) for doc in tokenized_documents]

print("Term Frequency (TF):")
for idx, tf in enumerate(tf_list):
    print(f"Document {idx + 1} TF:")
    for term, score in tf.items():
        print(f"    {term}: {score:.4f}")


Term Frequency (TF):
Document 1 TF:
    the: 0.2222
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.1111
    dog: 0.1111
Document 2 TF:
    the: 0.2857
    lazy: 0.1429
    dog: 0.1429
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [None]:
# Menghitung IDF
def compute_idf(tokenized_docs):
    idf_dict = {}
    total_docs = len(tokenized_docs)
    all_terms = set(term for doc in tokenized_docs for term in doc)
    for term in all_terms:
        doc_containing_term = sum(1 for doc in tokenized_docs if term in doc)
        idf_dict[term] = log(total_docs / (1 + doc_containing_term)) + 1
    return idf_dict

idf_dict = compute_idf(tokenized_documents)

print("\nInverse Document Frequency (IDF):")
for term, score in idf_dict.items():
    print(f"    {term}: {score:.4f}")



Inverse Document Frequency (IDF):
    brown: 1.0000
    lazy: 0.5945
    fox: 1.0000
    quick: 1.0000
    in: 1.0000
    sleeps: 1.0000
    the: 0.5945
    dog: 0.5945
    sun: 1.0000
    jumps: 1.0000
    over: 1.0000


In [None]:
# Menghitung TF-IDF
def compute_tfidf(tf_list, idf_dict):
    tfidf_list = []
    for tf in tf_list:
        tfidf_dict = {}
        for term, tf_value in tf.items():
            tfidf_dict[term] = tf_value * idf_dict.get(term, 0)
        tfidf_list.append(tfidf_dict)
    return tfidf_list

tfidf_list = compute_tfidf(tf_list, idf_dict)

print("\nTF-IDF:")
for idx, tfidf in enumerate(tfidf_list):
    print(f"Document {idx + 1} TF-IDF:")
    for term, score in tfidf.items():
        print(f"    {term}: {score:.4f}")


TF-IDF:
Document 1 TF-IDF:
    the: 0.1321
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.0661
    dog: 0.0661
Document 2 TF-IDF:
    the: 0.1699
    lazy: 0.0849
    dog: 0.0849
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [None]:
pip install gensim



In [None]:
# Uninstall the potentially conflicting libraries
!pip uninstall -y numpy gensim pandas scikit-learn nltk

# Reinstall the libraries. Pip will automatically try to find compatible versions.
!pip install numpy gensim pandas scikit-learn nltk

# Now try importing gensim again
from gensim.models import Word2Vec
import numpy as np

corpus = [
    'Ini adalah dokumen pertama.',
    'Dokumen kedua ini adalah contoh.',
    'Dan ini adalah dokumen ketiga.'
]

sentences = [doc.split() for doc in corpus]

# It's good practice to check if the corpus is not empty and contains lists of tokens
if sentences and all(isinstance(s, list) and s for s in sentences):
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

    def document_vector(doc):
        # Ensure words exist in the model's vocabulary
        # Preprocess the input document for consistency with how the model was trained
        processed_doc = doc.lower() # Assuming the model was trained on lowercased text
        words_in_vocab = [word for word in processed_doc.split() if word in model.wv]
        if not words_in_vocab:
            # Handle cases where no words in the document are in the vocabulary
            # Returning a vector of NaNs might be preferable for clearer debugging
            # but a zero vector is common.
            return np.zeros(model.vector_size)
        return np.mean([model.wv[word] for word in words_in_vocab], axis=0)

    doc_vectors = [document_vector(doc) for doc in corpus]
    print(doc_vectors)
else:
    print("Corpus is empty or not correctly formatted after splitting.")

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Found existing installation: pandas 2.2.3
Uninstalling pandas-2.2.3:
  Successfully uninstalled pandas-2.2.3
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting numpy
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-manylinu

[array([-4.6599745e-03,  3.9806510e-03,  1.9114142e-03,  5.6526200e-03,
        3.1190130e-03, -7.5333443e-04,  3.4238021e-03,  3.3095828e-03,
       -5.7439799e-03,  1.2399314e-04,  1.5154898e-03, -4.6827844e-03,
        2.2198728e-03,  2.7434174e-03,  3.8831220e-03,  4.2578860e-05,
        2.1247193e-03,  4.1170805e-03, -6.9772042e-03, -4.5314594e-03,
        9.0925070e-04,  1.2928164e-03,  5.1809270e-03, -6.0568438e-03,
        6.1365720e-03, -7.6482916e-04,  1.3757148e-05,  1.5500397e-03,
       -3.6960281e-03, -3.5813847e-04,  3.5609016e-03, -4.6297759e-03,
        1.7671251e-03, -1.4140781e-03, -1.8171680e-03,  9.4895152e-04,
        3.1889200e-03,  2.7076614e-03,  3.7414092e-03, -1.3430156e-03,
       -2.4245111e-03,  8.4423088e-04, -4.7280975e-03,  2.1121051e-04,
       -5.4197269e-05,  2.6140013e-03,  7.9148129e-05,  2.3835800e-03,
       -5.3164572e-04,  1.5393719e-03,  2.6096610e-04, -1.8477604e-03,
       -2.8962991e-03, -2.0742700e-03,  6.8781583e-04, -1.0468734e-03,
     

In [None]:
!pip install numpy pandas scikit-learn




Klasifikasi teks dengan Machine Learning.
This dataset is a collection newsgroup documents. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# 1. Mengumpulkan data
newsgroups = fetch_20newsgroups(subset='all')

# 2. Preprocessing data
# Tidak perlu preprocessing khusus karena kita akan menggunakan TfidfVectorizer

# 3. Membagi data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# 4. Melatih model
# Membuat pipeline yang mencakup TfidfVectorizer dan MultinomialNB
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Melatih model menggunakan training set
model.fit(X_train, y_train)

# 5. Mengevaluasi model
# Prediksi pada testing set
y_pred = model.predict(X_test)

# Evaluasi kinerja model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names=newsgroups.target_names))

# Confusion Matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_pred))


Accuracy: 0.8425297113752123
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       198
           comp.graphics       0.86      0.79      0.82       245
 comp.os.ms-windows.misc       0.88      0.83      0.85       242
comp.sys.ibm.pc.hardware       0.66      0.86      0.75       238
   comp.sys.mac.hardware       0.95      0.84      0.89       250
          comp.windows.x       0.96      0.80      0.87       260
            misc.forsale       0.96      0.66      0.78       241
               rec.autos       0.89      0.93      0.91       244
         rec.motorcycles       0.91      0.95      0.93       219
      rec.sport.baseball       0.96      0.94      0.95       261
        rec.sport.hockey       0.90      0.98      0.94       245
               sci.crypt       0.78      0.98      0.87       251
         sci.electronics       0.92      0.80      0.86       249
                 sci.me