# Preprocessing email

In [17]:
import pandas as pd

# Load data from berita_cnn.csv
df = pd.read_csv('spam.csv', encoding='latin1')

In [18]:
# Display basic information about the dataset
print("Dataset shape:", df.shape)
df

Dataset shape: (5572, 5)


Unnamed: 0,id,Text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,1,"Go until jurong point, crazy.. Available only ...",,,
1,2,Ok lar... Joking wif u oni...,,,
2,3,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,4,U dun say so early hor... U c already then say...,,,
4,5,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,5568,This is the 2nd time we have tried 2 contact u...,,,
5568,5569,Will Ì_ b going to esplanade fr home?,,,
5569,5570,"Pity, * was in mood for that. So...any other s...",,,
5570,5571,The guy did some bitching but I acted like i'd...,,,


In [19]:
print("\nDataset info:")
df.info()


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5572 non-null   int64 
 1   Text        5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: int64(1), object(4)
memory usage: 217.8+ KB


#   PREPROCESSING

In [20]:
# Tampilkan data "isi"
df['Text']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Text, Length: 5572, dtype: object

## Hapus Missing Value dan Data Duplicat

In [21]:
# Hapus baris dengan Missing Value di 'berita'
df.dropna(subset=['Text'], inplace=True)

# Hapus data duplikat
df.drop_duplicates(inplace=True)

## Test Cleaning

In [22]:
import re

# Fungsi untuk membersihkan teks
def clean_text(text):
    # Pastikan input adalah string
    if not isinstance(text, str):
        return ""
        
    text = text.lower() # 1. Ubah ke huruf kecil
    
    # 2. Ganti karakter non-breaking space (U+00A0) dengan spasi biasa
    text = text.replace(u'\xa0', u' ')
    
    # 3. Hapus awalan kota dan sumber berita 
    # Pola: NAMA_KOTA,BANGSAONLINE.COM-
    text = re.sub(r'^\w+\s*,\s*bangsaonline\.com[–-]?\s*', '', text)
    
    # 4. Hapus semua karakter yang BUKAN huruf, angka, atau spasi
    text = re.sub(r'[^\w\s]', '', text)
    
    # 5. Hapus semua angka
    text = re.sub(r'\d+', '', text)
    
    # 6. Ganti spasi ganda/lebih menjadi satu spasi & hapus spasi di awal/akhir
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Terapkan pembersihan ke kolom 'isi'
df['cleaned_isi'] = df['Text'].apply(clean_text)

# Tampilkan DataFrame
display(df[['Text', 'cleaned_isi']].head())

Unnamed: 0,Text,cleaned_isi
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


## Tokenisasi

In [23]:
import sys

# Perintah untuk menginstal library menggunakan path Python yang sedang aktif
!{sys.executable} -m pip install nltk




[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') 

# Fungsi untuk melakukan tokenisasi
def tokenize_text(text):
    return word_tokenize(text)

# Terapkan tokenisasi ke kolom 'cleaned_isi'
df['tokenized_isi'] = df['cleaned_isi'].apply(tokenize_text)

# Tampilkan DataFrame dengan kolom hasil tokenisasi
display(df[['cleaned_isi', 'tokenized_isi']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rizky\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Rizky\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,cleaned_isi,tokenized_isi
0,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,free entry in a wkly comp to win fa cup final ...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


## Stopword Removal

In [25]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Dapatkan Stop Word bahasa Indonesia
list_stopwords = set(stopwords.words('english'))

# Fungsi untuk menghapus stop words
def remove_stopwords(tokens):
    return [word for word in tokens if word not in list_stopwords]

# Terapkan penghapusan Stop Word ke kolom 'tokenized_isi'
df['stopwords_removed_isi'] = df['tokenized_isi'].apply(remove_stopwords)

# Tampilkan DataFrame
display(df[['tokenized_isi', 'stopwords_removed_isi']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rizky\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tokenized_isi,stopwords_removed_isi
0,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."


In [26]:
from collections import Counter

# Gabungkan semua token setelah stopword removal menjadi satu daftar
all_words_after_stopwords = [word for tokens in df['stopwords_removed_isi'] for word in tokens]

# Hitung frekuensi setiap kata
word_frequencies = Counter(all_words_after_stopwords)

# Menampilkan kata-kata yang paling umum dan frekuensinya
print("Top Most Frequent Words (Without Stemming):")
for word, frequency in word_frequencies.most_common(20): # Menampilkan 20 kata teratas
    print(f"{word}: {frequency}")

Top Most Frequent Words (Without Stemming):
u: 1143
call: 578
im: 464
get: 390
ur: 384
go: 282
dont: 279
free: 278
ok: 277
ltgt: 276
å: 274
know: 257
got: 251
like: 242
ill: 237
good: 234
come: 226
day: 211
time: 208
love: 195


In [28]:
# Buat DataFrame baru dengan isi berita asli, hasil preprocessing, dan kategori
processed_df = df[['Text', 'stopwords_removed_isi']].copy()

# Ganti nama kolom 'stopwords_removed_isi' menjadi 'hasil_preprocessing'
processed_df.rename(columns={'stopwords_removed_isi': 'hasil_preprocessing'}, inplace=True)

# Konversi frekuensi kata ke DataFrame
frequency_df = pd.DataFrame.from_dict(word_frequencies, orient='index', columns=['frequency'])
frequency_df.index.name = 'word'
frequency_df.sort_values(by='frequency', ascending=False, inplace=True)

# Simpan ke dua file CSV terpisah
processed_df.to_csv('hasil_preprocessing_emailUTS.csv', index=False, encoding='utf-8')
frequency_df.to_csv('frekuensi_kata_emailUTS.csv', encoding='utf-8')

print("Hasil preprocessing disimpan di 'hasil_preprocessing_emailUTS.csv'")
print("Frekuensi kata disimpan di 'frekuensi_kata_emailUTS.csv'")

Hasil preprocessing disimpan di 'hasil_preprocessing_emailUTS.csv'
Frekuensi kata disimpan di 'frekuensi_kata_emailUTS.csv'


In [29]:
hasil_preprocessing = "hasil_preprocessing_emailUTS.csv"  
df = pd.read_csv(hasil_preprocessing)

# Tampilkan data
df

Unnamed: 0,Text,hasil_preprocessing
0,"Go until jurong point, crazy.. Available only ...","['go', 'jurong', 'point', 'crazy', 'available'..."
1,Ok lar... Joking wif u oni...,"['ok', 'lar', 'joking', 'wif', 'u', 'oni']"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"['free', 'entry', 'wkly', 'comp', 'win', 'fa',..."
3,U dun say so early hor... U c already then say...,"['u', 'dun', 'say', 'early', 'hor', 'u', 'c', ..."
4,"Nah I don't think he goes to usf, he lives aro...","['nah', 'dont', 'think', 'goes', 'usf', 'lives..."
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,"['nd', 'time', 'tried', 'contact', 'u', 'u', '..."
5568,Will Ì_ b going to esplanade fr home?,"['ì_', 'b', 'going', 'esplanade', 'fr', 'home']"
5569,"Pity, * was in mood for that. So...any other s...","['pity', 'mood', 'soany', 'suggestions']"
5570,The guy did some bitching but I acted like i'd...,"['guy', 'bitching', 'acted', 'like', 'id', 'in..."


In [30]:
frekuensi_kata = "frekuensi_kata_emailUTS.csv"  
df = pd.read_csv(frekuensi_kata)

# Tampilkan data
df

Unnamed: 0,word,frequency
0,u,1143
1,call,578
2,im,464
3,get,390
4,ur,384
...,...,...
8479,practising,1
8480,rupaul,1
8481,baskets,1
8482,dane,1
