In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pickle

# Load the dataset
data = pd.read_csv('spam.csv')  # Replace with the path to your dataset

In [12]:
# Keep only the two columns of interest and give them the names used downstream.
data = data[['Category', 'Message']].rename(
    columns={'Category': 'label', 'Message': 'text'}
)

In [14]:
# Name the two columns positionally, then encode the labels as integers.
data.columns = ['label', 'text']
# Series.map yields NaN for any value that is not a key of the dict, so running
# this cell a second time (labels already 0/1) would turn every label into NaN.
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [18]:
# Text preprocessing: module-level stop-word set and lemmatizer used by preprocess().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize one message into a space-joined string of lemmatized tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and tokenized with ``word_tokenize``; tokens found
    in ``stop_words`` are dropped and the remaining ones are lemmatized.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    kept = []
    for token in word_tokenize(letters_only.lower()):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


In [20]:
# Combined features: TF-IDF columns followed by the TextFeatures columns
class CombinedFeatures(BaseEstimator, TransformerMixin):
    """Transformer that concatenates TF-IDF output with TextFeatures output."""

    def __init__(self):
        # Both sub-transformers are created eagerly; only the vectorizer is fitted.
        self.text_features = TextFeatures()
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def fit(self, X, y=None):
        """Fit the TF-IDF vectorizer on ``X``; ``y`` is ignored. Returns ``self``."""
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        """Return the densified TF-IDF matrix stacked column-wise with the text features."""
        blocks = (
            self.vectorizer.transform(X).toarray(),
            self.text_features.transform(X),
        )
        return np.hstack(blocks)

In [22]:
# Split data: hold out 20% of the rows as the test set, with a fixed random_state
X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pickle

# Download the NLTK resources if they are not available yet
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv('spam.csv')  # Replace with the path to your dataset

# Rename the columns positionally: the first becomes the target 'label', the second the 'text'.
# NOTE(review): this assignment requires the CSV to have exactly two columns — confirm.
data.columns = ['label', 'text']
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Text preprocessing: module-level stop-word set and lemmatizer used by preprocess().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Return ``text`` as lower-cased, stop-word-free, lemmatized tokens joined by spaces.

    Characters other than ASCII letters are turned into spaces before the
    text is lower-cased and split with ``word_tokenize``.
    """
    cleaned = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    )
    return ' '.join(lemmas)

# Additional hand-crafted features
class TextFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer deriving three numeric features from raw text.

    Output columns:
        url_count          -- number of ``http://`` / ``https://`` occurrences
        special_char_ratio -- share of characters for which ``str.isalnum()``
                              is False (whitespace included); 0 for an empty string
        text_length        -- ``len()`` of the text
    """

    # Compiled once instead of on every row.
    _URL_PATTERN = re.compile(r'http[s]?://')

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer fits in a Pipeline."""
        return self

    @staticmethod
    def _special_char_ratio(text):
        """Fraction of non-alphanumeric characters in ``text`` (0 when empty)."""
        if not text:
            return 0
        return sum(1 for ch in text if not ch.isalnum()) / len(text)

    def transform(self, X):
        """Compute the feature frame for ``X``.

        Parameters
        ----------
        X : pandas.Series or any iterable of str
            A Series is used as-is (its index is kept); any other iterable
            (list, tuple, ndarray) is wrapped in a Series first, so callers
            are not forced to pass a Series.

        Returns
        -------
        pandas.DataFrame
            One row per input text with the three columns described on the class.
        """
        # Previously a plain list raised AttributeError on ``X.apply``.
        texts = X if isinstance(X, pd.Series) else pd.Series(list(X), dtype=object)
        return pd.DataFrame({
            'url_count': texts.apply(lambda t: len(self._URL_PATTERN.findall(t))),
            'special_char_ratio': texts.apply(self._special_char_ratio),
            'text_length': texts.apply(len),
        })

# Combined features: TF-IDF columns followed by the TextFeatures columns
class CombinedFeatures(BaseEstimator, TransformerMixin):
    """Transformer that concatenates TF-IDF output with TextFeatures output."""

    def __init__(self):
        # Both sub-transformers are created eagerly; only the vectorizer is fitted.
        self.text_features = TextFeatures()
        self.vectorizer = TfidfVectorizer(max_features=5000)

    def fit(self, X, y=None):
        """Fit the TF-IDF vectorizer on ``X``; ``y`` is ignored. Returns ``self``."""
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        """Return the densified TF-IDF matrix stacked column-wise with the text features."""
        tfidf_dense = self.vectorizer.transform(X).toarray()
        handcrafted = self.text_features.transform(X)
        stacked = np.hstack((tfidf_dense, handcrafted))
        return stacked

# Split data: hold out 20% of the rows as the test set, with a fixed random_state
X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline with grid search: combined features feeding an RBF-kernel SVM
pipeline = Pipeline([
    ('features', CombinedFeatures()),
    ('svm', SVC(kernel='rbf'))
])

# Candidate values for the SVM step's C and gamma (3 x 3 = 9 combinations)
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__gamma': [0.01, 0.1, 1]
}

# 5-fold cross-validated search scored by accuracy, fitted on the training split only
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Save the best pipeline found by the grid search
best_model = grid_search.best_estimator_
# NOTE(review): the pipeline contains classes defined in this notebook, so
# unpickling presumably needs the same class definitions available — confirm.
with open('svm_spam_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Report the winning hyper-parameters and evaluate on the held-out test split
print("Best Parameters:", grid_search.best_params_)
y_pred = best_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\YOGA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\YOGA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\YOGA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ........................svm__C=0.1, svm__gamma=0.01; total time=  16.1s
[CV] END ........................svm__C=0.1, svm__gamma=0.01; total time=  15.8s
[CV] END ........................svm__C=0.1, svm__gamma=0.01; total time=  16.0s
[CV] END ........................svm__C=0.1, svm__gamma=0.01; total time=  16.3s
[CV] END ........................svm__C=0.1, svm__gamma=0.01; total time=  16.5s
[CV] END .........................svm__C=0.1, svm__gamma=0.1; total time=  23.8s
[CV] END .........................svm__C=0.1, svm__gamma=0.1; total time=  26.3s
[CV] END .........................svm__C=0.1, svm__gamma=0.1; total time=  25.7s
[CV] END .........................svm__C=0.1, svm__gamma=0.1; total time=  25.8s
[CV] END .........................svm__C=0.1, svm__gamma=0.1; total time=  27.0s
[CV] END ...........................svm__C=0.1, svm__gamma=1; total time=  42.3s
[CV] END ...........................svm__C=0.1, s