<a href="https://colab.research.google.com/github/sakrbn/saeedkarbasian/blob/NLP/04_13_Classify_text_data_using_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Fetch the complete 20 Newsgroups corpus (subset='all' combines the train and test portions).
# NOTE(review): posts are loaded as-is, i.e. with headers, footers and quoted replies
# still present; scikit-learn documents a `remove=` option for stripping them — confirm
# whether that is wanted before trusting predictions on free-form text.
newsgroups = fetch_20newsgroups(subset='all')

# Raw post strings and their integer category ids (used later as indices into `target_names`).
texts = newsgroups.data
labels = newsgroups.target

# Hold out 20% of the posts for testing; the fixed seed keeps the split reproducible.
# Results come back in this order:
#   X_train / X_test -> training / testing texts
#   y_train / y_test -> training / testing labels
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Represent each post as a TF-IDF (Term Frequency-Inverse Document Frequency) vector,
# capping the vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training texts only, and encode them.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Encode the held-out texts with the already-fitted vectorizer (no re-fitting).
X_test_tfidf = vectorizer.transform(X_test)

# Fit a linear-kernel Support Vector Machine classifier on the TF-IDF training matrix.
# `fit` returns the estimator itself, so construction and training are chained here.
svm = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train_tfidf, y_train)

# One free-text sentence to classify. It is wrapped in a list because the vectorizer
# is handed a collection of documents, just like X_train / X_test above.
test_doc = ["Natural language Processing is an interesting research area in AI"]

# Encode the sentence with the vectorizer that was fitted on the training texts,
# so it lands in the same feature space the model was trained on.
test_doc_tfidf = vectorizer.transform(test_doc)

# Ask the trained SVM for a category label for each submitted document.
test_pred = svm.predict(test_doc_tfidf)

# Only one document was submitted, so its label is the first entry of the prediction.
predicted_label = test_pred[0]

# Report the numeric label, then look up its human-readable newsgroup name.
print("Predicted Label:", predicted_label)
print("Category Name:", newsgroups.target_names[predicted_label])

Predicted Label: 13
Category Name: sci.med
