In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the 20 Newsgroups dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Convert the text data into numerical features
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data.data)
y = data.target

# Split the data into training and testing sets
train_size = int(0.8 * len(data.data))
X_train = X[:train_size]
y_train = y[:train_size]
X_test = X[train_size:]
y_test = y[train_size:]

# Train a Decision Tree classifier on the training data
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Test the classifier on the testing data and compute accuracy
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)


Accuracy: 0.4960212201591512


In this example, we're using the DecisionTreeClassifier class from scikit-learn's tree module to train a decision tree on the 20 Newsgroups dataset. We're using CountVectorizer from the feature_extraction.text module to convert the text data into numerical features. We're also using accuracy_score from the metrics module to compute the accuracy of the classifier on the testing data.

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Load the 20 Newsgroups dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Convert the text data into numerical features
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data.data)
y = data.target

# Split the data into training and testing sets
train_size = int(0.8 * len(data.data))
X_train = X[:train_size]
y_train = y[:train_size]
X_test = X[train_size:]
y_test = y[train_size:]

# Train a Support Vector Machine classifier on the training data
clf = LinearSVC()
clf.fit(X_train, y_train)

# Test the classifier on the testing data and compute accuracy
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)


Accuracy: 0.7618037135278515


In this example, we're using the LinearSVC class from scikit-learn's svm module to train a support vector machine on the 20 Newsgroups dataset. We're using TfidfVectorizer from the feature_extraction.text module to convert the text data into numerical features using term frequency-inverse document frequency (TF-IDF) weighting. We're also using accuracy_score from the metrics module to compute the accuracy of the classifier on the testing data.