In [1]:
pip install nltk scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
import random
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources this notebook reads: the tweet corpus, the
# tokenizer models used by word_tokenize, and the English stop-word list.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer models from the 'punkt_tab' resource; without it
# word_tokenize raises LookupError on a fresh environment.
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Read the raw tweet texts for each sentiment class from the twitter_samples corpus
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# One combined corpus: every positive tweet first, then every negative tweet
all_tweets = [*positive_tweets, *negative_tweets]

# Parallel label list in the same order as all_tweets: 1 = positive, 0 = negative
labels = [1 for _ in positive_tweets] + [0 for _ in negative_tweets]

# English stop words, kept in a set for the membership test in preprocess_tweet
stop_words = set(stopwords.words('english'))

def preprocess_tweet(tweet):
    """Return a cleaned, space-joined version of a single tweet.

    The tweet is split with ``word_tokenize``. Tokens that are not purely
    alphabetic are discarded; the remaining tokens are lower-cased and
    dropped if they appear in the module-level ``stop_words`` set.

    Args:
        tweet: The raw tweet text.

    Returns:
        The surviving lower-cased tokens joined by single spaces (an empty
        string when no token survives).
    """
    kept = []
    for token in word_tokenize(tweet):
        # The alphabetic check is made on the token as tokenized, before
        # lower-casing.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return ' '.join(kept)

# Run every tweet through the preprocessing step
processed_tweets = list(map(preprocess_tweet, all_tweets))

# Hold out 20% of the tweets for evaluation; the fixed random_state keeps the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    processed_tweets,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training tweets
# only, then encode the test tweets with that same fitted vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a linear-kernel Support Vector Machine on the training vectors
classifier = SVC(kernel='linear')
classifier.fit(X_train_tfidf, y_train)

# Predict labels for the held-out tweets
y_pred = classifier.predict(X_test_tfidf)

# Report the share of held-out tweets classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\91637\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91637\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91637\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Accuracy: 74.85%
