In [None]:
import os
import zipfile
import json
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Unpack the dataset archive into the current working directory
with zipfile.ZipFile('dataset.zip', mode='r') as archive:
    archive.extractall()

In [None]:
# Accumulators for the cleaned tweet texts, one list per sentiment class
positive_tweets, negative_tweets = [], []

In [None]:
# Load every tweet in the positive folder, clean its text and store it as a
# space-joined string of lower-cased tokens with stop words removed.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError -- confirm
# that the extracted archive really provides 'positive/tweet' relative to the
# working directory.
positive_dir = 'positive/tweet'

# Build the stop-word lookup once. Calling stopwords.words() for every token
# re-reads the corpus each time and does a linear list search, which dominates
# the runtime of this cell.
# NOTE(review): no language is passed, which appears to cover every language in
# the NLTK stop-word corpus -- confirm that is intended rather than 'english'.
stop_words = set(stopwords.words())

# Start from an empty list so re-running this cell does not append duplicates.
positive_tweets = []

# os.listdir returns names in arbitrary order; sorting keeps the tweet order
# (and anything derived from it) the same on every run.
for filename in sorted(os.listdir(positive_dir)):
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(os.path.join(positive_dir, filename), 'r', encoding='utf-8') as f:
        tweet = json.load(f)
    text = tweet['text']
    # Clean the text by removing hyperlinks, HTML markup, newlines and backslashes
    text = re.sub(r'http\S+', '', text)
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = text.replace('\n', ' ')
    text = text.replace('\\', '')
    # Tokenize the lower-cased text and drop stop words
    tokens = [word for word in word_tokenize(text.lower()) if word not in stop_words]
    # Store the cleaned tweet as a single space-separated string
    positive_tweets.append(' '.join(tokens))

FileNotFoundError: ignored

In [None]:
# Load every tweet in the negative folder, clean its text and store it as a
# space-joined string of lower-cased tokens with stop words removed.
negative_dir = 'negative/tweet'

# Build the stop-word lookup once. Calling stopwords.words() for every token
# re-reads the corpus each time and does a linear list search, which dominates
# the runtime of this cell.
# NOTE(review): no language is passed, which appears to cover every language in
# the NLTK stop-word corpus -- confirm that is intended rather than 'english'.
stop_words = set(stopwords.words())

# Start from an empty list so re-running this cell does not append duplicates.
negative_tweets = []

# os.listdir returns names in arbitrary order; sorting keeps the tweet order
# (and anything derived from it) the same on every run.
for filename in sorted(os.listdir(negative_dir)):
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(os.path.join(negative_dir, filename), 'r', encoding='utf-8') as f:
        tweet = json.load(f)
    text = tweet['text']
    # Clean the text by removing hyperlinks, HTML markup, newlines and backslashes
    text = re.sub(r'http\S+', '', text)
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = text.replace('\n', ' ')
    text = text.replace('\\', '')
    # Tokenize the lower-cased text and drop stop words
    tokens = [word for word in word_tokenize(text.lower()) if word not in stop_words]
    # Store the cleaned tweet as a single space-separated string
    negative_tweets.append(' '.join(tokens))

In [None]:
# Stack both classes into one corpus: positive tweets first, negative tweets after
all_tweets = positive_tweets + negative_tweets

# Matching label vector: 1 for every positive tweet, 0 for every negative tweet
labels = [1 for _ in positive_tweets] + [0 for _ in negative_tweets]

# Learn the TF-IDF vocabulary/weights on the whole corpus and vectorise it in one call.
# NOTE(review): the vectorizer is fitted on every tweet here; if this matrix is
# later split for evaluation, the IDF weights will have seen the held-out tweets.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_tweets)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    labels,
    test_size=0.2,
    random_state=42,
)

# X_train / X_test: TF-IDF vectors for the training and testing samples
# y_train / y_test: the matching labels (0 = negative, 1 = positive)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
nvb = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = nvb.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text report of the metrics for the current y_pred against the true test labels
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn import svm

# Fit a linear-kernel support vector classifier on the training split
sv_m = svm.SVC(C=1, kernel='linear', gamma='scale').fit(X_train, y_train)

# Predict labels for the test split (this rebinds y_pred)
y_pred = sv_m.predict(X_test)


In [None]:

from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label in y_pred matches y_test
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fit a logistic regression model (seeded) on the training split
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Mean accuracy on the held-out test split
lr_accuracy = lr.score(X_test, y_test)
print("Logistic Regression Accuracy:", lr_accuracy)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Fit a 5-nearest-neighbours classifier on the training split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Mean accuracy on the held-out test split
knn_accuracy = knn.score(X_test, y_test)
print("kNN Accuracy:", knn_accuracy)


In [None]:
#DEPLOY

# Train each candidate model on the training split, score it on the test split,
# pick the most accurate one and refit it on the complete TF-IDF matrix.
# Relies on MultinomialNB, svm, LogisticRegression, KNeighborsClassifier and
# accuracy_score having been imported by earlier cells.

# Naive Bayes
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)
nb_preds = nb_clf.predict(X_test)
nb_acc = accuracy_score(y_test, nb_preds)
print("Naive Bayes accuracy:", nb_acc)

# SVM -- referenced through the `svm` module: the visible cells only do
# `from sklearn import svm`, so a bare `SVC` would raise NameError.
svm_clf = svm.SVC(kernel='linear')
svm_clf.fit(X_train, y_train)
svm_preds = svm_clf.predict(X_test)
svm_acc = accuracy_score(y_test, svm_preds)
print("SVM accuracy:", svm_acc)

# Logistic regression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_preds = lr_clf.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)
print("Logistic Regression accuracy:", lr_acc)

# k-nearest neighbours
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_preds = knn_clf.predict(X_test)
knn_acc = accuracy_score(y_test, knn_preds)
print("kNN accuracy:", knn_acc)

# Select the best performing classifier based on accuracy.
# max() returns the first maximal entry, so ties resolve in list order:
# Naive Bayes, then SVM, then Logistic Regression, then kNN.
candidates = [
    (nb_acc, "Naive Bayes", nb_clf),
    (svm_acc, "SVM", svm_clf),
    (lr_acc, "Logistic Regression", lr_clf),
    (knn_acc, "kNN", knn_clf),
]
best_acc, best_name, best_clf = max(candidates, key=lambda candidate: candidate[0])
print("Selected", best_name)

# Deploy the best performing classifier: refit it on all samples (train + test)
best_clf.fit(tfidf_matrix, labels)