In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reproducibility knobs: a single seed drives sampling, the train/test split
# and the random forest; N_SAMPLES caps the working subset.
SEED = 42
N_SAMPLES = 100000

# Read the preprocessed dataset and drop rows with no cleaned text.
# Chained rather than inplace=True so the frame is built in one expression.
df = pd.read_csv('training_cleaned.csv', encoding='ISO-8859-1').dropna(subset=['cleaned_text'])

# Select a subset of the dataset for training and testing. The min() guard
# keeps sample() from raising ValueError when fewer than N_SAMPLES rows remain
# after the dropna above; with enough rows the sample is unchanged.
subset_df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)

# Split the dataset into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    subset_df['cleaned_text'],
    subset_df['sentiment'],
    test_size=0.2,
    random_state=SEED,
)

# Vectorize the text data using TfidfVectorizer. The vocabulary is fitted on
# the training split only; the test split is merely transformed.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train and evaluate SVM model
# NOTE(review): SVC fit time grows quickly with the number of samples; if this
# cell becomes too slow, consider a dedicated linear SVM implementation.
svm = SVC(kernel='linear')
svm.fit(X_train_vectorized, y_train)
svm_accuracy = accuracy_score(y_test, svm.predict(X_test_vectorized))

# Train and evaluate Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train_vectorized, y_train)
nb_accuracy = accuracy_score(y_test, nb.predict(X_test_vectorized))

# Train and evaluate Random Forest model. random_state is fixed so the forest
# (and therefore its reported accuracy) is the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf.fit(X_train_vectorized, y_train)
rf_accuracy = accuracy_score(y_test, rf.predict(X_test_vectorized))

# Print the accuracies obtained by each algorithm
print('SVM Accuracy:', svm_accuracy)
print('Naive Bayes Accuracy:', nb_accuracy)
print('Random Forest Accuracy:', rf_accuracy)


SVM Accuracy: 0.7561
Naive Bayes Accuracy: 0.7386
Random Forest Accuracy: 0.7523


# Combining all the predicted values into one file

In [3]:
# Gather every fitted model's test-set predictions, one column per model,
# and save them side by side in a single workbook.
fitted_models = {'SVM': svm, 'Naive Bayes': nb, 'Random Forest': rf}
predictions = pd.DataFrame(
    {label: model.predict(X_test_vectorized) for label, model in fitted_models.items()}
)
predictions.to_excel('predicted_values_3_algo.xlsx', index=False)

# Predicting the majority Sentiment

In [7]:
# Majority vote across the three classifiers: the row-wise mode of their
# test-set predictions. It is computed from the in-memory `predictions` frame
# because the workbook written by to_excel contains only the three model
# columns, so reading a 'Majority' column from it raises KeyError unless that
# column was added to the file by hand outside the notebook.
# With three voters a tie is only possible when all three disagree; mode()
# then lists the tied labels in sorted order and column 0 is the smallest.
vote_columns = ['SVM', 'Naive Bayes', 'Random Forest']
majority_sentiment = predictions[vote_columns].mode(axis=1)[0].tolist()
print("Accuracy: ",accuracy_score(y_test,majority_sentiment))

Accuracy:  0.75935


# Labelling our local dataset


In [10]:
local_df = pd.read_excel('cleaned_local_dataset.xlsx')

# Blank cells are read back as NaN and purely numeric cells as numbers; the
# vectorizer only accepts strings. Filling with '' (rather than dropping rows,
# as the training cell does) keeps one prediction per workbook row, so the
# output file stays aligned with the input file.
local_text = local_df['Cleaned Text'].fillna('').astype(str)

# Vectorize the text data using the same TfidfVectorizer object used in training
X_local_test_vectorized = vectorizer.transform(local_text)

# Predict the sentiment of every local row with each of the three models
local_predictions = pd.DataFrame({'SVM': svm.predict(X_local_test_vectorized),
                            'Naive Bayes': nb.predict(X_local_test_vectorized),
                            'Random Forest': rf.predict(X_local_test_vectorized)})
local_predictions.to_excel('local_predicted_values_3_algo.xlsx', index=False)