# Text Classification System (Naive Bayes classifier)

In [None]:
import numpy as np
import re
import emoji
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
# Load the sampled tweets; later cells read its 'text' and 'VoteWho' columns.
SAMPLED_TWEETS_CSV = 'cleanprocessed_sampled_tweets.csv'
df = pd.read_csv(SAMPLED_TWEETS_CSV)

In [None]:
def strip_emoji(text):
    """Return ``text`` with every emoji replaced by a single space."""
    without_emoji = emoji.replace_emoji(text, replace=' ')
    return without_emoji

In [None]:
# Lowercase the text, then blank out every character that is not an ASCII
# letter, digit, space or tab, together with the standalone word 'url'.
# The pattern has to be a raw string: in a plain literal "\b" is a backspace
# character (\x08), so the word-boundary match on 'url' would never fire.
df["text"] = df["text"].str.lower().str.replace(
    r"([^0-9A-Za-z \t])|\burl\b", " ", case=False, regex=True
)

# Drop duplicate entries based on the cleaned 'text' column
df = df.drop_duplicates("text")

# Strip emojis from every remaining row in a single pass rather than a
# row-by-row iterrows()/at loop. assign() returns a new frame, so nothing is
# written into the de-duplicated frame in place.
df = df.assign(text=df["text"].map(strip_emoji))

# Print the number of rows left after de-duplication
print(df.shape[0])

In [None]:
import scipy.stats as stats
# Bar chart of how many rows fall into each VoteWho class
class_names = ['Trump', 'Harris', 'Others']
ax = sns.countplot(x=df['VoteWho'])
ax.set_xlabel('tweet support')
# NOTE(review): the labels are applied positionally, so this assumes the bars
# come out in the order of codes 1, 2, 3 and that all three are present -- confirm.
ax.set_xticklabels(class_names)

In [None]:
# Swap the numeric VoteWho codes for readable class labels
vote_labels = {1: 'Trump', 2: 'Harris', 3: 'Others'}
df['VoteWho'] = df['VoteWho'].replace(vote_labels)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
RANDOM_SEED = 42
df_train, df_test = train_test_split(
    df[['text', 'VoteWho']],
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Write both splits as header-less, index-less CSVs holding (text, label) rows
for split_frame, csv_path in (
    (df_train, 'train_support.csv'),
    (df_test, 'test_support.csv'),
):
    split_frame.to_csv(csv_path, index=False, header=False, encoding='utf-8-sig')

In [None]:
from textblob.classifiers import NaiveBayesClassifier

# Train the classifier on the training CSV (header-less text,label rows)
with open('train_support.csv', 'r', encoding='utf-8-sig') as f:
    cl = NaiveBayesClassifier(f, format='csv')

# Report the classifier's accuracy on the held-out test CSV
with open('test_support.csv', 'r', encoding='utf-8-sig') as f:
    print(cl.accuracy(f, format='csv'))

# Classify a sample sentence as a quick smoke test
text = "I feel like I am missing something here."

# Prints one of the labels present in train_support.csv
print(cl.classify(text))

# Show the 20 most informative features. The method prints the table itself
# and returns None, so it is called bare; wrapping it in print() would add a
# stray "None" line to the output.
cl.show_informative_features(20)

In [None]:
# Load the rows that still need a label
processed_cleaned_data = pd.read_csv('processed_cleaned_data.csv')


def classify_text(text):
    """Return the label the trained classifier ``cl`` assigns to ``text``."""
    predicted_label = cl.classify(text)
    return predicted_label


# Label every entry of the 'text' column and keep the predictions in 'support'
text_column = processed_cleaned_data['text']
processed_cleaned_data['support'] = text_column.apply(classify_text)

# Write the labelled frame to a new CSV file
output_path = 'classified_data_bayes.csv'
processed_cleaned_data.to_csv(output_path, index=False, encoding='utf-8-sig')

print("Classification completed and saved as 'classified_data_bayes.csv'")