In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np

In [15]:
# Load the raw tweets from the CSV file and preview the first five rows.
tweet_df = pd.read_csv(filepath_or_buffer='tweets_dataset.csv')
tweet_df.head(n=5)

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...


In [16]:
# Build the modelling frame from the raw tweets:
#   * the 'Handle' column is dropped;
#   * 'Party' becomes a binary label: 0 for 'Democrat', 1 for every other value.
tweet_df_clean = (
    tweet_df
    .drop(columns=['Handle'])
    .assign(Party=lambda frame: frame['Party'].ne('Democrat').astype('int64'))
)
tweet_df_clean

Unnamed: 0,Party,Tweet
0,0,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,0,RT @WinterHavenSun: Winter Haven resident / Al...
2,0,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,0,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,0,RT @Vegalteno: Hurricane season starts on June...
...,...,...
86455,1,Check out my op-ed on need for End Executive O...
86456,1,"Yesterday, Betty &amp; I had a great time lear..."
86457,1,We are forever grateful for the service and sa...
86458,1,Happy first day of school @CobbSchools! #CobbB...


In [26]:
# Normalise the tweet text before vectorising: lowercase it, then strip URLs,
# @mentions, #hashtags and finally every character that is not an ASCII letter
# or whitespace.
# regex=True is passed explicitly: pandas >= 2.0 defaults Series.str.replace to
# literal (non-regex) matching, under which none of these patterns would match
# and the text would silently be left uncleaned.
CLEANUP_PATTERNS = (
    r'http\S+',      # URLs
    r'@\w+',         # @mentions
    r'#\w+',         # #hashtags
    r'[^a-zA-Z\s]',  # anything remaining that is not a letter or whitespace
)
tweet_text = tweet_df_clean['Tweet'].str.lower()
for pattern in CLEANUP_PATTERNS:
    # Order matters: the catch-all character filter must run last, otherwise
    # the '@' / '#' / ':' markers the earlier patterns rely on are already gone.
    tweet_text = tweet_text.str.replace(pattern, '', regex=True)

# Drop English stop words; the surviving tokens are re-joined with single spaces.
stop_words = set(stopwords.words('english'))
tweet_df_clean['Tweet'] = tweet_text.apply(
    lambda text: ' '.join(word for word in word_tokenize(text) if word not in stop_words)
)

# Hold out 20% of the tweets for evaluation; the split is seeded so it is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    tweet_df_clean['Tweet'],
    tweet_df_clean['Party'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts over unigrams, bigrams and trigrams. The vocabulary is
# learned from the training tweets only and then applied to the test tweets.
vectorizer = CountVectorizer(ngram_range=(1, 3))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Seed for the stochastic estimators so that accuracies (and therefore the
# selected "best" model) are reproducible from run to run.
RANDOM_STATE = 42

# Candidate models, all trained on the same n-gram count features.
classifiers = [
    MultinomialNB(),
    # The 'sag' solver shuffles the data, so it needs a fixed seed.
    LogisticRegression(max_iter=5000, solver='sag', random_state=RANDOM_STATE),
    # Bootstrap sampling and feature sub-sampling are random; fix the seed.
    RandomForestClassifier(random_state=RANDOM_STATE),
    SVC()
]

best_accuracy = 0
best_classifier = None

# Fit each candidate, score it on the held-out tweets and remember the first
# model that achieves the highest accuracy (ties keep the earlier model).
# NOTE(review): the winner is chosen on the test split itself, so the reported
# "Best Accuracy" is an optimistic estimate; consider a validation split or
# cross-validation for model selection.
for classifier in classifiers:
    classifier.fit(X_train_vectorized, y_train)
    y_pred = classifier.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Model: {type(classifier).__name__}')
    print(f'Accuracy: {accuracy:.2f}')
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_classifier = classifier

print(f'Best Model: {type(best_classifier).__name__}')
print(f'Best Accuracy: {best_accuracy:.2f}')

Model: MultinomialNB
Accuracy: 0.82
Model: LogisticRegression
Accuracy: 0.80
Model: RandomForestClassifier
Accuracy: 0.76
Model: SVC
Accuracy: 0.77
Best Model: MultinomialNB
Best Accuracy: 0.82
