# Import all the necessary libraries
pandas - to load and handle data
TfidfVectorizer - to turn text into numbers


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,classification_report

In [15]:
# data load
df = pd.read_csv(r'C:\Users\Pradip Chaurel\OneDrive\Desktop\MLprojects\Twitter Sentiment Analysis\twitter_data.csv',encoding='latin-1',header=None)
df = df[[0,5]]  # keeping only two columns
df.columns = ['polarity','text']
print(df.head())

   polarity                                               text
0         0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1         0  is upset that he can't update his Facebook by ...
2         0  @Kenichan I dived many times for the ball. Man...
3         0    my whole body feels itchy and like its on fire 
4         0  @nationwideclass no, it's not behaving at all....


In [16]:
# keep only positive and negative sentiments
# removing neutral tweets where polarity is 2
# polarity 0 means negative and 4 becomes 1 for positive.

df = df[df.polarity != 2]
df['polarity'] = df['polarity'].map({0:0,4:1})
print(df['polarity'].value_counts())


0    800000
1    800000
Name: polarity, dtype: int64


In [39]:
# clean the tweets
# firstly, convert all text into lowercase for consistency

def clean_text(text):
    return text.lower()
    
df['clean_text'] = df['text'].apply(clean_text)
print(df[['text','clean_text']].head())

                                                text  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                                          clean_text  
0  @switchfoot http://twitpic.com/2y1zl - awww, t...  
1  is upset that he can't update his facebook by ...  
2  @kenichan i dived many times for the ball. man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


# split the dataset into training and testing data

In [40]:
x_train,x_test,y_train,y_test = train_test_split(
    df['clean_text'],
    df['polarity'],
    test_size=0.2,
    random_state=42
)

print("Train size:", len(x_train))
print("Test size:", len(x_test))

Train size: 1280000
Test size: 320000


In [41]:
# perform vectorization
'''This code creates a TF IDF vectorizer that converts text into
numerical features using unigrams and bigrams limited to 5000 features'''

vectorizer = TfidfVectorizer(max_features=5000,ngram_range=(1,2))

x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.fit_transform(x_test)

print("TF-IDF shape (train):", x_train_tfidf.shape)
print("TF-IDF shape (test):", x_test_tfidf.shape)

TF-IDF shape (train): (1280000, 5000)
TF-IDF shape (test): (320000, 5000)


# Train Bernoulli Naive Bayes

In [42]:
bnb = BernoulliNB()
bnb.fit(x_train_tfidf,y_train)
bnb_pred = bnb.predict(x_test_tfidf)

print("Bernoulli Naive Bayes Accuracy:", accuracy_score(y_test,bnb_pred))

print("\nBernoulliNB Classification Report:\n", classification_report(y_test,bnb_pred))

Bernoulli Naive Bayes Accuracy: 0.616253125

BernoulliNB Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.54      0.58    159494
           1       0.60      0.69      0.64    160506

    accuracy                           0.62    320000
   macro avg       0.62      0.62      0.61    320000
weighted avg       0.62      0.62      0.61    320000



# Train SVM model

In [43]:
svm = LinearSVC(max_iter=1000)
svm.fit(x_train_tfidf,y_train)

svm_pred = svm.predict(x_test_tfidf)

print("SVM Accuracy:", accuracy_score(y_test,svm_pred))

print("\nClassification Report:\n", classification_report(y_test,svm_pred))

SVM Accuracy: 0.576825

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.61      0.59    159494
           1       0.58      0.55      0.56    160506

    accuracy                           0.58    320000
   macro avg       0.58      0.58      0.58    320000
weighted avg       0.58      0.58      0.58    320000



# Train logistic Regression

In [44]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train_tfidf,y_train)

logreg_pred = logreg.predict(x_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test,logreg_pred))

print("\nLogistic Regression Classification Report:\n", classification_report(y_test,logreg_pred))

Logistic Regression Accuracy: 0.5811375

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.60      0.59    159494
           1       0.59      0.56      0.57    160506

    accuracy                           0.58    320000
   macro avg       0.58      0.58      0.58    320000
weighted avg       0.58      0.58      0.58    320000



In [53]:
# try for sample tweets
sample_tweets = ["I love this!","I hate that!","It was okay, not great."]
sample_vec = vectorizer.transform(sample_tweets)

print("\nSample Predictions:")
print("BernoulliNB:", bnb.predict(sample_vec))
print("SVM:", svm.predict(sample_vec))
print("Logistic Regression:", logreg.predict(sample_vec))


Sample Predictions:
BernoulliNB: [1 0 1]
SVM: [1 0 1]
Logistic Regression: [1 0 1]
