In [1]:
import pandas as pd

In [7]:
df = pd.read_csv("twitter.csv", usecols=range(0,4))
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [28]:
df["Sentiment"].value_counts()

Sentiment
1    790178
0    788436
Name: count, dtype: int64

Sentiment source is useless column

In [9]:
df = df.drop(["SentimentSource"], axis=1)
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


### Preprocessing

In [10]:
df_pre = df.copy()

#### Lowercase

In [12]:
df_pre["SentimentText"] = df_pre["SentimentText"].apply(lambda x: x.lower())
df_pre.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl frie...
1,2,0,i missed the new moon trail...
2,3,1,omg its already 7:30 :o
3,4,0,.. omgaga. im sooo im gunna cry. i'...
4,5,0,i think mi bf is cheating on me!!! ...


#### Remove punctuation

In [13]:
import string

df_pre["SentimentText"] = df_pre["SentimentText"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))
df_pre.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend
1,2,0,i missed the new moon trailer
2,3,1,omg its already 730 o
3,4,0,omgaga im sooo im gunna cry ive be...
4,5,0,i think mi bf is cheating on me tt


#### Tokenization & removing stopwords & lemmatization

In [14]:
import nltk

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prixe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prixe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prixe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

for i, row in df_pre.iterrows():
    # Tokenization
    tokens = word_tokenize(row["SentimentText"])
    # removing stopwords
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # preprocessed text
    df_pre.loc[i, "SentimentText"] = " ".join(lemmatized_tokens)
    if i % 10000 == 0:
        print(i, end=" ")


0 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 150000 160000 170000 180000 190000 200000 210000 220000 230000 240000 250000 260000 270000 280000 290000 300000 310000 320000 330000 340000 350000 360000 370000 380000 390000 400000 410000 420000 430000 440000 450000 460000 470000 480000 490000 500000 510000 520000 530000 540000 550000 560000 570000 580000 590000 600000 610000 620000 630000 640000 650000 660000 670000 680000 690000 700000 710000 720000 730000 740000 750000 760000 770000 780000 790000 800000 810000 820000 830000 840000 850000 860000 870000 880000 890000 900000 910000 920000 930000 940000 950000 960000 970000 980000 990000 1000000 1010000 1020000 1030000 1040000 1050000 1060000 1070000 1080000 1090000 1100000 1110000 1120000 1130000 1140000 1150000 1160000 1170000 1180000 1190000 1200000 1210000 1220000 1230000 1240000 1250000 1260000 1270000 1280000 1290000 1300000 1310000 1320000 1330000 1340000 1350000 1360000 1370000 1380000 13

### Sentiment analysis

In [18]:
df_pre.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,sad apl friend
1,2,0,missed new moon trailer
2,3,1,omg already 730
3,4,0,omgaga im sooo im gunna cry ive dentist since ...
4,5,0,think mi bf cheating tt


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(df['SentimentText'], df['Sentiment'], test_size=0.2, random_state=42)

In [21]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [25]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [26]:
# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{classification_rep}")

Accuracy: 0.7702036278636653
Confusion Matrix:
[[122174  35749]
 [ 36803 120997]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77    157923
           1       0.77      0.77      0.77    157800

    accuracy                           0.77    315723
   macro avg       0.77      0.77      0.77    315723
weighted avg       0.77      0.77      0.77    315723

