In [1]:
file_path = '/Users/shiveshkodali/Desktop/GitProjects/SentimentAnalysis/'

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
data = pd.read_csv(file_path+'Data/train.tsv',sep='\t')

In [4]:
documents = list(data['Phrase'])
labels = list(data['Sentiment'])

In [5]:
import string

def remove_punctuation(text):
    return ''.join(char for char in text if char not in string.punctuation)

In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(text):
    words = text.split()
    stop_words = set(stopwords.words('english')) 
    filtered_words = [word for word in words if word.lower() not in stop_words]
    cleaned_text = ' '.join(filtered_words)
    return cleaned_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shiveshkodali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import re

def remove_special_characters_and_numbers(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

In [8]:
documents = [remove_punctuation(text) for text in tqdm(documents)]

100%|███████████████████████████████| 156060/156060 [00:00<00:00, 636600.76it/s]


In [9]:
documents = [remove_stopwords(text) for text in tqdm(documents)]

100%|████████████████████████████████| 156060/156060 [00:05<00:00, 28361.83it/s]


In [10]:
documents = [remove_special_characters_and_numbers(text) for text in tqdm(documents)]

100%|██████████████████████████████| 156060/156060 [00:00<00:00, 2261778.02it/s]


In [11]:
documents = [i.lower() for i in documents]

In [12]:
data_dict = {documents[i]:labels[i] for i in range(len(documents)) if documents[i]!=''}

In [13]:
documents = list(data_dict.keys())
labels = list(data_dict.values())

In [14]:
embeddings_index = {} # empty dictionary
f = open('glove.6B.200d.txt', encoding="utf-8")

for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

400000it [00:06, 58770.31it/s]

Found 400000 word vectors.





In [15]:
def embedding_vector(line):
    emb_vec = np.zeros(200)
    for word in line.split():
        if word in embeddings_index.keys():
            emb_vec+=(embeddings_index[word])
    emb_vec = np.array(emb_vec)
    return emb_vec
        

In [16]:
emb_vec = embedding_vector('my name is shivesh')

In [17]:
emb_vec.shape

(200,)

In [18]:
X = []
for line in tqdm(documents):
    X.append(embedding_vector(line))
    
X = np.array(X)
X.shape

100%|█████████████████████████████████| 84429/84429 [00:00<00:00, 323647.54it/s]


(84429, 200)

In [19]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(list(labels))

In [20]:
y.shape

(84429,)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB


# Create and train different machine learning models
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=2000),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes" : MultinomialNB()
}

def sentiment_analysis(model_name):
    model = models[model_name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Display results
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}\n")
    print(report)
    print("=" * 50)
    return

In [23]:
sentiment_analysis('Logistic Regression')

Model: Logistic Regression
Accuracy: 0.58

              precision    recall  f1-score   support

           0       0.44      0.11      0.18       746
           1       0.44      0.25      0.32      3052
           2       0.62      0.89      0.73      8386
           3       0.50      0.36      0.41      3669
           4       0.50      0.17      0.26      1033

    accuracy                           0.58     16886
   macro avg       0.50      0.36      0.38     16886
weighted avg       0.54      0.58      0.53     16886



In [24]:
sentiment_analysis('Decision Tree')

Model: Decision Tree
Accuracy: 0.45

              precision    recall  f1-score   support

           0       0.15      0.16      0.15       746
           1       0.29      0.29      0.29      3052
           2       0.62      0.62      0.62      8386
           3       0.32      0.31      0.32      3669
           4       0.22      0.22      0.22      1033

    accuracy                           0.45     16886
   macro avg       0.32      0.32      0.32     16886
weighted avg       0.45      0.45      0.45     16886



In [25]:
def embedding_vector(line,max_len):
    emb_vec = []
    for i in range(max_len):
        words = line.split()
        if i<len(words):
            if word in embeddings_index.keys():
                emb_vec.append(embeddings_index[word])
            else:
                emb_vec.append(np.zeros(200))
        else:
            emb_vec.append(np.zeros(200))
    emb_vec = np.array(emb_vec)
    return emb_vec.reshape(max_len*200)

In [26]:
embedding_vector('my name is shivesh', 5).shape

(1000,)

In [27]:
X = []
max_len = 5
for line in tqdm(documents):
    X.append(embedding_vector(line,max_len))
    
X = np.array(X)
X.shape

100%|█████████████████████████████████| 84429/84429 [00:00<00:00, 338621.83it/s]


(84429, 1000)

In [28]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(list(labels))

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
X_train.shape

(67543, 1000)

In [31]:
sentiment_analysis('Logistic Regression')

Model: Logistic Regression
Accuracy: 0.50

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       746
           1       0.00      0.00      0.00      3052
           2       0.50      1.00      0.66      8386
           3       0.00      0.00      0.00      3669
           4       0.00      0.00      0.00      1033

    accuracy                           0.50     16886
   macro avg       0.10      0.20      0.13     16886
weighted avg       0.25      0.50      0.33     16886



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
