In [1]:
import os

# Project root directory. Set the SENTIMENT_ANALYSIS_DIR environment variable to run
# the notebook on another machine; otherwise the original location is used.
# os.path.join(..., '') guarantees a trailing separator, which later cells rely on
# because they build paths by plain concatenation (file_path + 'Data/train.tsv').
file_path = os.path.join(
    os.environ.get(
        'SENTIMENT_ANALYSIS_DIR',
        '/Users/shiveshkodali/Desktop/GitProjects/SentimentAnalysis/',
    ),
    '',
)

import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the tab-separated training set from the project's Data/ folder.
train_tsv = file_path + 'Data/train.tsv'
data = pd.read_csv(train_tsv, sep='\t')

In [3]:
# Extract the phrase texts and their sentiment labels as two parallel lists
# (same row order as the DataFrame).
documents, labels = (list(data[column]) for column in ('Phrase', 'Sentiment'))

In [4]:
# Peek at 10 randomly drawn phrases. No random_state is passed to sample(), so the
# selection is not reproducible between runs.
list(data.sample(10)['Phrase'])
# Cleaning needed, judging by the sample: remove punctuation, special characters
# and numbers, and lowercase the text.

['into that annoying specimen of humanity',
 'Smokers Only',
 "It 's not the worst comedy of the year , but it certainly wo n't win any honors .",
 'Wesley Snipes',
 'stooping',
 'Melodrama with a message',
 ", so it 's not a brilliant piece of filmmaking , but it is a funny -LRB- sometimes hilarious -RRB- comedy with a deft sense of humor about itself , a playful spirit and a game cast .",
 'his victories',
 'two-dimensional characters',
 'mention a convincing brogue']

In [5]:
import string

def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Nothing is inserted in place of a removed character, so whitespace around
    punctuation is left exactly as it was.
    """
    kept_chars = []
    for symbol in text:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)
    return ''.join(kept_chars)

In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, every word whose lowercase form is in
    ``stop_words`` is dropped, and the survivors are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Input text.
    stop_words : container of str, optional
        Lowercase words to drop (anything supporting ``in``). When omitted, the
        NLTK English stop-word list is used.

    Returns
    -------
    str
        The filtered text; an empty string if no word survives.
    """
    if stop_words is None:
        # Load the NLTK list once and cache it on the function object; the original
        # code re-read and re-built the set on every call, i.e. once per document.
        stop_words = getattr(remove_stopwords, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english_stop_words = stop_words
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shiveshkodali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import re

def remove_special_characters_and_numbers(text):
    """Delete every character that is not an ASCII letter or whitespace.

    Digits, punctuation and non-ASCII symbols are removed without a replacement,
    so the whitespace around them is preserved unchanged.
    """
    non_letter = re.compile(r'[^a-zA-Z\s]')
    return non_letter.sub('', text)

In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')  # Download the Punkt tokenizer models
nltk.download('wordnet')  # Download the WordNet lemmatizer data

# One shared WordNet lemmatizer instance, reused by every call below
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Tokenize ``text`` with NLTK and return the lemmatized tokens joined by spaces.

    Each token is passed to ``lemmatizer.lemmatize`` with its default arguments;
    the resulting words are re-assembled into one space-separated string.
    """
    tokens = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shiveshkodali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shiveshkodali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
!pip install TextBlob
!pip install tqdm



In [10]:
from tqdm import tqdm

In [11]:
# Cleaning stage 1: strip punctuation from every phrase (overwrites `documents`).
documents = [
    remove_punctuation(phrase)
    for phrase in tqdm(documents)
]

100%|███████████████████████████████| 156060/156060 [00:00<00:00, 629230.88it/s]


In [12]:
# Cleaning stage 2: drop English stop words from every phrase (overwrites `documents`).
documents = [
    remove_stopwords(phrase)
    for phrase in tqdm(documents)
]

100%|████████████████████████████████| 156060/156060 [00:05<00:00, 28396.95it/s]


In [13]:
# Cleaning stage 3: remove digits and any remaining non-letter characters
# (overwrites `documents`).
documents = [
    remove_special_characters_and_numbers(phrase)
    for phrase in tqdm(documents)
]

100%|██████████████████████████████| 156060/156060 [00:00<00:00, 2253043.06it/s]


In [14]:
# Cleaning stage 4: tokenize and lemmatize every phrase (overwrites `documents`).
documents = [
    lemmatize_text(phrase)
    for phrase in tqdm(documents)
]

100%|████████████████████████████████| 156060/156060 [00:04<00:00, 33347.78it/s]


In [15]:
# Map each non-empty cleaned phrase to its label. Phrases are dict keys, so phrases
# that became identical after cleaning collapse into a single entry that keeps the
# label of the last occurrence.
data_dict = {
    phrase: sentiment
    for phrase, sentiment in zip(documents, labels)
    if phrase != ''
}

In [16]:
# Rebuild the parallel document/label sequences from the de-duplicated mapping.
# They are materialised as lists (not live dict views) so they keep the type they
# had before, stay indexable, and do not change if data_dict is modified later.
documents = list(data_dict.keys())
labels = list(data_dict.values())

In [17]:
# Bag-of-words features: one row per cleaned phrase, one column per vocabulary term.
# NOTE(review): the vocabulary is fit on all phrases before the train/test split made
# in a later cell, so held-out phrases contribute to the vocabulary - confirm this is
# acceptable for the evaluation.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

In [18]:
# Encode the sentiment labels as integer class ids; list() materialises `labels`
# in case it is not already a list.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(list(labels))

In [19]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Training feature matrix: (number of training samples, number of features)
X_train.shape

(66728, 14987)

In [21]:
# Test feature matrix: (number of test samples, number of features)
X_test.shape

(16683, 14987)

In [22]:
# Training labels: one encoded class id per training sample
y_train.shape

(66728,)

In [23]:
# Test labels: one encoded class id per test sample
y_test.shape

(16683,)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB


# Registry of the (not yet fitted) classifiers to compare, keyed by display name.
# Training happens later, inside sentiment_analysis().
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=1000),
    # Fixed seed: the tree builder permutes features at random, so without
    # random_state the fitted tree (and its scores) can differ between runs.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes" : MultinomialNB()
}

def sentiment_analysis(model_name):
    """Fit one registered model on the current split and print its test metrics.

    Looks ``model_name`` up in the global ``models`` dict, fits that estimator on
    the global ``X_train``/``y_train``, predicts ``X_test`` and prints the accuracy
    and the classification report. Raises ``KeyError`` for an unknown name.
    Returns ``None``.
    """
    classifier = models[model_name]
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Score the predictions against the held-out labels
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    # Emit the same four lines as separate print calls would, in one call
    print(
        f"Model: {model_name}",
        f"Accuracy: {accuracy:.2f}\n",
        report,
        "=" * 50,
        sep="\n",
    )

In [25]:
# Logistic regression on the bag-of-words (CountVectorizer) features
sentiment_analysis('Logistic Regression')

Model: Logistic Regression
Accuracy: 0.62

              precision    recall  f1-score   support

           0       0.51      0.25      0.34       749
           1       0.52      0.35      0.42      2921
           2       0.66      0.86      0.75      8401
           3       0.55      0.42      0.48      3598
           4       0.56      0.33      0.42      1014

    accuracy                           0.62     16683
   macro avg       0.56      0.44      0.48     16683
weighted avg       0.60      0.62      0.59     16683



In [26]:
# Decision tree on the bag-of-words (CountVectorizer) features
sentiment_analysis('Decision Tree')

Model: Decision Tree
Accuracy: 0.56

              precision    recall  f1-score   support

           0       0.36      0.30      0.33       749
           1       0.44      0.42      0.43      2921
           2       0.67      0.72      0.70      8401
           3       0.47      0.43      0.45      3598
           4       0.37      0.35      0.36      1014

    accuracy                           0.56     16683
   macro avg       0.46      0.44      0.45     16683
weighted avg       0.56      0.56      0.56     16683



In [27]:
# Multinomial naive Bayes on the bag-of-words (CountVectorizer) features
sentiment_analysis('Naive Bayes')

Model: Naive Bayes
Accuracy: 0.57

              precision    recall  f1-score   support

           0       0.37      0.20      0.26       749
           1       0.46      0.42      0.44      2921
           2       0.66      0.74      0.70      8401
           3       0.48      0.50      0.49      3598
           4       0.44      0.23      0.31      1014

    accuracy                           0.57     16683
   macro avg       0.48      0.42      0.44     16683
weighted avg       0.56      0.57      0.56     16683



In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer capped at 1000 features, with scikit-learn's built-in
# English stop-word filter
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Fit and transform the text data
# NOTE(review): the vectorizer is fit on all phrases before the train/test split,
# so IDF weights are influenced by the held-out phrases - confirm this is acceptable.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Keep the features as the sparse matrix returned by the vectorizer. Converting to
# a dense array (.toarray()) allocates a full documents x 1000 float array for no
# benefit: train_test_split and all three classifiers used below accept sparse
# input, exactly as in the bag-of-words pass earlier in the notebook.
X = tfidf_matrix

# Access the feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

In [29]:
# Re-split using the TF-IDF features; same 20% hold-out and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Fresh, unfitted classifiers for the TF-IDF experiment, keyed by display name.
# Training happens later, inside sentiment_analysis().
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=1000),
    # Fixed seed: the tree builder permutes features at random, so without
    # random_state the fitted tree (and its scores) can differ between runs.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes" : MultinomialNB()
}

def sentiment_analysis(model_name):
    """Fit one registered model on the current split and print its test metrics.

    This repeats the definition given in an earlier cell of the notebook; it reads
    the global ``models``, ``X_train``, ``X_test``, ``y_train`` and ``y_test`` at
    call time, so it evaluates whatever those names are bound to when called.
    Raises ``KeyError`` for an unknown ``model_name``. Returns ``None``.
    """
    classifier = models[model_name]
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Score the predictions against the held-out labels
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    # Emit the same four lines as separate print calls would, in one call
    print(
        f"Model: {model_name}",
        f"Accuracy: {accuracy:.2f}\n",
        report,
        "=" * 50,
        sep="\n",
    )

In [31]:
# Logistic regression on the TF-IDF features
sentiment_analysis('Logistic Regression')

Model: Logistic Regression
Accuracy: 0.58

              precision    recall  f1-score   support

           0       0.41      0.10      0.16       749
           1       0.47      0.21      0.29      2921
           2       0.61      0.88      0.72      8401
           3       0.49      0.37      0.42      3598
           4       0.54      0.17      0.25      1014

    accuracy                           0.58     16683
   macro avg       0.50      0.35      0.37     16683
weighted avg       0.55      0.58      0.53     16683



In [None]:
# Decision tree on the TF-IDF features (this cell has no recorded output in the
# saved notebook)
sentiment_analysis('Decision Tree')

In [None]:
# Multinomial naive Bayes on the TF-IDF features (this cell has no recorded output
# in the saved notebook)
sentiment_analysis('Naive Bayes')