In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import warnings

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nltk
# Fetch the NLTK data packages used by the preprocessing cells below.
nltk.download('punkt')      # Punkt tokenizer models (used by word_tokenize)
nltk.download('wordnet')    # WordNet database (used by WordNetLemmatizer)
nltk.download('stopwords')  # stop-word lists (read through stopwords.words)
%matplotlib inline

warnings.filterwarnings('ignore')

### Loading the dataset

In [None]:
# First look at the raw file. No `names` are passed here, so pandas uses the
# first row of the CSV as the header; the next cell reloads the file with
# explicit column names.
df = pd.read_csv('training.csv', encoding = 'ISO-8859-1')
df.head()

In [None]:
# Reload the file with explicit labels for its six fields instead of letting
# pandas infer a header from the first row.
column_names = [
    'target',
    'ID',
    'date',
    'flag',
    'user',
    'text',
]
df = pd.read_csv(
    'training.csv',
    encoding='ISO-8859-1',
    names=column_names,
)
df.head()

In [None]:
# Number of missing values in each column (isna is the pandas alias of isnull).
df.isna().sum()

In [None]:
df['target'].value_counts()
# first 0.8 million tweets are negative and next 0.8 million tweets are positive (1.6 million rows in total)

##### converting the value of 4 to 1 in the target column

In [None]:
# Recode the label 4 as 1 in the target column; all other values are untouched.
df['target'] = df['target'].replace(4, 1)

In [None]:
df.head()

### Data Preprocessing

In [None]:
# convert the tweet text to lower case
df['text'] = df['text'].str.lower()

In [None]:
# Delete anything that looks like a link (http..., https..., www...) from each tweet.
# Note: the `https\S+` alternative can never win, because `http\S+` already matches it.
url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
df['text'] = df['text'].apply(lambda tweet: url_pattern.sub('', tweet))

In [None]:
# Drop every character that is neither a word character nor whitespace
# (punctuation and symbols; underscores count as word characters and stay).
non_word_pattern = re.compile(r'[^\w\s]')
df['text'] = df['text'].apply(lambda tweet: non_word_pattern.sub('', tweet))

In [None]:
# Numbers are treated as irrelevant for this task, so every run of digits is deleted.
digit_pattern = re.compile(r'\d+')
df['text'] = df['text'].apply(lambda tweet: digit_pattern.sub('', tweet))

In [96]:
df.head()

Unnamed: 0,target,ID,date,flag,user,text,tokens,predicted_sentiment,true_sentiment_mapped
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,switchfoot a thats a bummer you shoulda got...,switchfoot thats bummer shoulda get david carr...,negative,negative
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he cant update his facebook by t...,upset cant update facebook texting might cry r...,negative,negative
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan i dived many times for the ball manag...,kenichan dive many time ball manage save rest ...,positive,negative
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire,negative,negative
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass no its not behaving at all im ...,nationwideclass behave im mad cant see,negative,negative


### Tokenization & Lemmatization

In [None]:
# Split each cleaned tweet into a list of word tokens, stored in a new column.
column = 'text'
df['tokens'] = df[column].map(word_tokenize)
df.head()

In [None]:
# NLTK's English stop-word list, stored as a set for constant-time lookups.
stop_words = set(stopwords.words('english'))

# Filter every token list with a single column-wise apply rather than a
# row-by-row loop that writes each list back through df.at: same result,
# far less per-row overhead on a frame of this size, and no loop variables
# left behind in the notebook namespace.
df['tokens'] = df['tokens'].apply(
    lambda tokens: [word for word in tokens if word.lower() not in stop_words]
)

df.head()

In [None]:
from nltk.corpus import wordnet

# Extra NLTK data for the lemmatization step below.
nltk.download('wordnet')  # already requested in the first cell of the notebook
nltk.download('omw-1.4')  # Open Multilingual Wordnet data; presumably needed by the installed WordNet reader - TODO confirm
nltk.download('averaged_perceptron_tagger')  # tagger model behind nltk.pos_tag

In [None]:
lemmatizer = WordNetLemmatizer()

# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Every other tag (including an empty one) falls back to noun.

    Parameters
    ----------
    tag : str
        POS tag as produced by ``nltk.pos_tag``.

    Returns
    -------
    The corresponding ``wordnet`` POS constant.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # No known prefix matched: treat the word as a noun.
    return wordnet.NOUN

In [None]:
def lemmatize_tokens(tokens):
    """Lemmatize a list of tokens, using each token's part of speech.

    The tokens are POS-tagged together with ``nltk.pos_tag``; every tag is
    mapped to a WordNet POS by ``get_wordnet_pos`` and passed to the shared
    ``lemmatizer``.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document, in order.

    Returns
    -------
    list
        One lemma per input token, in the same order.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(pos_tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

# Apply lemmatization to the 'tokens' column
df['tokens'] = df['tokens'].apply(lemmatize_tokens)

df.head()

##### Each entry in the tokens column is a list of tokens, but TF-IDF (Term Frequency-Inverse Document Frequency) vectorization requires string input, so we join each list back into a single space-separated string

In [None]:
# Join each token list into one space-separated string. Entries that are
# already strings are returned unchanged, so re-running this cell is harmless:
# without the guard a second run would split every string into single
# characters (' '.join('abc') == 'a b c').
df['tokens'] = df['tokens'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [None]:
print(df['tokens'].values)

### Train_test_split and vectorization

In [None]:
from sklearn.model_selection import train_test_split

X = df['tokens'].values  # features: the preprocessed 'tokens' column as an array
Y = df['target'].values  # labels: the 'target' column (0 = negative, 1 = positive per sentiment_map)

In [None]:
print(X)

In [None]:
print(Y)

In [None]:
# Step 1: keep 70% for training and hold out 30%.
# Step 2: split the held-out 30% so that one third of it (10% of all rows)
# becomes the test set and the remaining two thirds (20%) the validation set.
# A fixed random_state is passed to both calls so the shuffles are repeatable.
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=1/3, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training split only.
# NOTE: X_train / X_val / X_test are rebound here from raw text arrays to
# TF-IDF matrices, so re-run the split cell before re-running this cell.
X_train = tfidf_vectorizer.fit_transform(X_train)

# Validation and test text is projected onto the already-fitted vocabulary
# (transform only, no refitting).
X_val = tfidf_vectorizer.transform(X_val)
X_test = tfidf_vectorizer.transform(X_test)

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [98]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
# (fit returns the estimator itself, so construction and training are chained).
nb_model = MultinomialNB().fit(X_train, Y_train)

# Label the validation split with the trained model.
y_val_pred = nb_model.predict(X_val)

# Report overall accuracy followed by per-class precision / recall / F1.
print("Validation Performance:")
accuracy = accuracy_score(Y_val, y_val_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(Y_val, y_val_pred))

Validation Performance:
Accuracy: 0.75758125
              precision    recall  f1-score   support

           0       0.74      0.80      0.77    159532
           1       0.78      0.72      0.75    160468

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000



In [97]:
# Predict on the test set
y_test_pred = nb_model.predict(X_test)

# Evaluation of the model on the test set
print("Test Performance:")
accuracy = accuracy_score(Y_test, y_test_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(Y_test, y_test_pred))

Test Performance:
Accuracy: 0.75900625
              precision    recall  f1-score   support

           0       0.74      0.80      0.77     79829
           1       0.78      0.72      0.75     80171

    accuracy                           0.76    160000
   macro avg       0.76      0.76      0.76    160000
weighted avg       0.76      0.76      0.76    160000



In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for MultinomialNB's additive smoothing parameter `alpha`.
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}

# Grid Search for optimal alpha:
# 5-fold cross-validation on the training split, candidates ranked by accuracy.
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

# Best parameters
print(f"Best Alpha: {grid_search.best_params_}")

# Evaluate the best model on the validation split.
# NOTE: the interactive prediction cell further down calls nb_model, not
# best_nb_model, so the tuned estimator is only used in this cell.
best_nb_model = grid_search.best_estimator_
y_val_pred = best_nb_model.predict(X_val)
print("Validation Performance (After Tuning):")
print(classification_report(Y_val, y_val_pred))

## Testing on user input

In [92]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean one raw sentence exactly the way the training tweets were cleaned.

    The steps mirror the training cells in the same order: lower-casing,
    URL removal, punctuation/symbol removal, digit removal, tokenization,
    stop-word removal and POS-aware lemmatization. Skipping any of them at
    prediction time yields tokens the fitted vectorizer never saw; for
    example "can't" must become "cant" to match the trained vocabulary.

    Parameters
    ----------
    text : str
        Raw sentence typed by the user.

    Returns
    -------
    str
        Space-joined lemmas, ready for ``tfidf_vectorizer.transform``.
    """
    text = text.lower()

    # Same regex clean-up (and same order) as applied to the training column.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    tokens = word_tokenize(text)

    # Stop words are dropped before lemmatization, as in training, so the
    # POS tagger sees the same kind of token sequence it saw there.
    tokens = [word for word in tokens if word.lower() not in stop_words]

    lemmatized_tokens = lemmatize_tokens(tokens)

    # The vectorizer expects one string per document.
    return ' '.join(lemmatized_tokens)

sentiment_map = {0: 'negative', 1: 'positive'}

In [95]:
while True:
    # Take user input; surrounding whitespace is stripped so ' exit ' still quits.
    user_input = input("Enter a sentence for sentiment analysis (or type 'exit' to quit): ").strip()

    if user_input.lower() == 'exit':
        print("Exiting the program.")
        break

    # Blank input would vectorize to an all-zero row, and the model would only
    # echo its class prior, so ask again instead of printing a meaningless label.
    if not user_input:
        print("Please enter a non-empty sentence.\n")
        continue

    # Preprocess the input
    preprocessed_sentence = preprocess_text(user_input)

    # Vectorize the input using the trained TfidfVectorizer
    X_input = tfidf_vectorizer.transform([preprocessed_sentence])

    # Predict sentiment
    predicted_sentiment = nb_model.predict(X_input)[0]  # Get the first (and only) prediction

    # Map the prediction to its corresponding sentiment label
    sentiment_label = sentiment_map[predicted_sentiment]

    print(f"Predicted Sentiment: {sentiment_label}\n")

Enter a sentence for sentiment analysis (or type 'exit' to quit):  my good luck is very bad


Predicted Sentiment: positive



Enter a sentence for sentiment analysis (or type 'exit' to quit):  my bad luck is very good


Predicted Sentiment: positive



Enter a sentence for sentiment analysis (or type 'exit' to quit):  my bad luck is always bad


Predicted Sentiment: negative



Enter a sentence for sentiment analysis (or type 'exit' to quit):  exit


Exiting the program.
