In [321]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import string
import emoji
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [322]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

#Data Reading

In [323]:
# Load dataset
df = pd.read_csv('sentimentdataset.csv')

# Drop unnecessary columns
df.drop(columns=['ID', 'User'], inplace=True)

# Text Pre-processing

In [324]:
# Define preprocessing functions
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text).lower().strip()

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [325]:
chat_words = {
    "AFAIK": "As Far As I Know", "AFK": "Away From Keyboard", "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard", "ATM": "At The Moment", "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard", "BBL": "Be Back Later", "BBS": "Be Back Soon",
    "BFN": "Bye For Now", "B4N": "Bye For Now", "BRB": "Be Right Back",
    "BRT": "Be Right There", "BTW": "By The Way", "B4": "Before",
    "CU": "See You", "CUL8R": "See You Later", "CYA": "See You",
    "FAQ": "Frequently Asked Questions", "FC": "Fingers Crossed", "FWIW": "For What It's Worth",
    "FYI": "For Your Information", "GAL": "Get A Life", "GG": "Good Game",
    "GN": "Good Night", "GMTA": "Great Minds Think Alike", "GR8": "Great!",
    "G9": "Genius", "IC": "I See", "ICQ": "I Seek you", "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion", "IMO": "In My Opinion", "IOW": "In Other Words",
    "IRL": "In Real Life", "KISS": "Keep It Simple, Stupid", "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off", "LOL": "Laughing Out Loud", "LTNS": "Long Time No See",
    "L8R": "Later", "MTE": "My Thoughts Exactly", "M8": "Mate",
    "NRN": "No Reply Necessary", "OIC": "Oh I See", "PITA": "Pain In The A..",
    "PRT": "Party", "PRW": "Parents Are Watching", "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing", "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off", "SK8": "Skate",
    "STATS": "Your sex and age", "ASL": "Age, Sex, Location", "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!", "TTYL": "Talk To You Later", "U": "You",
    "U2": "You Too", "U4E": "Yours For Ever", "WB": "Welcome Back",
    "WTF": "What The F...", "WTG": "Way To Go!", "WUF": "Where Are You From?",
    "W8": "Wait...", "7K": "Sick:-D Laugher", "TFW": "That feeling when",
    "MFW": "My face when", "MRW": "My reaction when", "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh", "JK": "Just kidding", "IDC": "I don't care",
    "ILY": "I love you", "IMU": "I miss you", "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired", "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes", "BAE": "Before anyone else", "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink", "BWL": "Bursting with laughter",
    "BFF": "Best friends forever", "CSL": "Can't stop laughing"
}

In [326]:
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

def remove_stopwords(text):
    stopword = stopwords.words('english')
    new_text = ['' if word in stopword else word for word in text.split()]
    return " ".join(new_text)

def lem_words(word_list):
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(word) for word in word_list]


In [343]:
# Complete preprocessing function
def preprocess_text(text):
    text = remove_html_tags(text)
    text = remove_url(text)
    text = emoji.demojize(text)
    text = remove_punc(text)
    text = chat_conversion(text)
    text = remove_stopwords(text)
    text = word_tokenize(text)
    text = lem_words(text)
    return " ".join(text)

# Apply preprocessing to dataset
df['Sentiment (Label)'] = df['Sentiment (Label)'].apply(remove_html_tags).apply(remove_url).str.lower().str.strip()
df['Text'] = df['Text'].apply(preprocess_text)


In [357]:
for text in ["happy", "sad", "okay", "love", "hate","positive","negative"]:
    print(f"{text}: {analyzer.polarity_scores(text)['compound']}")

happy: 0.5719
sad: -0.4767
okay: 0.2263
love: 0.6369
hate: -0.5719
positive: 0.5574
negative: -0.5719


In [360]:
# Sentiment categorization with stricter thresholds
analyzer = SentimentIntensityAnalyzer()

def categorize_sentiment(word):
    score = analyzer.polarity_scores(word)['compound']
    if score >= 0.5:  # Stricter threshold
        return 'positive'
    elif score <= -0.4:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_category (y)'] = df['Sentiment (Label)'].apply(categorize_sentiment)


In [361]:
for words in df[['Sentiment (Label)','sentiment_category (y)']].values:
    print(words)


['positive' 'positive']
['negative' 'negative']
['positive' 'positive']
['positive' 'positive']
['neutral' 'neutral']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['negative' 'negative']
['neutral' 'neutral']
['positive' 'positive']
['negative' 'negative']
['positive' 'positive']
['positive' 'positive']
['neutral' 'neutral']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['neutral' 'neutral']
['negative' 'negative']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']
['positive' 'positive']


In [362]:
label_encoder = LabelEncoder()
df['sentiment_category (y)'] = label_encoder.fit_transform(df['sentiment_category (y)'])

# Check class distribution
print("Class distribution:\n", df['sentiment_category (y)'].value_counts())
print("Label mappings:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


Class distribution:
 sentiment_category (y)
1    429
2    180
0    123
Name: count, dtype: int64
Label mappings: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}


In [363]:
# Vectorize text
vectorizer_text = TfidfVectorizer()
X = vectorizer_text.fit_transform(df['Text'])

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, df['sentiment_category (y)'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=5)

In [364]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [365]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Perform cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, df['sentiment_category (y)'], cv=kfold)

# Print the cross-validation scores
print("Cross-validation scores:", scores)

# Print the mean and standard deviation of the cross-validation scores
print("Mean accuracy:", scores.mean())
print("Standard deviation:", scores.std())

predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Cross-validation scores: [0.63265306 0.53061224 0.56164384 0.73287671 0.69863014]
Mean accuracy: 0.6312831982107913
Standard deviation: 0.07725672845961074
Accuracy: 0.8255813953488372


# **Logistic Regression**

In [366]:
# Train LogisticRegression with balanced class weights
classifier = LogisticRegression(class_weight='balanced', random_state=0)
classifier.fit(X_train, y_train)

# Cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(classifier, X_balanced, y_balanced, cv=kfold)
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())
print("Standard deviation:", scores.std())

# Test set evaluation
y_pred = classifier.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Cross-validation scores: [0.91860465 0.90310078 0.92217899 0.90661479 0.91828794]
Mean accuracy: 0.9137574276837691
Standard deviation: 0.00747696934787137
Test Accuracy: 0.8992248062015504
Classification Report:
               precision    recall  f1-score   support

    negative       0.91      0.94      0.92        71
     neutral       0.89      0.82      0.86        91
    positive       0.90      0.94      0.92        96

    accuracy                           0.90       258
   macro avg       0.90      0.90      0.90       258
weighted avg       0.90      0.90      0.90       258






# **SVM**

In [367]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear')  # You can choose different kernels such as 'linear', 'rbf', 'poly', etc.

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)

svm_scores = cross_val_score(svm_classifier, X, df['sentiment_category (y)'], cv=kfold)

# Print the cross-validation scores
print("Cross-validation scores:", svm_scores)

# Print the mean and standard deviation of the cross-validation scores
print("Mean accuracy:", svm_scores.mean())
print("Standard deviation:", svm_scores.std())

# Predict on the test set
y_pred_svm = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", accuracy)



Cross-validation scores: [0.65986395 0.68707483 0.66438356 0.73972603 0.75342466]
Mean accuracy: 0.7008946044171093
Standard deviation: 0.03866482449751172
Accuracy: 0.9224806201550387


In [368]:
# Complete preprocessing function for a single input
def preprocess_text(text):
    text = remove_html_tags(text)
    text = remove_url(text)
    text = emoji.demojize(text)
    text = remove_punc(text)
    text = chat_conversion(text)
    text = remove_stopwords(text)
    text = word_tokenize(text)
    text = lem_words(text)
    return " ".join(text) # Join the list of tokens back into a string

# Initialize models
models = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(class_weight='balanced', random_state = 0, max_iter=1000),
    'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=0, n_estimators=100),
    'SVM': SVC(class_weight='balanced', random_state=0, kernel='linear')
}

In [369]:
# Train and evaluate each model
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\n--- {name} ---")
    # Train model
    model.fit(X_train, y_train)
    # Cross-validation
    scores = cross_val_score(model, X, df['sentiment_category (y)'], cv=kfold)
    print("Cross-validation scores:", scores)
    print("Mean accuracy:", scores.mean())
    print("Standard deviation:", scores.std())
    # Test set evaluation
    y_pred = model.predict(X_test)
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=label_encoder.transform(label_encoder.classes_)))


--- MultinomialNB ---
Cross-validation scores: [0.63265306 0.53061224 0.56164384 0.73287671 0.69863014]
Mean accuracy: 0.6312831982107913
Standard deviation: 0.07725672845961074
Test Accuracy: 0.8255813953488372
Classification Report:
               precision    recall  f1-score   support

    negative       0.80      0.99      0.88        71
     neutral       0.93      0.56      0.70        91
    positive       0.80      0.96      0.87        96

    accuracy                           0.83       258
   macro avg       0.84      0.83      0.82       258
weighted avg       0.84      0.83      0.81       258

Confusion Matrix:
 [[70  1  0]
 [17 51 23]
 [ 1  3 92]]

--- LogisticRegression ---
Cross-validation scores: [0.7414966  0.71428571 0.76027397 0.71917808 0.80821918]
Mean accuracy: 0.7486907091603765
Standard deviation: 0.03401254078498386
Test Accuracy: 0.8992248062015504
Classification Report:
               precision    recall  f1-score   support

    negative       0.91      

In [370]:
# Predict sentiment for a single input across all models
def predict_sentiment(input_text, models):
    processed_text = preprocess_text(input_text)
    text_features = vectorizer_text.transform([processed_text])
    predictions = {}
    for name, model in models.items():
        prediction = model.predict(text_features)
        predictions[name] = label_encoder.inverse_transform(prediction)[0]
    return predictions

In [373]:
##Test with diverse inputs
#test_inputs = ["I love this!", "This is terrible", "It's okay", "Hate this product", "Neutral comment"]
#print("\nTest Inputs Predictions:")
#for text in test_inputs:
 #   predictions = predict_sentiment(text, models)
  #  print(f"\nInput: {text}")
   # for model_name, pred in predictions.items():
     #   print(f"  {model_name}: {pred}")

# Interactive input
print("\nInteractive Testing:")
while True:
    x = input("Enter a word or sentence to predict sentiment (or 'exit' to quit): ")
    if x.lower() == 'exit':
        break
    predictions = predict_sentiment(x, models)
    print(f"\nInput: {x}")
    for model_name, pred in predictions.items():
        print(f"  {model_name}: {pred}")


Interactive Testing:
Enter a word or sentence to predict sentiment (or 'exit' to quit): exit
