In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import re
import warnings
import string
import os
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices raised by the
# libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding train.csv and test.csv. Set the SENTIMENT_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get(
    'SENTIMENT_DATA_DIR',
    'C:\\Users\\rahul\\OneDrive\\Documents\\Enterprise',
)

TrainingData = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
TestingData = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# ignore_index=True gives the combined frame a fresh, unique RangeIndex instead
# of repeating each input frame's own row labels, so label-based lookups and
# index alignment on `df` stay unambiguous.
df = pd.concat([TrainingData, TestingData], ignore_index=True)

In [3]:
def remove_unnecessary_characters(text):
    """Strip tag-like markup and non-alphanumeric characters from ``text``.

    The input is converted with ``str()`` first, so non-string values are
    cleaned as their string representation.

    Steps, in order:
      1. remove every ``<...>`` span (non-greedy, within a single line);
      2. remove every character that is not an ASCII letter, a digit or
         whitespace;
      3. collapse whitespace runs to one space and trim both ends.

    Returns:
        The cleaned string.
    """
    without_tags = re.sub(r'<.*?>', '', str(text))
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)
    return re.sub(r'\s+', ' ', alnum_only).strip()

In [4]:
# Element-wise cleaning of the raw `text` column into a new `clean_text` column.
df['clean_text'] = df['text'].map(remove_unnecessary_characters)


In [5]:
def tokenize_text(text):
    """Split ``text`` into tokens with ``word_tokenize``.

    The value is converted with ``str()`` before tokenising. This is
    best-effort: if the conversion or the tokeniser raises any ``Exception``,
    the error is printed and an empty list is returned instead of propagating.

    Returns:
        The tokens produced by ``word_tokenize``, or ``[]`` on failure.
    """
    try:
        return word_tokenize(str(text))
    except Exception as e:
        print(f"Error tokenizing text: {e}")
        return []

# Tokenise the raw `text` column (not `clean_text`) into a new `tokens` column.
df['tokens'] = df['text'].map(tokenize_text)

In [6]:
def normalize_text(text):
    """Lower-case a string, drop punctuation and tidy its whitespace.

    For ``str`` input: lower-cases, removes every character that is neither a
    word character (``\\w``, which includes ``_``) nor whitespace, collapses
    whitespace runs to one space and trims both ends.

    Non-string input is NOT normalised; it is returned as ``str(text)``.

    Returns:
        The normalised string, or ``str(text)`` for non-string input.
    """
    if not isinstance(text, str):
        return str(text)

    lowered = text.lower()
    without_punct = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punct).strip()

# Normalise the raw `text` column into a new `normalized_text` column.
df['normalized_text'] = df['text'].map(normalize_text)

In [7]:
_ENGLISH_STOPWORDS = None  # filled in on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """Remove English stop words from a whitespace-separated string.

    Words are compared case-insensitively against ``stopwords.words('english')``.
    Surviving words keep their original casing and order and are re-joined with
    single spaces.

    The stop-word list is fetched once and cached as a set, instead of being
    re-read and scanned linearly for every word of every row.

    Parameters:
        text: The value to filter.

    Returns:
        The filtered string, or ``''`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''

    stop_set = _english_stopwords()
    kept_words = [word for word in text.split() if word.lower() not in stop_set]
    return ' '.join(kept_words)
# Filter stop words out of the raw `text` column into a new column.
df['text_without_stopwords'] = df['text'].map(remove_stopwords)

In [8]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

In [9]:
# Columns that are removed before modelling. errors='ignore' means a name that
# is not present in the frame is skipped instead of raising.
UNUSED_COLUMNS = [
    'textID',
    'Time of Tweet',
    'Age of User',
    'Country',
    'Population -2020',
    'Land Area (Km²)',
    'Density (P/Km²)',
]
df = df.drop(columns=UNUSED_COLUMNS, errors='ignore')


In [10]:
def wp(text):
    """Clean a text string before it is vectorised.

    Applies, in this order:
      1. removal of ``http://``/``https://`` URLs and ``www.`` addresses
         (up to the next whitespace);
      2. removal of ``<...>`` tag-like spans;
      3. removal of every character in ``string.punctuation``;
      4. removal of newline characters (the surrounding text is joined
         without a space);
      5. removal of every run of word characters that contains a digit.

    Whitespace left behind by removed pieces is not collapsed.

    Parameters:
        text: The string to clean. It must be a ``str``; ``re.sub`` raises
            ``TypeError`` for other types.

    Returns:
        The cleaned string.
    """
    # The patterns are raw strings so that \S, \., \w and \d reach the regex
    # engine as written and do not trigger Python's invalid-escape-sequence
    # warning for ordinary string literals.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [11]:
# Clean `selected_text` in place with wp(), then expose it as the model input
# X and the `sentiment` column as the target y.
df['selected_text'] = df['selected_text'].map(wp)
X, y = df['selected_text'], df['sentiment']

In [12]:
# Hold out 20% of the rows for testing; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
def get_sentiment(text):
    """Map the TextBlob polarity of ``text`` to a numeric sentiment code.

    Returns:
        4 when the polarity is greater than zero, 2 when it is exactly zero,
        and 0 otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 4  # Positive
    if polarity == 0:
        return 2  # Neutral
    return 0  # Negative

In [14]:
# Rule-based baseline: score every test text with TextBlob polarity.
y_pred_test = X_test.map(get_sentiment)


In [15]:
# Numeric codes for the string labels, matching the values get_sentiment() returns.
SENTIMENT_CODES = {'negative': 0, 'neutral': 2, 'positive': 4}
y_test_numeric = y_test.map(SENTIMENT_CODES)


In [16]:
print("\nTest Data Evaluation:")
print('Accuracy:', accuracy_score(y_test_numeric, y_pred_test))
# Pass the label codes explicitly so that each entry of target_names is tied to
# its code (0 -> Negative, 2 -> Neutral, 4 -> Positive) even when one of the
# codes does not occur in this particular split or in the predictions.
print(classification_report(
    y_test_numeric,
    y_pred_test,
    labels=[0, 2, 4],
    target_names=["Negative", "Neutral", "Positive"],
))


Test Data Evaluation:
Accuracy: 0.5589519650655022
              precision    recall  f1-score   support

    Negative       0.66      0.44      0.53      1572
     Neutral       0.50      0.52      0.51      2236
    Positive       0.58      0.72      0.64      1688

    accuracy                           0.56      5496
   macro avg       0.58      0.56      0.56      5496
weighted avg       0.57      0.56      0.55      5496



In [18]:
# Turn the text splits into TF-IDF feature matrices. The vectorizer is fitted
# on the training split only; the test split is transformed with that same
# fitted vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the TF-IDF training matrix against the string
# sentiment labels; random_state=0 fixes the estimator's seed.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(XV_train, y_train)

RandomForestClassifier(random_state=0)

In [20]:
# Predicted labels for the TF-IDF test matrix from the fitted forest.
pred_rfc = rfc.predict(XV_test)

In [21]:
# Score the fitted forest on the test split; the bare name on the last line
# makes the notebook display the value.
score_rfc = rfc.score(XV_test, y_test)
score_rfc

0.8114992721979621

In [22]:
# Per-class report for the random forest predictions on the test split.
print(classification_report(y_test, pred_rfc))


              precision    recall  f1-score   support

    negative       0.78      0.75      0.77      1572
     neutral       0.77      0.89      0.83      2236
    positive       0.92      0.76      0.83      1688

    accuracy                           0.81      5496
   macro avg       0.82      0.80      0.81      5496
weighted avg       0.82      0.81      0.81      5496



In [None]:
def predict_sentiment(text, model, vectorizer):
    """Predict a display-ready sentiment label for one text string.

    The text is cleaned with ``wp``, vectorised with ``vectorizer.transform``
    as a single-document batch, and classified with ``model.predict``.

    Parameters:
        text: The input text string.
        model: A fitted classifier exposing ``predict``.
        vectorizer: A fitted vectorizer exposing ``transform``.

    Returns:
        "Positive", "Negative" or "Neutral" when the lower-cased prediction
        matches one of those labels, otherwise "Unknown sentiment".
    """
    features = vectorizer.transform([wp(text)])
    label = model.predict(features)[0]

    display_names = {
        "positive": "Positive",
        "negative": "Negative",
        "neutral": "Neutral"
    }
    return display_names.get(label.lower(), "Unknown sentiment")


In [35]:
# Three hand-written examples, one per expected sentiment class.
text1 = "I love this product, it's amazing!"
text2 = "This is the worst experience I've ever had."
text3 = "It's okay, not too bad but not great either."

for sample in (text1, text2, text3):
    print(f"Text: {sample}\nSentiment: {predict_sentiment(sample, rfc, vectorization)}\n")


Text: I love this product, it's amazing!
Sentiment: Positive

Text: This is the worst experience I've ever had.
Sentiment: Negative

Text: It's okay, not too bad but not great either.
Sentiment: Neutral



In [None]:
# NOTE(review): this re-defines predict_sentiment with the same logic as the
# definition in the earlier cell; running this cell replaces that definition.
def predict_sentiment(text, model, vectorizer):
    """
    Predict the sentiment of a given text string.

    Parameters:
    - text: The input text string; it is cleaned with wp() before use.
    - model: The trained machine learning model (its predict() is called).
    - vectorizer: The fitted TfidfVectorizer (its transform() is called).

    Returns:
    - "Positive", "Negative", or "Neutral" when the lower-cased prediction is
      one of those labels; "Unknown sentiment" for any other prediction.
    """
    labels_for_display = {
        "positive": "Positive",
        "negative": "Negative",
        "neutral": "Neutral"
    }

    cleaned = wp(text)
    # The vectorizer is given a one-element batch, so the first prediction is
    # the only one.
    predicted = model.predict(vectorizer.transform([cleaned]))[0]

    return labels_for_display.get(predicted.lower(), "Unknown sentiment")


# Interactive prompt: classify each entered line until the user types 'exit'.
while True:
    try:
        user_input = input("Enter a text string (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the kernel was interrupted: leave the loop
        # instead of ending the cell with a traceback.
        print()
        break

    # Whitespace was stripped above, so "exit " and " EXIT" also quit.
    if user_input.lower() == 'exit':
        break

    if not user_input:
        # Nothing to classify; ask again.
        continue

    sentiment = predict_sentiment(user_input, rfc, vectorization)

    print(f"Sentiment: {sentiment}\n")


Enter a text string (or type 'exit' to quit): I love my class
Sentiment: Positive

Enter a text string (or type 'exit' to quit): i hate my class
Sentiment: Negative

Enter a text string (or type 'exit' to quit): my class is okay
Sentiment: Neutral

