# Sentiment Classification

Dataset: https://www.kaggle.com/competitions/tweet-sentiment-extraction/data?select=train.csv

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib
import re

# Load the dataset
df = pd.read_csv('sentiment_detection_dataset.csv', encoding='latin1')

# Drop every row that has a missing value in any column.
df = df.dropna()

# Map the label column values
label_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Series.map() turns any value that is not a key of label_mapping into NaN
# without warning. Check up front so an unexpected label is reported here,
# by name, instead of surfacing later as NaN targets during training.
unmapped_labels = sorted(set(df['label'].unique()) - set(label_mapping), key=str)
if unmapped_labels:
    raise ValueError(
        f'Unexpected label values (not in label_mapping): {unmapped_labels}'
    )

# Apply the mapping
df['label'] = df['label'].map(label_mapping)

FileNotFoundError: [Errno 2] No such file or directory: 'sentiment_detection_dataset.csv'

In [None]:
# Define cleaning functions
def remove_urls(text):
    """Strip URLs from ``text``.

    A URL is a token that starts, at a word boundary, with ``http://``,
    ``https://`` or ``www.`` and runs up to the next whitespace character.
    Matching is case-insensitive, so ``HTTP://...`` and ``WWW....`` are
    removed as well.

    The scheme separator / dot and the word boundary are required so that
    ordinary words which merely contain ``www`` or start with ``http``
    (for example ``awwww`` or ``httpd``) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every URL replaced by the empty string. Surrounding
        whitespace is not modified.
    """
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

def remove_non_ascii(text):
    """Return ``text`` with every character outside the ASCII range dropped."""
    # Encoding with errors='ignore' discards whatever ASCII cannot represent
    # (code points >= 128); decoding turns the surviving bytes back into str.
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def remove_digits(text):
    """Return ``text`` with every run of digit characters deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_special_characters(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters (``\\w``: letters, digits and underscore) and whitespace
    are kept; punctuation and other symbols are removed.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def normalize_case(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

def clean_text(text):
    """Run the full cleaning sequence on one string and return the result.

    The steps run in this order: URL removal, non-ASCII removal, digit
    removal, special-character removal, lowercasing, and finally whitespace
    normalisation.
    """
    # URL removal comes first: the later punctuation step would otherwise
    # strip the ':', '/' and '.' characters out of a URL.
    cleaning_steps = (
        remove_urls,
        remove_non_ascii,
        remove_digits,
        remove_special_characters,
        normalize_case,
    )
    for step in cleaning_steps:
        text = step(text)

    # split() with no argument breaks on any whitespace run and discards
    # leading/trailing whitespace; re-joining yields single-space separation.
    tokens = text.split()
    return ' '.join(tokens)

# Run clean_text over every value of the 'comment' column and store the
# cleaned strings back in that same column.
df['comment'] = df['comment'].map(clean_text)

In [None]:
# Split dataset
# Features are the 'comment' column; targets are the 'label' column.
X, y = df['comment'], df['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Chain TF-IDF vectorisation (with English stop words removed) and a
# multinomial Naive Bayes classifier into a single estimator.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

# Fit on the training split, then predict labels for the held-out test split.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the fraction of test comments whose predicted label matches.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy}')

Model Accuracy: 0.6284570596797671


In [None]:
# Persist the pipeline object to disk so it can be reloaded later with
# joblib.load.
joblib.dump(value=pipeline, filename='sentiment_detection_model.pkl')

['sentiment_detection_model.pkl']

In [None]:
# Load the model from the file
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that you created yourself or otherwise trust.
model = joblib.load('sentiment_detection_model.pkl')

# Create a DataFrame with dummy values
data = {
    'comment': [
        'ayas stupid overacting',
        'khushis episode coming',
        'popatlal randwa marega',
        'apne mujhe bataa ya nhi ki apki beti bhi hai',
        'kya dimagggg hai popatlal',
        'arre oo yeh kya matlb kuch bhi family show hai yaar mat dikhao rre baba aisa kuch bhi',
        'tarak bhai mehta anjali ben mehta are really good pair',
        'if there were no girl there were no one in the world because a girl give birth to everyone',
        'popotlal ki shadi hogi rcb jab jitegi',
        'son is son till wife daughter is daughter till life',
        'khusi episodes coming',
        'i like taraak mehta ka ooltah chansma',
        'polar bhai ki shaadi jald hi jai',
        'Loved this product, will buy again.',
        'Claim your free vacation today!',
        'The quality of this product is amazing.',
        'You have been selected for a special offer.',
        'nice show'
    ]
}

new_data = pd.DataFrame(data)


# Clean the new comments with the same clean_text routine that was applied to
# the training data, so the text reaching the model at inference time is
# prepared exactly like the text it was fitted on (URL / non-ASCII / digit /
# punctuation removal, lowercasing, whitespace collapsing).
new_data['comment'] = new_data['comment'].apply(clean_text)

# Predict using the loaded model
new_predictions = model.predict(new_data['comment'])

# Add predictions to the new data
# NOTE(review): despite the column name, these are not spam flags. They are
# the sentiment class ids defined by label_mapping (0 = negative,
# 1 = neutral, 2 = positive). The name is kept unchanged in case other cells
# reference it; consider renaming it to 'sentiment_prediction'.
new_data['spam_prediction'] = new_predictions

print(new_data.head(20))

                                              comment  spam_prediction
0                              ayas stupid overacting                0
1                              khushis episode coming                1
2                              popatlal randwa marega                1
3        apne mujhe bataa ya nhi ki apki beti bhi hai                1
4                           kya dimagggg hai popatlal                1
5   arre oo yeh kya matlb kuch bhi family show hai...                1
6   tarak bhai mehta anjali ben mehta are really g...                2
7   if there were no girl there were no one in the...                2
8               popotlal ki shadi hogi rcb jab jitegi                1
9   son is son till wife daughter is daughter till...                2
10                              khusi episodes coming                1
11              i like taraak mehta ka ooltah chansma                1
12                   polar bhai ki shaadi jald hi jai                1
13    