In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Check if text is a string before applying lower()
    if isinstance(text, str):
        text = text.lower().replace('[^\w\s]', '')
        return ' '.join([word for word in text.split() if word not in stop_words])
    # Handle non-string values (e.g., NaN)
    else:
        return ''  # Or any other suitable handling for non-string values

df = pd.read_csv('sentiment_data.csv')
df = df[['text', 'sentiment']]  # Ensure columns 'text' and 'sentiment'
df['cleaned_text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df.head()

Unnamed: 0,text,sentiment,cleaned_text
0,im getting on borderlands and i will murder yo...,Positive,"im getting borderlands murder ,"
1,I am coming to the borders and I will kill you...,Positive,"coming borders kill all,"
2,im getting on borderlands and i will kill you ...,Positive,"im getting borderlands kill all,"
3,im coming on borderlands and i will murder you...,Positive,"im coming borderlands murder all,"
4,im getting on borderlands 2 and i will murder ...,Positive,"im getting borderlands 2 murder all,"


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['sentiment'], test_size=0.2, random_state=42)


In [4]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

model = MultinomialNB()
model.fit(X_train_vec, y_train)

predictions = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, predictions))


Accuracy: 0.7551717212291624


In [6]:
X_test['predicted_sentiment'] = predictions
X_test.to_csv('predicted_sentiments.csv', index=False)
