In [22]:
# import nltk

# # Download the stopwords corpus
# nltk.download('stopwords')

# # Now you can safely import and use stopwords
# from nltk.corpus import stopwords

In [23]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

# Load dataset
train_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\train.csv",encoding='ISO-8859-1')
test_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\test.csv",encoding='ISO-8859-1')

# Preprocess data
def preprocess_text(text):
    tokenizer = TweetTokenizer()
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))

    tokens = tokenizer.tokenize(text)
    stemmed = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stemmed)

# Ensure text columns are of type string
train_df['text'] = train_df['text'].astype(str)
test_df['text'] = test_df['text'].astype(str)

# Now apply your preprocessing function
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectors = vectorizer.fit_transform(train_df['cleaned_text'])

# Model
model = LogisticRegression()
model.fit(X_train_vectors, train_df['sentiment'])

In [25]:
X_test_vectors = vectorizer.transform(test_df['cleaned_text'])
predictions = model.predict(X_test_vectors)

# Evaluation
print("Accuracy:", accuracy_score(test_df['sentiment'], predictions))

Accuracy: 0.6975099037917374


In [28]:
import streamlit as st

st.title('Sentiment Analysis of Tweets')

user_input = st.text_area("Enter Tweet Text")

if st.button('Predict Sentiment'):
    processed_input = preprocess_text(user_input)
    input_vector = vectorizer.transform([processed_input])
    prediction = model.predict(input_vector)
    sentiment = 'Positive' if prediction == 4 else 'Negative' if prediction == 0 else 'Neutral'
    st.write('Sentiment:', sentiment)

In [29]:
train_df.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,"i ` respond , i go"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative,noon,21-30,Albania,sooo sad i miss san diego ! ! !
2,088c60f138,my boss is bullying me...,negative,night,31-45,Algeria,boss bulli ...
3,9642c003ef,what interview! leave me alone,negative,morning,46-60,Andorra,interview ! leav alon
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative,noon,60-70,Angola,"son * * * , ` put releas alreadi bought"


In [30]:
train_df.isnull().sum()

textID           0
text             0
sentiment        0
Time of Tweet    0
Age of User      0
Country          0
cleaned_text     0
dtype: int64