In [7]:
import pandas as pd
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
df = pd.read_csv('/content/tweets.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [11]:
X = df['tweet']
y = df['label']

In [12]:
#Text Preprocessing

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(X):
    tokens = word_tokenize(X)

    tokens = [word.lower() for word in tokens if word.isalpha()]  # remove punctuation

    tokens = [word for word in tokens if word not in stopwords.words('english')]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

In [13]:
# Apply Preprocessing

processed_tweets = [preprocess_text(tweet) for tweet in X]

print(processed_tweets[:3])  # Preview the cleaned reviews

['fingerprint pregnancy test http android apps beautiful cute health igers iphoneonly iphonesia iphone', 'finally transparant silicon case thanks uncle yay sony xperia http', 'love would go talk makememories unplug relax iphone smartphone wifi connect http']


In [14]:
# Vectorization + ML Model

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Vectorize
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_tweets)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(accuracy_score(y_test, y_pred))

0.8665824915824916


In [20]:
# Prediction
def predict_sentiment(tweet):
    clean = preprocess_text(tweet)
    vec = vectorizer.transform([clean])
    pred = model.predict(vec)
    return "Positive 😊" if pred[0] == 0 else "Negative 😞"

In [21]:
print(predict_sentiment("Iphone,16 camera is really amazing!"))
print(predict_sentiment("I am not buying I phone 15, because price is too high."))

Positive 😊
Negative 😞
