In [13]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv("C:\\Users\\srave\\OneDrive\\Desktop\\twiter\\twitter.csv", encoding='latin-1')
df.columns = ['id', 'label', 'tweet']

# Preprocess the data
def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['clean_text'] = df['tweet'].apply(preprocess_text)

# Convert target labels to a classification problem (0: negative, 1: neutral, 2: positive)
df['label'] = df['label'].apply(lambda x: 0 if x == 0 else 2 if x == 4 else 1)

# Verify the unique labels
unique_labels = df['label'].unique()
print("Unique labels in the dataset:", unique_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Build and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Adjust target_names based on unique labels
target_names = []
if 0 in unique_labels:
    target_names.append('Negative')
if 1 in unique_labels:
    target_names.append('Neutral')
if 2 in unique_labels:
    target_names.append('Positive')

print(classification_report(y_test, y_pred, target_names=target_names))


Unique labels in the dataset: [0 1]
Accuracy: 0.95
              precision    recall  f1-score   support

    Negative       0.95      1.00      0.97      5937
     Neutral       0.93      0.32      0.48       456

    accuracy                           0.95      6393
   macro avg       0.94      0.66      0.73      6393
weighted avg       0.95      0.95      0.94      6393



In [7]:
df

Unnamed: 0,0,1,2
0,id,label,tweet
1,1,0,@user when a father is dysfunctional and is s...
2,2,0,@user @user thanks for #lyft credit i can't us...
3,3,0,bihday your majesty
4,4,0,#model i love u take with u all the time in ...
...,...,...,...
31958,31958,0,ate @user isz that youuu?Ã°ÂÂÂÃ°ÂÂÂÃ°ÂÂ...
31959,31959,0,to see nina turner on the airwaves trying to...
31960,31960,0,listening to sad songs on a monday morning otw...
31961,31961,1,"@user #sikh #temple vandalised in in #calgary,..."
