In [13]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Load the dataset
data = pd.read_csv('sentiment_dataset.csv', encoding='latin-1', header=None)
data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Filter necessary columns
data = data[['target', 'text']]
data['target'] = data['target'].map({0: 'negative', 4: 'positive'})

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @ mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Remove hashtags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercasees
    return text

# Apply preprocessing
    text = re.sub(r'\s+', ' ', text)  # Remove extra spac
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Display sample data
print(data.head())


     target                                               text  \
0  negative  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  negative  is upset that he can't update his Facebook by ...   
2  negative  @Kenichan I dived many times for the ball. Man...   
3  negative    my whole body feels itchy and like its on fire    
4  negative  @nationwideclass no, it's not behaving at all....   

                                        cleaned_text  
0    - a that's a bummer.  you shoulda got david ...  
1  is upset that he can't update his facebook by ...  
2   i dived many times for the ball. managed to s...  
3    my whole body feels itchy and like its on fire   
4   no, it's not behaving at all. i'm mad. why am...  


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the dataset
X = data['cleaned_text']
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.79048125
              precision    recall  f1-score   support

    negative       0.80      0.78      0.79    159494
    positive       0.78      0.80      0.79    160506

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000



In [9]:
import joblib

joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']