In [1]:
!pip3 install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m722.4 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0


In [9]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import re
import string
import joblib 
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the Sentiment140 dataset
data = pd.read_csv('../Sentiment140.csv', encoding='latin-1', usecols=[0, 5], names=['target', 'text'])
data['target'] = data['target'].map({0: 0, 4: 1})  # Convert to binary (0: Negative, 1: Positive)

In [4]:
# Function to clean text
def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text.lower()  # Convert to lowercase

In [5]:
# Clean the text data
data['text'] = data['text'].apply(clean_text)

In [6]:
# Split data into features and target
X = data['text']
y = data['target']

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2, random_state=42)

In [10]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust max_features as needed

In [11]:
# Fit and transform the training data, and transform the testing data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
# Create and train the LightGBM model
model = lgb.LGBMClassifier()
model.fit(X_train_tfidf, y_train)


[LightGBM] [Info] Number of positive: 639494, number of negative: 640506
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 22.046040 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 312049
[LightGBM] [Info] Number of data points in the train set: 1280000, number of used features: 9996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499605 -> initscore=-0.001581
[LightGBM] [Info] Start training from score -0.001581


In [13]:
# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

In [19]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print(classification_report(y_test, y_pred))

Accuracy: 76.01%
              precision    recall  f1-score   support

           0       0.78      0.72      0.75    159494
           1       0.74      0.80      0.77    160506

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000



In [15]:
# Save the model and vectorizer
joblib.dump(model, 'model_LGBM.pkl')
joblib.dump(vectorizer, 'vectorizer_LGBM.pkl')


['tfidf_vectorizer.pkl']

In [27]:
import joblib
import re
import string

# Function to clean the input text
def clean_text(text):
    # Removing URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Removing mentions and hashtags
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text

# Load the saved model and vectorizer
model = joblib.load('model_LGBM.pkl')
vectorizer = joblib.load('vectorizer_LGBM.pkl')

# Function to predict sentiment for new statements
def predict_sentiment(text):
    # Clean and transform the input text using the loaded TF-IDF vectorizer
    clean_input_text = clean_text(text)
    text_vectorized = vectorizer.transform([clean_input_text])
    
    # Predict using the saved LightGBM model
    prediction = model.predict(text_vectorized)
    
    return 'Positive' if prediction[0] == 1 else 'Negative'

# Example usage
if __name__ == '__main__':
    test_text = "I love this product"
    predicted_sentiment = predict_sentiment(test_text)
    print(predicted_sentiment)


Positive
