In [1]:
!pip install nltk #installing nltk



In [2]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.corpus import stopwords
import string


In [None]:
# Mount Google Drive at /content/drive (google.colab is only available inside Colab).
# NOTE(review): the CSV loaded below is read from /content/, not from the mounted
# drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the labelled reviews; header=0 takes the column names from the first row of the CSV.
df = pd.read_csv('/content/reviews_with_sentiment.csv',header=0)
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Review,Sentiment
0,I love this product! It exceeded my expectations.,Positive
1,This is the worst purchase I've ever made.,Negative
2,Absolutely fantastic! Highly recommended.,Positive
3,I'm very disappointed with the quality.,Negative
4,"It's okay, not great but not terrible either.",Positive


### data cleaning

In [None]:
# Check the DataFrame dimensions, reported as (rows, columns)
df.shape

(1000, 2)

In [None]:
# List the column names available in the loaded DataFrame
print(df.columns)


Index(['Review', 'Sentiment'], dtype='object')


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Download the necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases (3.8.2 and later) load the Punkt tokenizer from the 'punkt_tab'
# resource instead of the pickled 'punkt' models, and nltk.word_tokenize raises a
# LookupError when it is missing. NLTK is installed unpinned above, so fetch both;
# on older releases the extra download is harmless.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the DataFrame
df = pd.read_csv('/content/reviews_with_sentiment.csv',header=0)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Verify the columns of the DataFrame before preprocessing
print(df.columns)

Index(['Review', 'Sentiment'], dtype='object')


In [None]:
# 2. Preprocessing Function
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise one review into a space-joined string of lowercase content tokens.

    Steps:
      1. Tokenize with ``nltk.word_tokenize``.
      2. Keep only tokens for which ``str.isalnum()`` is true, lowercased. This drops
         punctuation as well as any token containing an apostrophe or a hyphen.
      3. Remove NLTK English stop words.

    Args:
        text: Raw review text. A non-string value (for example the NaN that pandas
            produces for an empty CSV cell) is treated as an empty review.

    Returns:
        The remaining tokens joined by single spaces; ``''`` if nothing remains.
    """
    # Empty CSV cells arrive as float NaN, which the tokenizer cannot handle.
    if not isinstance(text, str):
        return ''

    # Tokenize, remove punctuation, and lowercase
    tokens = [word.lower() for word in nltk.word_tokenize(text) if word.isalnum()]

    # Remove stop words. The set is cached because this function runs once per row and
    # re-reading the corpus on every call is wasted work.
    # NOTE(review): the English stop list appears to include negations such as "not";
    # removing them may hurt sentiment classification — confirm this is intended.
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # Optionally, apply stemming/lemmatization (not done here)

    return ' '.join(tokens)

# Apply preprocessing: store the cleaned text alongside the raw review
df['processed_reviews'] = df['Review'].apply(preprocess_text)

# 3. Split the Data into Training and Testing Sets (80/20, fixed seed)
# NOTE(review): the split is not stratified although the recorded report shows very few
# Neutral samples — consider stratify=y.
X, y = df['processed_reviews'], df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Build a Pipeline for TF-IDF and SVM
pipelineSVM = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),  # text -> TF-IDF features
        ('svm', SVC()),                # support-vector classifier
    ]
)

# 5. Hyperparameter Tuning with GridSearchCV (Optional)
# 4 max_df values x 2 n-gram ranges x 3 C values x 2 kernels = 48 candidates.
param_grid = dict(
    tfidf__max_df=[0.7, 0.8, 0.9, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    svm__C=[0.1, 1, 10],
    svm__kernel=['linear', 'rbf'],
)

grid_search = GridSearchCV(
    estimator=pipelineSVM,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Get the best model (the pipeline refitted with the winning parameters)
best_model = grid_search.best_estimator_

# 6. Evaluate the Model on the held-out test set
# NOTE(review): a perfect score can indicate the same review text appearing in both
# train and test — check the dataset for duplicates.
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        75
     Neutral       1.00      1.00      1.00         9
    Positive       1.00      1.00      1.00       116

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [None]:
from sklearn.linear_model import LogisticRegression

# Build a Pipeline for TF-IDF and Logistic Regression
pipelineLR = Pipeline([
    ('tfidf', TfidfVectorizer()),    # TF-IDF feature extraction
    ('logreg', LogisticRegression()) # Logistic Regression model
])

# Train the model
# Fit pipelineLR itself. The bare name `pipeline` is only created by the Random Forest
# cell further down, so on a fresh kernel it is undefined here, and after out-of-order
# execution it would report another model's metrics under the "Logistic Regression" heading.
pipelineLR.fit(X_train, y_train)

# Evaluate the model on the held-out test set
y_pred = pipelineLR.predict(X_test)
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Logistic Regression
Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        75
     Neutral       1.00      0.56      0.71         9
    Positive       0.97      1.00      0.98       116

    accuracy                           0.98       200
   macro avg       0.99      0.85      0.90       200
weighted avg       0.98      0.98      0.98       200



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a Pipeline for TF-IDF and Random Forest
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),    # TF-IDF feature extraction
    # Random forests are stochastic (bootstrap samples and feature subsets); fix the seed,
    # using the same value as the train/test split, so reruns reproduce the same metrics.
    ('rf', RandomForestClassifier(random_state=42)) # Random Forest model
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out test set
y_pred = pipeline.predict(X_test)
print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Random Forest
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        75
     Neutral       1.00      1.00      1.00         9
    Positive       1.00      1.00      1.00       116

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [None]:
#save model as a pickle file

filename = 'sentimental_analysis.pkl'
# Persist best_model, the pipeline refitted by the grid search with the winning
# parameters. pipelineSVM is only the template handed to GridSearchCV, which fits clones
# of it; pipelineSVM itself is never fitted, so a pickle of it could not predict once loaded.
# The with-block closes the file handle, so the pickle is fully flushed to disk.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)