<a href="https://colab.research.google.com/github/sahanyafernando/My_NLP_Learning/blob/main/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demonstration : Text Classification

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

### Step 1: Loading the Dataset

In [6]:
train_df = pd.read_csv("sentiment_analysis.csv", encoding='latin-1').sample(10, random_state=42)
test_df = pd.read_csv("sentiment_analysis.csv", encoding='latin-1')

In [7]:
train_df.head()

Unnamed: 0,review,sentiment
9,I am worried about the second wave,Negative
11,Recovered patients are increasing,Positive
0,"I loved the movie, it was fantastic!",positive
13,Mask wearing is mandatory in public places,Neutral
5,Covid cases are rising rapidly,Negative


### Selecting relevant columns

In [10]:
train_df = train_df[['review', 'sentiment']]
test_df = test_df[['review', 'sentiment']]

### Standardizing labels

In [11]:
def standardize_sentiment(Sentiment):
    if Sentiment in ['Positive', 'Extremely Positive']:
        return 1
    elif Sentiment in ['Negative', 'Extremely Negative']:
        return 0
    else:
        return 2 # Neutral

In [12]:
train_df['sentiment'] = train_df['sentiment'].apply(standardize_sentiment)
test_df['sentiment'] = test_df['sentiment'].apply(standardize_sentiment)
print("Dataset Loaded:")
print(train_df.head())

Dataset Loaded:
                                        review  sentiment
9           I am worried about the second wave          0
11           Recovered patients are increasing          1
0         I loved the movie, it was fantastic!          2
13  Mask wearing is mandatory in public places          2
5               Covid cases are rising rapidly          0


### Step 2: Preprocessing text(lowercasing, punctuation removal)

In [13]:
def preprocess_text(text):
    text = text.lower() # Lowercasing
    text = re.sub(r'[^a-z0-9\s]', '', text) # Punctuation & special characters removal
    return text

train_df['review'] = train_df['review'].apply(preprocess_text)
test_df['review'] = test_df['review'].apply(preprocess_text)

### Step 3: Feature extraction using TF-IDF

In [14]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train = vectorizer.fit_transform(train_df['review'])
X_test = vectorizer.transform(test_df['review'])
Y_train = train_df['sentiment']
Y_test = test_df['sentiment']


In [15]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 36 stored elements and shape (10, 34)>

### Step 4: Training three models(Naive Bayes, SVM, Decision Tree)


In [29]:
def train_and_evaluate(model, name):
  print(f"\nTraining {name}...")
  model.fit(X_train, Y_train)
  Y_pred = model.predict(X_test)
  accuracy = accuracy_score(Y_test, Y_pred)
  print(f"\n{name} Performance:")
  print(f"Accuracy: {accuracy:.4f}")
  print(classification_report(Y_test, Y_pred))

  models = {
      "Naive Bayes": MultinomialNB(alpha=0.5),
      "Support Vector Machine": SVC(kernel='linear', C=0.5, probability=False), # Avoid extra computation from probability extimation
      "Decision Tree": DecisionTreeClassifier(max_depth=5)
  }


Step 5: Evaluating model performance

In [30]:
for name, model in models.items():
  train_and_evaluate(model, name)





Training Naive Bayes...

Naive Bayes Performance:
Accuracy: 0.8000
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       1.00      0.75      0.86         4
           2       0.78      0.88      0.82         8

    accuracy                           0.80        15
   macro avg       0.81      0.76      0.78        15
weighted avg       0.81      0.80      0.80        15


Training Support Vector Machine...

Support Vector Machine Performance:
Accuracy: 0.5333
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         4
           2       0.53      1.00      0.70         8

    accuracy                           0.53        15
   macro avg       0.18      0.33      0.23        15
weighted avg       0.28      0.53      0.37        15


Training Decision Tree...

Decision Tree Performance:
Accuracy: 0.8000
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Step 6: Comparing models to identify the best one

In [31]:
best_model = max(models, key=lambda name: accuracy_score(Y_test, models[name].predict(X_test)))
print(f"\nBest Performing Models: {best_model}")


Best Performing Models: Naive Bayes
