<a href="https://colab.research.google.com/github/stepthom/NLP_course/blob/main/document_classification/imdb_classification_simple_hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification Slides - IMDB - Hybrid
Predict the rating of a review, given its text.

- Stephen W. Thomas
- Used for MMAI 891 and MMA/GMMA 865.
- Simple version
- Hybrid approach: deep document embeddings, shallow ML

# Install Packages

In [1]:
pip install sentence-transformers



In [7]:
import pandas as pd

# Location of the IMDB review CSV that this notebook reads.
DATA_URL = "https://raw.githubusercontent.com/stepthom/NLP_course/main/data/imdb.small.csv"
# Number of leading rows kept (presumably to keep the demo quick -- confirm).
N_ROWS = 1000

# Read the CSV, keep the first N_ROWS rows, then summarise columns and dtypes.
df = pd.read_csv(DATA_URL).head(N_ROWS)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1000 non-null   int64 
 1   score   1000 non-null   int64 
 2   rating  1000 non-null   object
 3   en      1000 non-null   object
dtypes: int64(2), object(2)
memory usage: 31.4+ KB


In [8]:
# Preview the first five rows of the loaded reviews.
df.head(n=5)

Unnamed: 0,id,score,rating,en
0,5998,1,negative,The basic formula for the original series was...
1,4615,1,negative,I may not have the longest of attention-spans...
2,8429,3,negative,"""Disappointing"" is the best word I could thin..."
3,10453,10,positive,"While traveling by train through Europe, the ..."
4,6941,1,negative,"This movie is not only boring, it is also rea..."


In [9]:
# Transformers needs an integer class label, not a string:
# 0 when the rating contains 'negative', 1 otherwise.
df['label'] = df['rating'].apply(lambda x: 0 if 'negative' in x else 1)

# Use sentence-transformers to get embeddings for all documents

In [10]:
from sklearn.model_selection import train_test_split

# Inputs are the review texts (`en`); targets are the string sentiment
# labels from the `rating` column.
X, y = df['en'], df['rating']

In [11]:
from sentence_transformers import SentenceTransformer, LoggingHandler

# Load the pretrained sentence-embedding model by name.
embedder = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

# Encode every document into one dense vector. A plain list of strings is
# passed instead of the pandas Series so that any positional indexing done
# inside `encode` cannot be confused with the Series' index labels (the two
# only coincide while the index is the default 0..n-1 range).
X_embeddings = embedder.encode(X.tolist(), convert_to_numpy=True)

In [18]:
# Display the embedding matrix returned by `embedder.encode` (a NumPy array,
# since convert_to_numpy=True was requested).
X_embeddings

array([[-0.01832552,  0.0092653 ,  0.01703419, ...,  0.05160045,
        -0.05342208,  0.02237599],
       [ 0.04297066,  0.04379047,  0.0026869 , ...,  0.04601925,
         0.00802376,  0.00879848],
       [-0.06287102,  0.06738323,  0.01197166, ...,  0.0332901 ,
         0.0462017 , -0.01272006],
       ...,
       [-0.02681036, -0.01677864,  0.00085934, ...,  0.06568431,
        -0.01436954,  0.02100147],
       [-0.00955857,  0.02377558, -0.00338542, ..., -0.01840511,
         0.03451698, -0.01472617],
       [ 0.060069  , -0.02505747, -0.01127515, ...,  0.0141512 ,
         0.03746534, -0.01229963]], dtype=float32)

In [20]:
# Dimensions of the embedding matrix: (number of documents, embedding size).
X_embeddings.shape

(1000, 768)

# Split Data

In [12]:
# Hold out 20% of the embedded documents for evaluation; the fixed seed keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each feature matrix.
for split in (X_train, X_test):
    print(split.shape)

(800, 768)
(200, 768)


# Train Model

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples and random feature
# subsets), so the seed is fixed -- matching the one used for the train/test
# split -- to make the reported metrics reproducible across runs.
estimator = RandomForestClassifier(random_state=42)
estimator.fit(X_train, y_train)

RandomForestClassifier()

# Eval Model

In [14]:
# Predict a label for every held-out document.
y_test_pred = estimator.predict(X_test)

In [15]:
from sklearn.metrics.cluster import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.metrics import f1_score, classification_report

# Per-class precision / recall / F1 on the held-out set.
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.85      0.84      0.84        94
    positive       0.86      0.87      0.86       106

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200



In [16]:
from sklearn.metrics.cluster import adjusted_mutual_info_score, adjusted_rand_score

# Chance-adjusted agreement between the true and predicted labels, computed
# with the adjusted Rand index and adjusted mutual information.
adjusted_rand, adjusted_mutual_info = (
    metric(y_test, y_test_pred)
    for metric in (adjusted_rand_score, adjusted_mutual_info_score)
)

for template, value in (
    ("ARI: {}", adjusted_rand),
    ("AMI: {}", adjusted_mutual_info),
):
    print(template.format(value))
    

ARI: 0.5016079697231037
AMI: 0.39941408171792253
