# Custom text classification using OCI Language Service Endpoint

This notebook demonstrates how to call the batch text classification API to classify text with a custom text classification model from OCI Language.

In [1]:
import time
import oci
import pandas as pd
import math
import datetime

from typing import Any, Dict, List, Union
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


## Initialize OCI AI Client

Make sure you have set up the config file by following the steps described in<br>
[OCI Language Service Live Lab Task 1](https://apexapps.oracle.com/pls/apex/dbpm/r/livelabs/run-workshop?p210_wid=887&p210_wec=&session=108183149172107)

In [2]:
# Build the OCI Language client from the 'AISERVICESPM' profile of the OCI config file.
ai_client = oci.ai_language.AIServiceLanguageClient(oci.config.from_file(profile_name='AISERVICESPM'))
# Timeout set on the client's base client (presumably seconds - confirm against the OCI SDK docs).
ai_client.base_client.timeout = 30
# Seconds passed to time.sleep() after every batch call.
wait_between_batch = 0
# Seconds passed to time.sleep() after a failed batch call, before the next attempt.
wait_between_retries = 5

Read the input dataset

In [3]:
# Load only the first 20 rows of the input CSV; later cells read its `text` and `labels` columns.
df = pd.read_csv('~/Downloads/35_rep_data_feb_1.csv', nrows=20)

### split the dataset into batches
The OCI endpoint can process up to 500 characters per second. The OCI Language batch API accepts at most 100 documents and 20k characters per request.

In [4]:
# Maximum number of rows sent in a single batch call.
max_rec_for_batch_call = 100
# Character budget used when grouping rows into batches.
max_tex_per_batch = 18000
# OCID of the model endpoint that the classification requests are sent to.
model_endpoint = 'ocid1.ailanguageendpoint.oc1.phx.amaaaaaa3nkmftyafnulvuggyditrznsudni26b2csubumikicbunfmz6h6a'

In [5]:
def process_and_upadate_batch(df):
    """Classify one batch of rows with the custom OCI Language endpoint.

    One ``TextDocument`` is built per row of ``df`` (keyed by the row's index
    label, text taken from the ``text`` column) and all of them are sent in a
    single batch classification request. A failed request is retried until
    three attempts have been made, sleeping ``wait_between_retries`` seconds
    after each failure.

    Args:
        df: DataFrame slice with a ``text`` column. Its index labels must
            survive ``int(str(label))`` because they are used as document keys.

    Returns:
        A tuple ``(index, prediction, confidence)`` of equal-length lists.
        On success ``index`` holds the index labels of the documents the
        service returned, ``prediction`` the '|'-joined labels of each
        document and ``confidence`` the '|'-joined scores. If every attempt
        fails, ``index`` holds every index label of ``df`` and the other two
        lists contain only ``None``, so the caller can still assign them.
    """
    documents = [
        oci.ai_language.models.TextDocument(key=str(label), text=row['text'])
        for label, row in df.iterrows()
    ]

    # Fallback result used when no attempt succeeds; without it `index` would
    # be unbound at the return statement.
    index = list(df.index)
    prediction = [None] * len(documents)
    confidence = [None] * len(documents)

    classificaton_details = oci.ai_language.models.BatchDetectLanguageTextClassificationDetails(
        endpoint_id=model_endpoint, documents=documents)

    max_retry_count = 3
    retry_count = 0
    success = False

    while retry_count < max_retry_count and not success:
        try:
            start_time = datetime.datetime.now()
            output = ai_client.batch_detect_language_text_classification(classificaton_details)
            end_time = datetime.datetime.now()
            print(f'{datetime.datetime.now()} processing of {len(documents)} records, total chars: {df.text.str.len().sum()} took :{end_time-start_time}')

            # Parse into temporaries first so a parsing error cannot leave the
            # three result lists with different lengths.
            classified = output.data.documents
            returned_index = [int(d.key) for d in classified]
            predicted_labels = ['|'.join([c.label for c in d.text_classification]) for d in classified]
            predicted_conf = ['|'.join([str(c.score) for c in d.text_classification]) for d in classified]

            # Documents the service could not classify are absent from the response.
            if len(df) != len(predicted_labels):
                print(f'{datetime.datetime.now()} failed inference for {len(df)-len(predicted_labels)} records')

            index, prediction, confidence = returned_index, predicted_labels, predicted_conf
            success = True
        except oci.exceptions.ServiceError as e:
            print(f'{datetime.datetime.now()} Unable to process these records {df.index.min()}: {df.index.max()}. Retrying {retry_count} time')
            if retry_count == 0:
                print(f'Error details:{e}')
            time.sleep(wait_between_retries)
        except Exception as e:
            # Covers oci.exceptions.ClientError as well as any unexpected error.
            print(f'{datetime.datetime.now()} Error occurred while processing records {df.index.min()}: {df.index.max()}. Retrying {retry_count} time')
            if retry_count == 0:
                print(f'Error details:{e}')
            time.sleep(wait_between_retries)
        finally:
            retry_count = retry_count + 1

    if not success:
        print(f'{datetime.datetime.now()} giving up on records {df.index.min()}: {df.index.max()} after {max_retry_count} attempts')

    return index, prediction, confidence
    

In [6]:
def process_and_update_slice(df):
    """Classify every row of ``df`` batch by batch and store the results in ``df``.

    Rows are grouped by the running total of ``text`` lengths divided by
    ``max_tex_per_batch`` (floored). Each group is then cut into chunks of at
    most ``max_rec_for_batch_call`` rows. Every chunk is passed to
    ``process_and_upadate_batch`` and the returned labels and scores are
    written to the ``predicted`` and ``confidence`` columns at the returned
    index labels. ``df`` is modified in place; nothing is returned.
    """
    running_chars = df.text.str.len().cumsum()
    batch_ids = (running_chars/max_tex_per_batch).apply(math.floor)

    for _, group in df.groupby(batch_ids):
        total_rows = group.shape[0]
        row_start = 0
        while row_start < total_rows:
            row_end = row_start + max_rec_for_batch_call
            rows = group[row_start:row_end]

            print(f'{datetime.datetime.now()} processing rows:{rows.index.min()}:{rows.index.max()}')

            index, prediction, confidence = process_and_upadate_batch(rows)

            # Write the results back at the index labels the batch call reported.
            df.loc[index,'predicted'] = prediction
            df.loc[index,'confidence'] = confidence

            row_start = row_end
            time.sleep(wait_between_batch)

    print(f'{datetime.datetime.now()} completed processing {len(df)} rows')

Predicting classes

In [8]:
# Start with empty result columns; process_and_update_slice fills `predicted` and `confidence` in place.
df['predicted']=None
df['confidence'] = None
# NOTE(review): `missed` and `extra` are later computed on predicted_df, not on df - confirm they are needed here.
df['missed'] = None
df['extra'] = None
process_and_update_slice(df)

2023-03-15 18:29:11.505232 processing rows:0:19
2023-03-15 18:29:12.998144 processing of 20 records, total chars: 12593 took :0:00:01.490943
2023-03-15 18:29:13.001266 completed processing 20 rows


In [9]:
# Ignore failed inferences (could be due to a wrong encoding format - TBD: investigate further).
# .copy() gives an independent frame, so the columns added to it later do not touch df.
predicted_df = df.dropna(subset=['predicted', 'confidence']).copy()

In [10]:
# Split the '|'-joined predicted labels into a sorted list per row.
y_pred = predicted_df.predicted.str.split('|').apply(sorted)

In [11]:
# Split the '|'-joined ground-truth labels into a sorted list per row.
y_true = predicted_df.labels.str.split('|').apply(sorted)

In [12]:
# Flag rows whose sorted predicted label list equals the sorted true label list.
predicted_df['AllCorrect'] = (y_pred==y_true)

In [13]:
def missed_preds(row):
    """Return the sorted true labels that were not predicted for ``row``.

    ``row.labels`` and ``row.predicted`` are '|'-separated label strings.
    A ``None`` or empty ``row.predicted`` counts as no predictions at all.
    """
    predicted_text = row.predicted
    has_prediction = predicted_text is not None and len(predicted_text) > 0
    predicted_labels = predicted_text.split('|') if has_prediction else []

    expected = set(row.labels.split('|'))
    return sorted(expected - set(predicted_labels))


def extra_preds(row):
    """Return the sorted predicted labels that are not true labels of ``row``.

    ``row.labels`` and ``row.predicted`` are '|'-separated label strings.
    A ``None`` or empty ``row.predicted`` yields an empty list.
    """
    raw_prediction = row.predicted
    if raw_prediction is None or len(raw_prediction) == 0:
        predicted_labels = []
    else:
        predicted_labels = raw_prediction.split('|')

    true_labels = row.labels.split('|')
    surplus = [label for label in set(predicted_labels) if label not in true_labels]
    surplus.sort()
    return surplus

# Per row: true labels the model did not predict, and predicted labels that are not true labels.
predicted_df['missed'] = predicted_df.apply(missed_preds, axis=1)
predicted_df['extra'] = predicted_df.apply(extra_preds, axis=1)

In [14]:
# Save the per-row results (without the index column) to output.csv in the working directory.
predicted_df.to_csv('output.csv', index=False)

# Code to prepare class metrics

In [22]:
import itertools
# Collect every distinct label found in the predictions or the ground truth, skipping None/empty lists.
labels = set(itertools.chain(*[i for i in y_pred.values if i is not None and len(i)>0])).union(itertools.chain(*[i for i in y_true.values if i is not None and len(i)>0]))

In [23]:
#if '' in labels:
#    labels.remove('')
# Turn the set into a sorted list so the class order is fixed for the binarizer and the metrics table.
labels = sorted(list(labels))

In [24]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
# One indicator column per entry of `labels`, in that order.
binarizer = MultiLabelBinarizer(classes=labels)

In [25]:
# Fit the binarizer and encode the true label lists as a 0/1 indicator matrix.
y_true_transformed = binarizer.fit_transform(y_true)

In [26]:
#y_pred_mask = y_pred.notnull()

# Encode the predicted label lists with the already fitted binarizer.
y_pred_transformed_arr = binarizer.transform(y_pred)

In [27]:
# Same transform as y_pred_transformed_arr, stored under a second name.
y_pred_transformed = binarizer.transform(y_pred)
# NOTE(review): this rebinds `confusion_matrix`, shadowing the function imported from sklearn.metrics at the top.
confusion_matrix = multilabel_confusion_matrix(y_true=y_true_transformed, y_pred=y_pred_transformed)#, labels=labels)

In [29]:
# Spot check: true/predicted label lists and their encoded vectors for the entry at 1.
y_true[1],y_true_transformed[1], y_pred[1], y_pred_transformed_arr[1]

(['OPERATIONS'],
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 ['OPERATIONS', 'BAGGAGE HANDLING'],
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]))

In [30]:
import numpy as np
# Name of the first class flagged in the encoded prediction at position 4.
labels[np.where(y_pred_transformed[4])[0][0]]

'FUTURE TRAVEL QUESTIONS'

In [31]:
# multilabel_confusion_matrix returns one 2x2 matrix per class laid out as
# [[TN, FP], [FN, TP]], so flattening it row by row yields TN, FP, FN, TP.
vals = [cm.reshape(-1) for cm in confusion_matrix]
class_metrics = pd.DataFrame(columns=['TN', 'FP', 'FN', 'TP'], index=labels, data=vals)
# Keep the original column order of the exported CSV.
class_metrics = class_metrics[['TN', 'FP', 'TP', 'FN']]
class_metrics.index.name='class'
# A zero denominator (class never predicted / never present) gives NaN rather than an error.
class_metrics['Precision'] = class_metrics['TP']/(class_metrics.TP+class_metrics.FP)
class_metrics['Recall'] = class_metrics['TP']/(class_metrics.TP+class_metrics.FN)
class_metrics['F1'] = 2*class_metrics['Precision']*class_metrics['Recall']/(class_metrics['Precision']+class_metrics['Recall'])
class_metrics.sort_index().to_csv('~/Downloads/35_rep_data_feb_1_class_metrics.csv')

In [46]:
# Also write the per-class metrics (with the class index) to the working directory.
class_metrics.to_csv('classmetrics.csv')

In [199]:
# Save the full input frame with its result columns.
# NOTE(review): this writes df, whose `missed`/`extra` columns were only initialised to None; the computed values live in predicted_df - confirm which frame should be exported.
df.to_csv('~/Downloads/35_rep_data_feb_1_prediction_results.csv', index=False)