[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1B5E4prkIXC6r3seFYViUyvMEfQVPoSYr)

# Sphere Transformers Class Day 1 (Pretrain) - MultiModal Model

This notebook combines tabular features with text embeddings of the reviews to build a multimodal model that predicts whether a reviewer recommends the product.


In [None]:
%pip install --quiet transformers datasets sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.2 MB/s[0m e

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics


## Get Data

In [None]:
# Raw CSV of the Women's Clothing E-Commerce Reviews dataset, hosted in the
# georgian-io Multimodal-Toolkit repository (spaces in the file name are %20-encoded).
dataset_url = (
    "https://raw.githubusercontent.com/georgian-io/Multimodal-Toolkit/master/datasets/"
    "Womens_Clothing_E-Commerce_Reviews/Womens%20Clothing%20E-Commerce%20Reviews.csv"
)

In [None]:
# Load the CSV (its first column becomes the index) and report how many rows came back.
dataset_csv = pd.read_csv(dataset_url, index_col=[0])
print(f"Raw dataset size: {len(dataset_csv)}")

# Discard every row that has at least one missing value, then renumber the rows from 0.
dataset_csv = dataset_csv.dropna().reset_index(drop=True)
print(f"None-NAN dataset size: {len(dataset_csv)}")

Raw dataset size: 23486
None-NAN dataset size: 19662


In [None]:
# Remove the two columns that are not used as features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it a second time is a no-op rather than a KeyError.
dataset_csv = dataset_csv.drop(columns=['Clothing ID', 'Rating'], errors='ignore')

In [None]:
# Preview the first six rows of the cleaned frame.
dataset_csv.head(6)

Unnamed: 0,Age,Title,Review Text,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,60,Some major design flaws,I had such high hopes for this dress and reall...,0,0,General,Dresses,Dresses
1,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",1,0,General Petite,Bottoms,Pants
2,47,Flattering shirt,This shirt is very flattering to all due to th...,1,6,General,Tops,Blouses
3,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",0,4,General,Dresses,Dresses
4,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,1,1,General Petite,Tops,Knits
5,39,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",1,4,General Petite,Tops,Knits


You may want to start with a smaller subset of the dataset (for example, the first 1,000 rows) so that experiments run quickly.

## Preprocess data for Modeling

In [None]:
from sklearn.model_selection import train_test_split
# Optional: uncomment the next line to work on only the first 1,000 rows for faster experiments.
#dataset_csv = dataset_csv[:1000]

# Split the target off the feature frame. pop() removes the column from dataset_csv in
# place, so guard it: re-running this cell after the column is already gone keeps the
# existing `y` instead of raising KeyError.
if 'Recommended IND' in dataset_csv.columns:
    y = dataset_csv.pop('Recommended IND')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_csv,
    y,
    test_size=0.2,
    random_state=43,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder

In [None]:
# Categorical columns: fill gaps with a sentinel label, then map each label to an integer code.
categorical_features = ['Division Name', 'Department Name' , 'Class Name']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # A label that was not present when the encoder was fitted (e.g. a rare class that
    # only lands in the test split, or the 'missing' sentinel) is encoded as -1 instead
    # of raising an error at predict time. Requires scikit-learn >= 0.24.
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

# Numeric columns: median imputation only; no scaling is applied.
numeric_features = ['Age','Positive Feedback Count']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
    ])

# Route each column group through its transformer.
# sparse_threshold=0.0 forces a dense output array; remainder='drop' discards every
# column not listed above (here, the free-text 'Title' and 'Review Text').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],sparse_threshold=0.0,remainder='drop')

## Model Training on Tabular Features

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Chain preprocessing and the classifier so both are fitted on training data only.
# The forest is seeded (same value as the train/test split) so that repeated runs of
# the notebook produce the same model and the same reported scores.
modelrf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier(random_state=43))])

In [None]:
# Fit the preprocessing + random-forest pipeline on the tabular training split.
modelrf.fit(X_train, y_train)

In [None]:
# Score with the predicted probability of the positive class (column 1, label 1)
# rather than hard 0/1 predictions: a ROC curve needs a continuous score to sweep
# thresholds over, and hard labels collapse it to a single operating point.
preds = modelrf.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_train, preds)
print(f'train AUC: {metrics.auc(fpr,tpr):0.2f}')

train AUC: 0.68


In [None]:
# Score with the predicted probability of the positive class (column 1, label 1)
# rather than hard 0/1 predictions: a ROC curve needs a continuous score to sweep
# thresholds over, and hard labels collapse it to a single operating point.
preds = modelrf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds)
print(f'test AUC: {metrics.auc(fpr,tpr):0.2f}')

test AUC: 0.50


## Get Text Embeddings

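I did this by computing two NumPy arrays, `ans_train` and `ans_test`, which hold the text embeddings for the training and test rows respectively. The code that builds them is not included here, so you must create both arrays before running the next cell; otherwise it fails with a `NameError`.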

## Concat Embeddings to Dataset

I concatenated the embedding arrays column-wise onto the tabular features to create new training and test datasets: `X_trainm` and `X_testnm`.

In [None]:
# Append the embedding columns to the tabular features. Both sides are re-indexed to
# 0..n-1 first so that the column-wise concat pairs rows by position, not by the
# shuffled index left over from the train/test split.
X_trainm = pd.concat(
    [
        X_train.reset_index(drop=True),
        pd.DataFrame(ans_train).reset_index(drop=True),
    ],
    axis="columns",
)
X_testnm = pd.concat(
    [
        X_test.reset_index(drop=True),
        pd.DataFrame(ans_test).reset_index(drop=True),
    ],
    axis="columns",
)

NameError: ignored

In [None]:
##Making sure all the column names are strings
X_trainm.columns = X_trainm.columns.astype(str)
X_testnm.columns = X_testnm.columns.astype(str)

## Model pipeline with Tabular and Text

The preprocessing pipeline is modified so that every numeric column, including the new embedding columns, is passed to the model.

In [None]:
# Categorical columns: fill gaps with a sentinel label, then map each label to an integer code.
categorical_features = ['Division Name', 'Department Name' , 'Class Name']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # A label that was not present when the encoder was fitted (e.g. a rare class that
    # only lands in the test split, or the 'missing' sentinel) is encoded as -1 instead
    # of raising an error at predict time. Requires scikit-learn >= 0.24.
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

##Adding all numeric features
# Select columns by dtype rather than by name, so the embedding columns are picked up
# alongside the original numeric columns without having to list them.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_features = X_trainm.select_dtypes(include=numerics).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
    ])

# Route each column group through its transformer; remainder='drop' discards every
# column that is neither numeric nor one of the listed categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
        ],remainder='drop') #sparse_threshold=0.0

## Model Training with Multimodal (Text + Tabular)

In [None]:
# Rebuild the pipeline around the multimodal preprocessor.
# The forest is seeded (same value as the train/test split) so that repeated runs of
# the notebook produce the same model and the same reported scores.
modelrf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier(random_state=43))])

In [None]:
# Fit the pipeline on the training features augmented with the text-embedding columns.
modelrf.fit(X_trainm, y_train)

In [None]:
# Score with the predicted probability of the positive class (column 1, label 1)
# rather than hard 0/1 predictions: a ROC curve needs a continuous score to sweep
# thresholds over, and hard labels collapse it to a single operating point.
preds = modelrf.predict_proba(X_trainm)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_train, preds)
print(f'train AUC: {metrics.auc(fpr,tpr):0.2f}')

In [None]:
# Score with the predicted probability of the positive class (column 1, label 1)
# rather than hard 0/1 predictions: a ROC curve needs a continuous score to sweep
# thresholds over, and hard labels collapse it to a single operating point.
preds = modelrf.predict_proba(X_testnm)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds)
print(f'test AUC: {metrics.auc(fpr,tpr):0.2f}')