### Get the data

In [1]:
from pathlib import Path
import pandas as pd

# Resolve the data folder relative to the current working directory.
current_directory = Path.cwd()
data_directory = current_directory.joinpath('../data/')

# Read the order book CSV and preview its first rows.
orderbook_path = data_directory.joinpath('orderbook.csv')
order_data = pd.read_csv(orderbook_path)
order_data.head(20)

Unnamed: 0,order,OrderType
0,buy 5000 infosys share at market price,BUY
1,sell mahindra 400 unit @ 340,SELL
2,pick 20 tatamotors from market at CMP,BUY
3,add TCS to my portfolio 300 units @ 1200,BUY
4,"L&T purchase @ CMP, quantity 300",BUY
5,invest 50000 in MindTree at market rate,BUY
6,offload 200 positions in infy at day closing p...,SELL
7,3000 TCS shares release at cmp,SELL
8,remove complete position of M&M from my portfo...,SELL
9,Buy Google 560 share at market price,BUY


In [2]:
# Encode the target as an integer label: 'BUY' -> 1, any other value -> 0.
# sklearn's LabelEncoder is the general-purpose tool; a direct comparison is
# enough while there are only two classes.
#
# The comparison is done column-wise (no Python-level loop over rows) and the
# result is cast to int64 explicitly so the label dtype does not depend on the
# platform's default integer width.
order_data['OrderTypeLabel'] = order_data['OrderType'].eq('BUY').astype('int64')
order_data.head(10)

Unnamed: 0,order,OrderType,OrderTypeLabel
0,buy 5000 infosys share at market price,BUY,1
1,sell mahindra 400 unit @ 340,SELL,0
2,pick 20 tatamotors from market at CMP,BUY,1
3,add TCS to my portfolio 300 units @ 1200,BUY,1
4,"L&T purchase @ CMP, quantity 300",BUY,1
5,invest 50000 in MindTree at market rate,BUY,1
6,offload 200 positions in infy at day closing p...,SELL,0
7,3000 TCS shares release at cmp,SELL,0
8,remove complete position of M&M from my portfo...,SELL,0
9,Buy Google 560 share at market price,BUY,1


### Do train/test split

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the orders for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    order_data['order'],
    order_data['OrderTypeLabel'],
    train_size=0.75,
    random_state=42,
)

In [4]:
# Number of training samples produced by the split (train_size=0.75).
len(X_train)

13

In [5]:
# Number of training labels; should equal the number of training samples.
len(y_train)

13

In [6]:
# Number of held-out test samples (the remainder after the 75% training share).
len(X_test)

5

In [7]:
# Number of held-out test labels; should equal the number of test samples.
len(y_test)

5

### Use TfidfVectorizer for feature vectorization (this needs hyperparameter tuning, but we don't have sufficient data)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF with English stop words removed.
# The vocabulary is learned from the training text only.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)
train_matrix = tfidf.fit_transform(X_train)
features = train_matrix.toarray()  # convert the vectorizer output to a dense array

features.shape

(13, 83)

In [5]:
# NOTE(review): this rebinds X_train from the raw order text to the TF-IDF
# feature matrix. After this cell runs, re-running the vectorizer cell would
# feed the numeric matrix back into tfidf.fit_transform instead of text, so
# the train/test split cell must be re-run first. A separate name for the
# vectorized data would make the cells safe to re-run.
X_train = features

### Model Training (there are multiple classification models; a grid search is needed to pick the best one)
### Once we have sufficient training data we can apply that too

In [6]:
from sklearn.svm import LinearSVC

# Linear SVM baseline. The seed is pinned to the same value used for the
# train/test split so that the solver's internal data shuffling (used by the
# dual coordinate-descent solver) is repeatable across runs.
model = LinearSVC(random_state=42)

# Fit on the TF-IDF training features. The trailing ';' hides the estimator
# repr without assigning to `_`, which IPython uses for the last cell output.
model.fit(X_train, y_train);

### Testing Model

In [7]:
# Vectorize the test text with the already-fitted vectorizer (transform only,
# no refit), so the test features use the vocabulary learned from training.
X_test_vectorized = tfidf.transform(X_test)
# Predicted labels use the training encoding: 1 = BUY, 0 = SELL.
y_pred = model.predict(X_test_vectorized)
y_pred

array([1, 0, 0, 1, 1], dtype=int64)

In [8]:
from sklearn.metrics import accuracy_score

# Fraction of test orders whose predicted label matches the true label.
# NOTE(review): the test set is very small, so this score is coarse and
# should not be read as a reliable estimate of real-world accuracy.
accuracy_score(y_test, y_pred)

1.0

### Convert prediction to actual output

In [14]:
# Rebuild the test text as a Series from a plain list: this drops the index
# X_test carried over from the split and gives it a fresh default index.
input_X = pd.Series(X_test.tolist())

# Predictions come back as an array, so this Series also gets a fresh default
# index and lines up position-by-position with input_X.
order_type_prediction = pd.Series(y_pred)

In [17]:
# Pair each test order with its predicted label.
final_output = pd.DataFrame({'order': input_X, 'OrderType': order_type_prediction})

# Decode the numeric label back to text: 1 -> 'BUY', any other value -> 'SELL'.
# Done column-wise (compare, then map the boolean) instead of calling a Python
# lambda once per row.
final_output['OrderType'] = final_output['OrderType'].eq(1).map({True: 'BUY', False: 'SELL'})

final_output

Unnamed: 0,order,OrderType
0,buy 5000 infosys share at market price,BUY
1,sell mahindra 400 unit @ 340,SELL
2,remove complete position of M&M from my portfo...,SELL
3,invest 50000 in MindTree at market rate,BUY
4,add TCS to my portfolio 300 units @ 1200,BUY
