In [143]:
# import libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn import model_selection,metrics,preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense,LSTM,Embedding, SpatialDropout1D,GRU,SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [134]:
# data import from CSV file
data = pd.read_csv('C:/Users/prachi/Downloads/root2ai - Data.csv')

In [96]:
data.head()

Unnamed: 0,Text,Target
0,reserve bank forming expert committee based in...,Blockchain
1,director could play role financial system,Blockchain
2,preliminary discuss secure transaction study r...,Blockchain
3,security indeed prove essential transforming f...,Blockchain
4,bank settlement normally take three days based...,Blockchain


In [97]:
data.describe()

Unnamed: 0,Text,Target
count,22701,22704
unique,20986,11
top,billion,FinTech
freq,41,8551


### Data Preprocessing

In [98]:
data.isna().sum() # check for NA values

Text      3
Target    0
dtype: int64

In [99]:
data.dropna(inplace=True) # dropping rows with NA since there are very less missing

In [110]:
# data cleaning 
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

data['Text'] = data['Text'].apply(clean_text)

In [116]:
data.iloc[789]

Text      make favorable access security
Target                        Blockchain
Name: 789, dtype: object

In [77]:
# splitting data into train set and test set
train_X, test_X, train_y, test_y = model_selection.train_test_split(data['Text'], data['Target'],test_size=0.20, random_state=42)

In [78]:
train_X.shape

(18160,)

In [79]:
train_y

16080           FinTech
9690            FinTech
2065            Bigdata
6824     Cyber Security
3376            Bigdata
              ...      
11964           FinTech
21578          Reg Tech
5390     Cyber Security
860          Blockchain
15795           FinTech
Name: Target, Length: 18160, dtype: object

In [80]:
# Encoding labels to numbers
encoder = preprocessing.LabelEncoder()
train_y_encode = encoder.fit_transform(train_y)
test_y_encode = encoder.fit_transform(test_y)

In [81]:
list(encoder.classes_)

['Bigdata',
 'Blockchain',
 'Cyber Security',
 'Data Security',
 'FinTech',
 'Microservices',
 'Neobanks',
 'Reg Tech',
 'Robo Advising',
 'Stock Trading',
 'credit reporting']

In [82]:
encoder.inverse_transform([2])

array(['Cyber Security'], dtype=object)

## Model Building

In [83]:
def train_model(classifier, train_data, train_label, test_data):
    model = make_pipeline(TfidfVectorizer(), classifier)
    # fit the training dataset on the classifier
    model.fit(train_data, train_label)
    
    # predict the labels on validation dataset
    predictions_train = model.predict(train_data)
    predictions = model.predict(test_data)
    
    
    return metrics.accuracy_score(predictions_train, train_label),metrics.accuracy_score(predictions, test_y_encode)

### Naive Bayes

In [84]:
acc_train,accuracy = train_model(MultinomialNB(),train_X,train_y_encode, test_X)
print('Accuracy of Train data Naive Bayes Model is ',round(acc_train,3))
print('Accuracy of Test data Naive Bayes Model is ',round(accuracy,3))

Accuracy of Train data Naive Bayes Model is  0.576
Accuracy of Test data Naive Bayes Model is  0.534


### Random Forest Classifier

In [85]:
acc_train,accuracy = train_model(RandomForestClassifier(criterion='entropy',max_depth=50,n_estimators=50),train_X, train_y_encode, test_X)
print('Accuracy of Train data Random Forest Model is ',round(acc_train,3))
print('Accuracy of Random Forest Model is ',round(accuracy,3))

Accuracy of Train data Random Forest Model is  0.616
Accuracy of Random Forest Model is  0.514


### Support Vector Machine

In [86]:
acc_train,accuracy = train_model(SVC(kernel='linear'),train_X, train_y_encode, test_X)
print('Accuracy of Train data SVM Model is ',round(acc_train,3))
print('Accuracy of Test data SVM Model is ',round(accuracy,3))

Accuracy of Train data SVM Model is  0.807
Accuracy of Test data SVM Model is  0.679


### Gradient Boosting

In [88]:
acc_train,accuracy = train_model(GradientBoostingClassifier(n_estimators=1000),train_X, train_y_encode, test_X)
print('Accuracy of Train data Gradient Boosting Model is ',round(acc_train,3))
print('Accuracy of Test data Gradient Boosting Model is ',round(accuracy,3))

Accuracy of Train data Gradient Boosting Model is  0.935
Accuracy of Test data Gradient Boosting Model is  0.641


### DNN - LSTM

In [121]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_X)

In [122]:
X_train = tokenizer.texts_to_sequences(train_X)
X_test = tokenizer.texts_to_sequences(test_X)
vocab_size = len(tokenizer.word_index) + 1

In [123]:
print(train_X[2])
print(X_train[2])

preliminary discuss secure transaction study research payment
[127, 1096, 10, 2181, 251, 1, 2068, 1720, 379]


In [124]:
maxlen = 100
EMBEDDING_DIM = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [144]:
model = Sequential()
model.add(Embedding(5000, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(SimpleRNN(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(13, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 10
batch_size = 64

history = model.fit(X_train, train_y_encode, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Train on 16344 samples, validate on 1816 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
