In [1]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
os.getcwd()
entertainment=  pd.read_csv('Dataset/Entertainment/Entertainment_Dataset.csv')
entertainment.columns = ['unnamed', 'values', 'category']
insurance=  pd.read_csv('./Dataset/insurance/insurance_dataset.csv')
insurance.columns = ['unnamed', 'values', 'category']
finance=  pd.read_csv('./Dataset/finance/finance_dataset.csv')
finance.columns = ['unnamed', 'values', 'category']
travel = pd.read_csv('./Dataset/travel/Travel_Dataset.csv')
travel.columns = ['unnamed', 'values', 'category']
medical = pd.read_csv('./Dataset/Medical/medical_dataset.csv')
medical.columns = ['unnamed', 'values', 'category']
l = [insurance, entertainment, finance, travel, medical]

In [2]:
df = pd.concat(l)
df.drop('unnamed', axis = 1)

Unnamed: 0,values,category
0,can you borrow against globe Life Insurancebor...,insurance
1,do Medicare cover my spouseif your spouse have...,insurance
2,what happen when you change homeowner insuranc...,insurance
3,what be a typical renter insurance costI be su...,insurance
4,what be car insurance base oncar insurance rat...,insurance
...,...,...
13195,"([""Intact function of the Forkhead Box P2 (FOX...",medical
13196,"([""Studies on ADHD in educational settings ind...",medical
13197,(['The mechanisms underlying cerebellar learni...,medical
13198,(['Withania somnifera root extract has been us...,medical


In [4]:
from collections import Counter
Counter(df['category'])

Counter({'insurance': 1000,
         'entertainment': 386,
         'finance': 44999,
         nan: 10,
         'travel': 957,
         'medical': 13200})

In [5]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['values'])
print(X.shape)

(60552, 194122)


In [6]:
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X)
print(X.shape)

(60552, 194122)


In [8]:
categories = list(dict(Counter(df['category'])).keys())

In [9]:
mp = {categories[x]:x for x in range(len(categories))}

y_true = []
for x in df['category']:
    y_true.append(mp[x])

In [10]:
import numpy as np
y_true = np.array(y_true)
y_true.shape

(60552,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y_true, test_size=0.2, random_state=42)

In [12]:
# Train the model
clf = LogisticRegression(max_iter = 300)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=300)

In [13]:
# Evaluate the model
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.metrics import roc_auc_score as ras
from sklearn.metrics import confusion_matrix as cm

y_pred = clf.predict(X_test)

accuracy = bas(y_test, y_pred)
print(f"Accuracy: {accuracy}")

conf_m = cm(y_test, y_pred)


y_pred = clf.predict_proba(X_test)
accuracy = ras(y_test, y_pred, multi_class='ovr')
print(f"roc_auc_score: {accuracy}")

print(f"confusion matrix: \n {conf_m}")

Accuracy: 0.794493364741443
roc_auc_score: 0.941083674013868
confusion matrix: 
 [[ 196    0   12    0    0    0]
 [   0   61   12    0    0    0]
 [   3    0 9020    0    0    1]
 [   0    0    3    0    0    0]
 [   0    0    1    0  193    0]
 [   0    0   14    0    0 2595]]


In [14]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)

BernoulliNB()

In [15]:
y_pred = clf.predict(X_test)

accuracy = bas(y_test, y_pred)
print(f"Accuracy: {accuracy}")

conf_m = cm(y_test, y_pred)


y_pred = clf.predict_proba(X_test)
accuracy = ras(y_test, y_pred, multi_class='ovr')
print(f"roc_auc_score: {accuracy}")

print(f"confusion matrix: \n {conf_m}")

Accuracy: 0.33517647000516965
roc_auc_score: 0.9000721546795232
confusion matrix: 
 [[   1    0  206    0    0    1]
 [   5    4   32    0    0   32]
 [  14    5 9001    0    0    4]
 [   0    0    2    0    0    1]
 [   0    0  194    0    0    0]
 [   0    0  120    0    0 2489]]


In [21]:
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
clf = CalibratedClassifierCV(svm.LinearSVC())
clf.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=LinearSVC())

In [22]:
y_pred = clf.predict(X_test)

accuracy = bas(y_test, y_pred)
print(f"Accuracy: {accuracy}")

conf_m = cm(y_test, y_pred)


y_pred = clf.predict_proba(X_test)
accuracy = ras(y_test, y_pred, multi_class='ovr')
print(f"roc_auc_score: {accuracy}")

print(f"confusion matrix: \n {conf_m}")

Accuracy: 0.8862394425190022
roc_auc_score: 0.9953047695525984
confusion matrix: 
 [[ 208    0    0    0    0    0]
 [   0   72    1    0    0    0]
 [   3    0 9018    0    0    3]
 [   0    0    2    1    0    0]
 [   0    0    0    0  194    0]
 [   0    0    4    0    0 2605]]


In [18]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
clf = CalibratedClassifierCV(make_pipeline(StandardScaler(with_mean=False),
                    SGDClassifier(max_iter=1000, tol=1e-3)))
clf.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=Pipeline(steps=[('standardscaler',
                                                       StandardScaler(with_mean=False)),
                                                      ('sgdclassifier',
                                                       SGDClassifier())]))

In [19]:
y_pred = clf.predict(X_test)

accuracy = bas(y_test, y_pred)
print(f"Accuracy: {accuracy}")

conf_m = cm(y_test, y_pred)

y_pred = clf.predict_proba(X_test)
accuracy = ras(y_test, y_pred, multi_class='ovr')
print(f"roc_auc_score: {accuracy}")

print(f"confusion matrix: \n {conf_m}")

Accuracy: 0.34863920991913444
roc_auc_score: 0.9602332341459827
confusion matrix: 
 [[  38    0  170    0    0    0]
 [   0   10   63    0    0    0]
 [   0    0 9024    0    0    0]
 [   0    0    3    0    0    0]
 [   0    0  166    0   28    0]
 [   0    0  971    0    0 1638]]


In [24]:
new_data = [
            'flights from mumbai to japan',
           '5000 rupees dollar to be transferred tommorow',
           'health insurance is realy expensive',
           'the pericardium is swollen',
           'the Lord Of The Rings is a great movie',
           'liver surgery is needed',
           'The largest organ on the body is the skin also known as derma',
            'the train is late again',
            'let us get on the bus',
            'we can go to the train station using the car'
           ]
X_new = vectorizer.transform(new_data)
X_new = tfidf_transformer.transform(X_new)
y_new = clf.predict(X_new)
for x in y_new:
    print(categories[x])

finance
finance
insurance
finance
finance
medical
medical
finance
finance
insurance
