In [4]:

# importing required libraries
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import nltk



In [5]:
df = pd.read_csv('../dataset/train_cleaned.csv')

In [6]:
df.head()

Unnamed: 0,id,name,document_text,cat_name
0,22474,information regarding merger navios maritime c...,at special meeting held march 24 2021 sharehol...,Corporate Communications
1,27460,announcement approving change membership 2 fut...,on april 2 2021 china financial futures exchan...,Securities Settlement
2,6926,sfc suspends shiu yau wah five months,the securities futures commission sfc suspende...,Antitrust
3,6982,renminbi rmb haircut february 4 2020,pursuant section 262 clearing house procedures...,Securities Settlement
4,5022,antimoney laundering countering financing terr...,money laundering terrorism financing mltf fina...,Financial Crime


In [7]:
df['text'] = df['name'] + df['document_text']
df = df.drop(['name','document_text'],axis=1)

In [8]:
df.columns

Index(['id', 'cat_name', 'text'], dtype='object')

In [9]:
# tokenizing the text
from nltk.tokenize import word_tokenize
df['text'] = df['text'].apply(word_tokenize)


In [10]:
df.head()

Unnamed: 0,id,cat_name,text
0,22474,Corporate Communications,"[information, regarding, merger, navios, marit..."
1,27460,Securities Settlement,"[announcement, approving, change, membership, ..."
2,6926,Antitrust,"[sfc, suspends, shiu, yau, wah, five, monthsth..."
3,6982,Securities Settlement,"[renminbi, rmb, haircut, february, 4, 2020purs..."
4,5022,Financial Crime,"[antimoney, laundering, countering, financing,..."


In [11]:
# steaming
from nltk.stem import PorterStemmer
def stemmer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in text]

df['text'] = df['text'].apply(stemmer)

In [12]:
 df.head()

Unnamed: 0,id,cat_name,text
0,22474,Corporate Communications,"[inform, regard, merger, navio, maritim, conta..."
1,27460,Securities Settlement,"[announc, approv, chang, membership, 2, futur,..."
2,6926,Antitrust,"[sfc, suspend, shiu, yau, wah, five, monthsth,..."
3,6982,Securities Settlement,"[renminbi, rmb, haircut, februari, 4, 2020purs..."
4,5022,Financial Crime,"[antimoney, launder, counter, financ, terror, ..."


In [13]:
# lemmatization
from nltk.stem import WordNetLemmatizer
def lemmatizer(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

df['text'] = df['text'].apply(lemmatizer)

In [14]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Suraj\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [15]:
# create a pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [16]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])


In [38]:
df.head()

0    inform regard merger navio maritim contain l p...
1    announc approv chang membership 2 futur compan...
2    sfc suspend shiu yau wah five monthsth secur f...
3    renminbi rmb haircut februari 4 2020pursuant s...
4    antimoney launder counter financ terror target...
dtype: object

In [17]:
# train test split
X = df['text']
y = df['cat_name']

In [22]:
def list_to_string(text):
    return ' '.join(text)

X = X.apply(list_to_string)

In [23]:
X.head()

0    inform regard merger navio maritim contain l p...
1    announc approv chang membership 2 futur compan...
2    sfc suspend shiu yau wah five monthsth secur f...
3    renminbi rmb haircut februari 4 2020pursuant s...
4    antimoney launder counter financ terror target...
Name: text, dtype: object

In [35]:
y.head

<bound method NDFrame.head of 0            Corporate Communications
1               Securities Settlement
2                           Antitrust
3               Securities Settlement
4                     Financial Crime
                     ...             
47097            Corporate Governance
47098    Monetary and Economic Policy
47099                           Fraud
47100                      Exemptions
47101                Fees and Charges
Name: cat_name, Length: 47102, dtype: object>

In [36]:
df = X + y

In [37]:
df.head()

0    inform regard merger navio maritim contain l p...
1    announc approv chang membership 2 futur compan...
2    sfc suspend shiu yau wah five monthsth secur f...
3    renminbi rmb haircut februari 4 2020pursuant s...
4    antimoney launder counter financ terror target...
dtype: object

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
predictions = pipeline.predict(X_test)

In [28]:
predictions

array(['Natural Disasters', 'Securities Sales', 'Compliance Management',
       ..., 'Required Disclosures', 'Information Filing',
       'Information Filing'], dtype=object)

In [29]:
len(predictions)

9421

In [30]:
dataframe = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

In [31]:
dataframe.head()

Unnamed: 0,Actual,Predicted
28268,Market Risk,Natural Disasters
29210,Corporate Governance,Securities Sales
23249,Accounting and Finance,Compliance Management
1422,Financial Accounting,Examinations
8940,Delivery,Securities Sales


In [33]:
len(dataframe[dataframe['Actual'] == dataframe['Predicted']])

776

In [34]:
# accuracy
accuracy_score(y_test, predictions)

0.08236917524678909