## Importing various required libraries and data.

In [5]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
pd.set_option('display.max_columns',100)

In [6]:
data_0 = pd.read_excel('/content/Training Data.xlsx',header=1)
data_0.head()

Unnamed: 0,Company Name,Exchange:Ticker,Company Type,Company Status,Geographic Locations,Business Description,Industry Classifications,Security Tickers
0,"024 Pharma, Inc. (OTCPK:EEIG)",OTCPK:EEIG,Public Company,Operating,United States of America (Primary),"024 Pharma, Inc. provides healthcare products ...",Beauty Care Products (Primary); Consumer Stapl...,OTCPK:EEIG
1,"1-800-FLOWERS.COM, Inc. (NasdaqGS:FLWS)",NasdaqGS:FLWS,Public Company,Operating,United States of America (Primary),"1-800-Flowers.com, Inc., together with its sub...","Catalog Flowers, Gifts and Novelties (Primary)...",NasdaqGS:FLWS; DB:FWC
2,12 Retech Corporation (OTCPK:RETC),OTCPK:RETC,Public Company,Operating,United States of America (Primary),"12 Retech Corporation, through its subsidiarie...",Information Technology (Primary); Internet Sof...,OTCPK:RETC
3,"1347 Property Insurance Holdings, Inc. (Nasdaq...",NasdaqGM:PIH,Public Company,Operating,United States of America (Primary),"1347 Property Insurance Holdings, Inc., throug...",Casualty (Primary); Financials (Primary); Fire...,NasdaqGM:PIH
4,1847 Holdings LLC (OTCPK:EFSH),OTCPK:EFSH,Public Company,Operating,United States of America (Primary),"1847 Holdings LLC, through its subsidiaries, p...",Commercial and Professional Services (Primary)...,OTCPK:EFSH


### Taking only the required columns and performing some text preprocessing stpes to remove the noisy data.

In [7]:
data_filtered = data_0[['Business Description','Industry Classifications']].copy()
data_filtered.head()

Unnamed: 0,Business Description,Industry Classifications
0,"024 Pharma, Inc. provides healthcare products ...",Beauty Care Products (Primary); Consumer Stapl...
1,"1-800-Flowers.com, Inc., together with its sub...","Catalog Flowers, Gifts and Novelties (Primary)..."
2,"12 Retech Corporation, through its subsidiarie...",Information Technology (Primary); Internet Sof...
3,"1347 Property Insurance Holdings, Inc., throug...",Casualty (Primary); Financials (Primary); Fire...
4,"1847 Holdings LLC, through its subsidiaries, p...",Commercial and Professional Services (Primary)...


In [14]:
print(data_filtered.shape)
print("-"*50)
print(data_filtered.info())
print("-"*50)
print(data_filtered.isnull().sum())

(2002, 2)
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002 entries, 0 to 2001
Data columns (total 2 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Business Description      2002 non-null   object
 1   Industry Classifications  2002 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None
--------------------------------------------------
Business Description        0
Industry Classifications    0
dtype: int64


In [8]:
data_filtered['Industry Classifications'][0]

'Beauty Care Products (Primary); Consumer Staples (Primary); Hair Care Products (Primary); Household and Personal Products (Primary); Personal Products (Primary); Personal Products (Primary); Skin Care Products (Primary); Vitamins and Nutritional Supplements (Primary); Healthcare; Pharmaceutical Products; Pharmaceuticals; Pharmaceuticals; Pharmaceuticals, Biotechnology and Life Sciences'

In [9]:
data_filtered['Business Description'][0]

'024 Pharma, Inc. provides healthcare products worldwide. Its products include vitamin and mineral supplements; stress release, joint, heart health, and weight-loss products; and skin care, hair, and anti-aging products. The company was formerly known as B Green Innovations, Inc. and changed its name to 024 Pharma, Inc. in October 2016. 024 Pharma, Inc. was incorporated in 2004 and is based in West Palm Beach, Florida.'

In [10]:
data_filtered['Industry Classifications'] = data_filtered['Industry Classifications'].apply(lambda x:x.split(';')[0])
data_filtered.head(10)

Unnamed: 0,Business Description,Industry Classifications
0,"024 Pharma, Inc. provides healthcare products ...",Beauty Care Products (Primary)
1,"1-800-Flowers.com, Inc., together with its sub...","Catalog Flowers, Gifts and Novelties (Primary)"
2,"12 Retech Corporation, through its subsidiarie...",Information Technology (Primary)
3,"1347 Property Insurance Holdings, Inc., throug...",Casualty (Primary)
4,"1847 Holdings LLC, through its subsidiaries, p...",Commercial and Professional Services (Primary)
5,"1867 Western Financial Corporation, through it...",Banks (Primary)
6,1mage Software Inc operates in the technology ...,Application Software (Primary)
7,"1PM Industries, Inc. provides consulting servi...",Commercial and Professional Services (Primary)
8,1st Capital Bank provides various banking prod...,Banks (Primary)
9,"1st Colonial Bancorp, Inc. operates as the ban...",Banks (Primary)


In [11]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]-')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
data_filtered['Business Description'] = data_filtered['Business Description'].apply(clean_text)
data_filtered['Business Description'] = data_filtered['Business Description'].str.replace('\d+', '')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
data_filtered['Business Description'][0]

' pharma inc provides healthcare products worldwide products include vitamin mineral supplements stress release joint heart health weightloss products skin care hair antiaging products company formerly known b green innovations inc changed name  pharma inc october   pharma inc incorporated  based west palm beach florida'

### Filtering out the empty rows and dropping them


In [15]:
blanks = []  # start with an empty list

for i,desc,label in data_filtered.itertuples():  # iterate over the DataFrame
    if type(desc)==str:            # avoid NaN values
        if desc.isspace() or desc == "":         # test 'Business Description' for whitespace or empty string
            blanks.append(i)     # add matching index numbers to the list
        
print(len(blanks), 'blanks: ', blanks)

32 blanks:  [262, 508, 666, 728, 753, 775, 866, 885, 892, 1017, 1034, 1158, 1208, 1215, 1232, 1368, 1414, 1481, 1484, 1505, 1551, 1564, 1566, 1593, 1651, 1664, 1675, 1747, 1762, 1774, 1803, 1819]


In [16]:
data_filtered.dropna(axis=0,inplace=True)
data_filtered.drop(blanks, inplace=True)
data_filtered.shape

(1970, 2)

## Splitting our data into training and testing data.

In [17]:
X = data_filtered['Business Description']
y = data_filtered['Industry Classifications']
print(X.shape,y.shape)

(1970,) (1970,)


In [19]:
# y.value_counts()
# for i in y.unique():
#   print(i)
len(y.unique())

201

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [22]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1379,), (591,), (1379,), (591,))

## Creating a custom tokenizer using Spacy library for tokenizing our data through TF-IDF

In [13]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [23]:
tfidf = TfidfVectorizer(tokenizer=spacy_tokenizer,lowercase=True,analyzer='word',stop_words='english',max_df=0.95,min_df=1)

## Fitting our training data on few classifiers and choose the best one

In [24]:
# ML Classifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import MultinomialNB
classifier = RandomForestClassifier(max_depth=None,n_estimators=500,criterion='gini',random_state=42)
# classifier = LogisticRegression(penalty='l2',multi_class='auto',random_state =42)
# classifier = MultinomialNB()
# classifier = DecisionTreeClassifier(criterion='gini',random_state=42)
# Create pipeline using TF-IDF
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f1b2a09dc50>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.95, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_wo...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_im

In [25]:
pipe.score(X_train,y_train)

1.0

## Predicting on our test data and printing the Classification report along with score. 

In [26]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
# print("Random Forest Accuracy:",metrics.accuracy_score(y_test, predicted))
# print("Random Forest Precision:",metrics.precision_score(y_test, predicted))
# print("Random Forest Recall:",metrics.recall_score(y_test, predicted))
# print("Confusion Matrix \n:",metrics.confusion_matrix(y_test,predicted))
print(pipe.score(X_test,y_test))
print("-"*50)
print("Classification Report \n:",metrics.classification_report(y_test,predicted))


0.43824027072758037
--------------------------------------------------
Classification Report 
:                                                             precision    recall  f1-score   support

                                     Accessories (Primary)       0.00      0.00      0.00         1
                                Accessory Stores (Primary)       0.00      0.00      0.00         1
                              Accident Insurance (Primary)       0.00      0.00      0.00         1
                             Accounting Software (Primary)       0.00      0.00      0.00         1
                          Adhesives And Sealants (Primary)       0.00      0.00      0.00         2
                                     Advertising (Primary)       1.00      0.20      0.33         5
                           Aerospace and Defense (Primary)       1.00      0.14      0.25         7
                           Agricultural Products (Primary)       0.00      0.00      0.00         1
   

  _warn_prf(average, modifier, msg_start, len(result))


## Importing the Test data file and performing preprocessing steps and using our classifier to predict for the classes.

In [27]:
testing_data = pd.read_excel('/content/Company_ Business Description.xlsx')
testing_filtered = testing_data[['Business Description']].copy()
testing_filtered.head()

Unnamed: 0,Business Description
0,"024 Pharma, Inc. provides healthcare products ..."
1,"1-800-Flowers.com, Inc., together with its sub..."
2,"12 Retech Corporation, through its subsidiarie..."
3,"1347 Property Insurance Holdings, Inc., throug..."
4,"1847 Holdings LLC, through its subsidiaries, p..."


In [28]:
testing_filtered['Business Description'] = testing_filtered['Business Description'].apply(clean_text)
testing_filtered['Business Description'] = testing_filtered['Business Description'].str.replace('\d+', '')
testing_filtered.head()

Unnamed: 0,Business Description
0,pharma inc provides healthcare products world...
1,flowerscom inc together subsidiaries provides ...
2,retech corporation subsidiaries operates inte...
3,property insurance holdings inc subsidiaries ...
4,holdings llc subsidiaries provides range prod...


In [29]:
# Predicting with a test dataset
test_predicted = pipe.predict(testing_filtered['Business Description'])

In [None]:
testing_filtered['Business Description'].shape,test_predicted.shape

((8425,), (8425,))

## Combining the predicted labels with our original dataset and saving the file.

In [31]:
testing_data['Predicted Industry ClasClassification'] = test_predicted
testing_data.head()

Unnamed: 0,Company Name,Exchange:Ticker,Company Type,Company Status,Geographic Locations,Business Description,Security Tickers,Predicted Industry ClasClassification
0,"024 Pharma, Inc. (OTCPK:EEIG)",OTCPK:EEIG,Public Company,Operating,United States of America (Primary),"024 Pharma, Inc. provides healthcare products ...",OTCPK:EEIG,Beauty Care Products (Primary)
1,"1-800-FLOWERS.COM, Inc. (NasdaqGS:FLWS)",NasdaqGS:FLWS,Public Company,Operating,United States of America (Primary),"1-800-Flowers.com, Inc., together with its sub...",NasdaqGS:FLWS; DB:FWC,"Catalog Flowers, Gifts and Novelties (Primary)"
2,12 Retech Corporation (OTCPK:RETC),OTCPK:RETC,Public Company,Operating,United States of America (Primary),"12 Retech Corporation, through its subsidiarie...",OTCPK:RETC,Banks (Primary)
3,"1347 Property Insurance Holdings, Inc. (Nasdaq...",NasdaqGM:PIH,Public Company,Operating,United States of America (Primary),"1347 Property Insurance Holdings, Inc., throug...",NasdaqGM:PIH,Casualty (Primary)
4,1847 Holdings LLC (OTCPK:EFSH),OTCPK:EFSH,Public Company,Operating,United States of America (Primary),"1847 Holdings LLC, through its subsidiaries, p...",OTCPK:EFSH,Commercial and Professional Services (Primary)


In [32]:
testing_data.to_csv('predicted_data_file.csv',index=False)

# Well done!!! Good Job...