# Long Term Stock Price growth prediction using NLP on 10K Financial Report

#### Problem Description: A 10-K FInancial Report is a comprehensive report which must be ﬁled annually by all publicly traded companies about its ﬁnancial performance. These reports are ﬁled to the US Securities Exchange Commission (SEC). This is even more detailed than the annual report of a company. The 10K documents contain information about the Business' operations, risk factors, selected ﬁnancial data, the Management's discussion and analysis (MD&A) and also Financial Statements and supplementary data. 10-K reports are very important for investors and Warren Buffet consistently cites these reports as a great source of information about a company's potential to succeed. In this competition you are expected to build an NLP pipeline that ingests 10-K reports of various publicly traded companies and build a machine learning model which can uncover the hidden signals to predict the long term stock performance of a company from the 10-K docs. 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


#### loading the require library

In [0]:
import pandas as pd
import numpy as np


#!pip install urllib

import urllib.request as url 

#!pip install bs4
from bs4 import BeautifulSoup as bs

#!pip install wordcloud
from wordcloud import WordCloud

import re
import requests
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, recall_score, precision_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import spacy
import en_core_web_sm
# nlp = en_core_web_sm.load()

# Load the language model
nlp = spacy.load('en_core_web_sm')
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

### extraction of the data from the 10k_links

%%time 
final_data = {'ticker':[], 'cik':[], 'filing_date':[], 'text':[], 'long_term_outlook':[]}

for rownum, row in data.iterrows(): ticker = row['ticker'] cik = row['cik'] long_term_outlook = row['long_term_outlook'] link = row['10k_link'] filing_date = row['filing_date'] try: r = requests.get(link) raw_10k = r.text

    # Regex to find <DOCUMENT> tags
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    # Regex to find <TYPE> tag prceeding any characters, terminating at new line
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
    document = {}
    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]
    full_content = bs(document['10-K'], 'lxml')
    find_text = full_content.select('text')
    #print(full_content.prettify())

    div_text_final = ""   
    for ttext in find_text:
        result = scrub_words(ttext.get_text("\n"))
        div_text_final = div_text_final+" "+result 
    #RiskFactor = scrub_words(RiskFactor)
    if div_text_final != "":
        final_data['ticker'].append(ticker)
        final_data['cik'].append(cik)
        final_data['text'].append(div_text_final)
        final_data['long_term_outlook'].append(long_term_outlook)
        final_data['filing_date'].append(filing_date)
except:
    pass        

#### reading the csv file for train data

In [4]:
%%time
train = pd.read_csv("/content/drive/My Drive/final_ data/train_data_phd.csv")

CPU times: user 8.76 s, sys: 1.44 s, total: 10.2 s
Wall time: 19.6 s


In [5]:
train.head()

Unnamed: 0,ticker,cik,filing_date,10k_link_urls,long_term_outlook,link_text
0,FE,1031296,28-02-2012,https://www.sec.gov/Archives/edgar/data/103129...,0,"\nhtml PUBLIC ""-//W3C//DTD HTML 4.01 Transitio..."
1,CL,21665,19-02-2015,https://www.sec.gov/Archives/edgar/data/21665/...,0,"\nhtml PUBLIC ""-//W3C//DTD HTML 4.01 Transitio..."
2,PRU,1137774,19-02-2016,https://www.sec.gov/Archives/edgar/data/113777...,1,"\nhtml PUBLIC ""-//W3C//DTD HTML 4.01 Transitio..."
3,EBAY,1065088,28-03-2001,https://www.sec.gov/Archives/edgar/data/106508...,1,\n 1\n\n- ----------------------------------...
4,CAM,941548,25-02-2005,https://www.sec.gov/Archives/edgar/data/941548...,1,\ne10vk\n PAGEBREAK \n\n\n \n \nSECURITIES AND...


In [0]:
#  def scrub_words(text):
#     #Replace the "coated links" with space 
#     text = re.sub('"', ' ', text)
    
#     #Replace non ascii / not words and digits
#     text = re.sub("(\\W|\\d)",' ',text)
    
#     #Replace new line characters and following text untill space
#     text = re.sub('\n(\w*?)[\s]', '', text)
    
#     #Remove html markup
#     text = re.sub("<.*?>", ' ', text)
    
#     #Remove extra spaces from the text
#     text = re.sub("\s+", ' ', text)
    
#      #Remove single character's from the text
#     text = re.sub(r"\b[a-zA-Z]\b", "", text)
#     return text

In [7]:
train.shape

(2568, 6)

In [0]:
import copy
print(type(train['link_text']))
original_data = copy.deepcopy(train)
print(train.keys())
print(original_data.keys())

In [10]:
train['link_text'] = [text.strip().lower() for text in train['link_text']]
train['link_text']

0       html public "-//w3c//dtd html 4.01 transitiona...
1       html public "-//w3c//dtd html 4.01 transitiona...
2       html public "-//w3c//dtd html 4.01 transitiona...
3       1\n\n- ---------------------------------------...
4       e10vk\n pagebreak \n\n\n \n \nsecurities and e...
5       html public "-//w3c//dtd html 4.01 transitiona...
6       - --------------------------------------------...
7       - --------------------------------------------...
8       e10vk\n pagebreak \n\n\n \n \nunited states\ns...
9       securities and exchange commission\n          ...
10      1\n                       securities and excha...
11      1\n \n- --------------------------------------...
12      html public "-//w3c//dtd html 4.01 transitiona...
13      securities and exchange commission\n          ...
14      form 10-k\n\nunited states  securities and exc...
15      form 10-k\n\n    united states  securities and...
16      e10vk\n pagebreak \n\n\n\n\n\n\n\n\nunited sta...
17      10-k\n

#### remove of the remove_accented_chars

In [0]:
import unicodedata
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #https://docs.python.org/2/library/unicodedata.html
    return text

In [12]:
%%time
train['link_text'] = [remove_accented_chars(text) for text in train['link_text']]
train['link_text']

CPU times: user 10.7 s, sys: 499 ms, total: 11.2 s
Wall time: 11.2 s


#### removing the scrub_words

In [0]:
def scrub_words(text):
    #Replace the "coated links" with space 
    text = re.sub('"', ' ', text)
    
    #Replace non ascii / not words and digits
    text = re.sub("(\\W|\\d)",' ',text)
    
    #Replace new line characters and following text untill space
    text = re.sub('\n(\w*?)[\s]', '', text)
    
    #Remove html markup
    text = re.sub("<.*?>", ' ', text)
    
    #Remove extra spaces from the text
    text = re.sub("\s+", ' ', text)
    
     #Remove single character's from the text
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    return text

In [14]:
%%time
train['link_text'] = [scrub_words(text) for text in train['link_text']]
train['link_text'] [:5]

CPU times: user 2min 15s, sys: 2.91 s, total: 2min 18s
Wall time: 2min 18s


In [15]:
url_reg  = r'[a-z]*[:.]+\S+'
train['link_text'] = [re.sub(url_reg, '', text) for text in train['link_text']]
train['link_text'][0]

'html public   dtd html transitional en http www  org tr html loose dtd document created using webfilings copyright webfilings llc all rights reserved fe  united states securities and exchange commissionwashington   form  mark one annual report pursuant to section or  of the securities exchange act of for the fiscal year ended december oro transition report pursuant to section or  of the securities exchange act of for the transition period from to commissionfile number registrant state of incorporation address and telephone number    employeridentification no firstenergy corp an ohio corporation south main street akron oh telephone firstenergy solutions corp an ohio corporation   firstenergy corp south main street akron oh telephone ohio edison company an ohio corporation   firstenergy corp south main street akron oh telephone the cleveland electric illuminating company an ohio corporation   firstenergy corp south main street akron oh telephone the toledo edison company an ohio corpora

In [16]:
train['link_text'] = [text.replace('"', '') for text in train['link_text']]
train['link_text'][0]

'html public   dtd html transitional en http www  org tr html loose dtd document created using webfilings copyright webfilings llc all rights reserved fe  united states securities and exchange commissionwashington   form  mark one annual report pursuant to section or  of the securities exchange act of for the fiscal year ended december oro transition report pursuant to section or  of the securities exchange act of for the transition period from to commissionfile number registrant state of incorporation address and telephone number    employeridentification no firstenergy corp an ohio corporation south main street akron oh telephone firstenergy solutions corp an ohio corporation   firstenergy corp south main street akron oh telephone ohio edison company an ohio corporation   firstenergy corp south main street akron oh telephone the cleveland electric illuminating company an ohio corporation   firstenergy corp south main street akron oh telephone the toledo edison company an ohio corpora

In [17]:
%%time
train['link_text'] = train['link_text'].str.replace(r'\d+','')

CPU times: user 15 s, sys: 3.71 ms, total: 15 s
Wall time: 15 s


### checking the word count after removing the scrub_words and the remove_accented_chars

In [18]:
%%time
print("Data Type: ",type(original_data['link_text']))
print("Data Type: ",type(train['link_text']))

print("Length of data: ",len(original_data['link_text'][0]))
print("Length of data: ",len(train['link_text'][0]))

print("Original data: \n",original_data['link_text'][0])
print("\n\n**************************************************************************\n\n")
print("Clean data: \n",train['link_text'][0])

Data Type:  <class 'pandas.core.series.Series'>
Data Type:  <class 'pandas.core.series.Series'>
Length of data:  1204484
Length of data:  1046791
Original data: 
 
html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"
 Document created using WebFilings 1 
 Copyright 2008-2012 WebFilings LLC. All Rights Reserved 
FE-12.31.2011-10K
 UNITED STATES SECURITIES AND EXCHANGE COMMISSIONWASHINGTON, D. C. 20549 FORM 10-K(Mark One)þ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended December 31, 2011ORo TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the transition period from                      to                       CommissionFile Number Registrant; State of Incorporation;Address; and Telephone Number I.R.S. EmployerIdentification No.     333-21011 FIRSTENERGY CORP. 34-1843785  (An Ohio Corporation)    76 South Main Street    Akron, OH 44308    Telepho

In [20]:
%%time
train['word_count'] = [len(text.split(' ')) for text in train['link_text']]
train.head(3)

CPU times: user 6.26 s, sys: 756 ms, total: 7.01 s
Wall time: 7.02 s


In [0]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [22]:
## load spacy's English stopwords as variable called 'stopwords'

stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(stopwords))
print('First ten stop words: %s' % list(stopwords)[:50])
#stopwords.remove('no')
#stopwords.remove('not')

Number of stop words: 326
First ten stop words: ['whole', 'because', 'itself', 'yours', 'used', 'show', 'whatever', 'what', 'all', 'hereby', '’ve', 'whom', 'unless', 'during', 'my', 'never', 'now', 'become', 'really', 'elsewhere', 'above', 'its', 'here', 'call', 'themselves', 'therein', 'upon', 'where', 'behind', 'something', 'does', 'more', 'therefore', 'from', 'am', 'nevertheless', 'side', 'just', 'keep', 'latter', 'has', 'this', 'two', 'along', 'had', 'she', 'ourselves', 'take', 'bottom', 'might']


In [0]:
# import nltk
# from nltk.stem import WordNetLemmatizer
# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# lemmatizer = WordNetLemmatizer()

# def lemmatize_text(text):
#     return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [0]:
# nltk.download('wordnet')
# train['clean_text'] = ""
# train['clean_text'] = train.link_text.apply(lemmatize_text)
# train.head()

In [0]:
# train['clean_text'] = train['clean_text'].apply(lambda x: [item for item in x if item not in stopwords])
# train.head()

In [30]:

train['long_term_outlook'] = train['long_term_outlook'].astype('category')
train.dtypes

ticker                 object
cik                     int64
filing_date            object
10k_link_urls          object
long_term_outlook    category
link_text              object
word_count              int64
dtype: object

In [0]:
X_train, X_val, y_train, y_val = train_test_split(train['link_text'],train['long_term_outlook'], test_size = 0.3,random_state = 123,)

In [32]:
X_train.shape

(1797,)

In [33]:
y_train.head(5)

2503    1
2084    1
1123    0
512     1
981     1
Name: long_term_outlook, dtype: category
Categories (2, int64): [0, 1]

In [34]:
X_val.head()

381      united states securities and exchange commiss...
2306     vk pagebreak united states securities and exc...
289     form  united states securities and exchange co...
402     form k_ htm licensed to firstenergy corp docum...
1567     vk pagebreak united states securities and exc...
Name: link_text, dtype: object

In [35]:
y_val.head()

381     1
2306    1
289     1
402     0
1567    0
Name: long_term_outlook, dtype: category
Categories (2, int64): [0, 1]

In [31]:
# %%time
# from sklearn.feature_extraction.text import TfidfVectorizer

# #define vectorizer parameters
# tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=5000,
#                                  min_df=0.2,
#                                  use_idf=True, ngram_range=(1,4))

# tfidf_matrix = tfidf_vectorizer.fit_transform(train['link_text'])


CPU times: user 13min 44s, sys: 9.41 s, total: 13min 53s
Wall time: 13min 53s


In [0]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [0]:
# temp = list(X_train)
# for i in range(len(temp)):
#   temp[i] = " ".join(temp[i])
# x_tr = pd.DataFrame(temp)

In [83]:
# temp[1]



In [40]:
y_val.head()

381     1
2306    1
289     1
402     0
1567    0
Name: long_term_outlook, dtype: category
Categories (2, int64): [0, 1]

In [0]:
# y_train

In [0]:
# temp = list(X_val)
# for i in range(len(temp)):
#   temp[i] = " ".join(temp[i])
# x_va = pd.DataFrame(temp)

In [0]:
# x_tr.head()

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## pipe line for the model building

In [49]:
%%time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
max_acc = 0
mnb_clf = Pipeline([('vect',CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                          ('tfidf',TfidfTransformer(use_idf=True,norm='l2')),
                          ('clf',MultinomialNB(alpha=1e-2))])
mnb_clf = mnb_clf.fit(X_train, y_train)
predicted = mnb_clf.predict(X_val)
mnb_accuracy = accuracy_score(y_val,predicted)
print("TEST Conf Matrix : \n", confusion_matrix(y_val, predicted))
print("Classification Report on Test Data")
print(classification_report(y_val,predicted,digits=2))
max_acc = mnb_accuracy if max_acc < mnb_accuracy else max_acc
print("\nMultinomial NB: {:.2%}".format(mnb_accuracy))




dt_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                       ('tfidf', TfidfTransformer(use_idf=True)),
                       ('clf', DecisionTreeClassifier(random_state=0, criterion='entropy'))])
dt_clf = dt_clf.fit(X_train, y_train)
predicted = dt_clf.predict(X_val)
dt_accuracy = accuracy_score(y_val,predicted)
print("TEST Conf Matrix : \n", confusion_matrix(y_val, predicted))
print("Classification Report on Test Data")
print(classification_report(y_val,predicted,digits=2))
max_acc = dt_accuracy if max_acc < dt_accuracy else max_acc
print("\nDecision Tree: {:.2%}".format(dt_accuracy))




rf_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                       ('tfidf', TfidfTransformer(use_idf=True)),
                       ('clf', RandomForestClassifier(random_state=0,criterion='entropy'))])
rf_clf = rf_clf.fit(X_train, y_train)
predicted = rf_clf.predict(X_val)
rf_accuracy = accuracy_score(y_val,predicted)
print("TEST Conf Matrix : \n", confusion_matrix(y_val, predicted))
print("Classification Report on Test Data")
print(classification_report(y_val,predicted,digits=2))
max_acc = rf_accuracy if max_acc < rf_accuracy else max_acc
print("\nRandom Forest: {:.2%}".format(rf_accuracy))



#     # et_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
#     #                    ('tfidf', TfidfTransformer(use_idf=True)),
#     #                    ('clf', ExtraTreesClassifier(random_state=0,criterion='entropy'))])
#     # et_clf = et_clf.fit(X_train, y_train)
#     # predicted = et_clf.predict(X_test)
#     # et_accuracy = accuracy_score(y_test,predicted)
#     # max_acc = et_accuracy if max_acc < et_accuracy else max_acc
#     # print("\nExtra Trees: {:.2%}".format(et_accuracy))
svm_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                       ('tfidf', TfidfTransformer(use_idf=True)),
                       ('clf', svm.SVC(kernel='linear'))])
svm_clf = svm_clf.fit(X_train, y_train)
predicted = svm_clf.predict(X_val)
svm_accuracy = accuracy_score(y_val,predicted)
print("TEST Conf Matrix : \n", confusion_matrix(y_val, predicted))
print("Classification Report on Test Data")
print(classification_report(y_val,predicted,digits=2))
max_acc = svm_accuracy if max_acc < svm_accuracy else max_acc
print("\nSVM: {:.2%}".format(svm_accuracy))



TEST Conf Matrix : 
 [[ 32 197]
 [ 65 477]]
Classification Report on Test Data
              precision    recall  f1-score   support

           0       0.33      0.14      0.20       229
           1       0.71      0.88      0.78       542

    accuracy                           0.66       771
   macro avg       0.52      0.51      0.49       771
weighted avg       0.60      0.66      0.61       771


Multinomial NB: 66.02%
TEST Conf Matrix : 
 [[ 81 148]
 [152 390]]
Classification Report on Test Data
              precision    recall  f1-score   support

           0       0.35      0.35      0.35       229
           1       0.72      0.72      0.72       542

    accuracy                           0.61       771
   macro avg       0.54      0.54      0.54       771
weighted avg       0.61      0.61      0.61       771


Decision Tree: 61.09%
TEST Conf Matrix : 
 [[ 54 175]
 [ 86 456]]
Classification Report on Test Data
              precision    recall  f1-score   support

       

In [0]:
# from sklearn.metrics import classification_report


# print("Classification Report on Test Data")
# print(classification_report(y_val,predicted,digits=2))

## reading the test csv file

In [0]:
test = pd.read_csv("/content/drive/My Drive/final_ data/test_data_phd.csv")

In [51]:
test.head()

Unnamed: 0,id,ticker,cik,filing_date,10k_link,link_text
0,0,GD,40533,2/17/2012,https://www.sec.gov/Archives/edgar/data/40533/...,\nForm 10-K\n\n \n\n\n UNITED STATES SECURIT...
1,1,MTB,36270,3/19/1999,https://www.sec.gov/Archives/edgar/data/36270/...,\n\n\n\n UNIT...
2,2,RRC,315852,3/6/2001,https://www.sec.gov/Archives/edgar/data/315852...,\nRange Resources Corporation Form 10-K for 12...
3,3,LH,920148,2/26/2009,https://www.sec.gov/Archives/edgar/data/920148...,\n \nUNITED STATES\nSECURITIES AND EXCHANGE CO...
4,4,SYY,96021,9/16/2004,https://www.sec.gov/Archives/edgar/data/96021/...,\n\n\n- --------------------------------------...


In [52]:
test.shape

(856, 6)

find <DOCUMENT> tags
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    # Regex to find <TYPE> tag prceeding any characters, terminating at new line
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]
    document = {}
    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_10k[doc_start:doc_end]
    full_content = bs(document['10-K'], 'lxml')
    find_text = full_content.select('text')
    #print(full_content.prettify())

    div_text_final = ""   
    for ttext in find_text:
        result = scrub_words(ttext.get_text("\n"))
        div_text_final = div_text_final+" "+result 
    #RiskFactor = scrub_words(RiskFactor)
    if div_text_final != "":
        final_data['ticker'].append(ticker)
        final_data['cik'].append(cik)
        final_data['10k_links'].append(10_links)
        final_data['link_text'].append(div_text_final)
        final_data['long_term_outlook'].append(long_term_outlook)
        final_data['filing_date'].append(filing_date)
except:
    pass        

In [54]:
test['link_text'] = [text.strip().lower() for text in test['link_text']]
test['link_text']

0      form 10-k\n\n   \n\n\n united states securitie...
1      united states\n                       securiti...
2      range resources corporation form 10-k for 12/3...
3      united states\nsecurities and exchange commiss...
4      - --------------------------------------------...
5      form 10-k\nunited states\n                    ...
6      html public "-//w3c//dtd html 4.01 transitiona...
7      e10vk\n pagebreak \n\n begin page width \n xbr...
9      kim20161231_10k.htm\n created by rdg html conv...
10     - --------------------------------------------...
11     html document created with certent disclosure ...
12     form 10-k\n\n   united states  securities and\...
13     html public "-//w3c//dtd html 4.01 transitiona...
14     html public "-//w3c//dtd html 4.01 transitiona...
15     html public "-//w3c//dtd html 4.01 transitiona...
16     e10vk\n pagebreak \n\n begin page width \n xbr...
17     securities and exchange commission\n          ...
18     form 10-k\n\nform 10-k\n

In [0]:
import unicodedata
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #https://docs.python.org/2/library/unicodedata.html
    return text

In [56]:
test['link_text'] = [remove_accented_chars(text) for text in test['link_text']]
test['link_text']

0      form 10-k\n\n   \n\n\n united states securitie...
1      united states\n                       securiti...
2      range resources corporation form 10-k for 12/3...
3      united states\nsecurities and exchange commiss...
4      - --------------------------------------------...
5      form 10-k\nunited states\n                    ...
6      html public "-//w3c//dtd html 4.01 transitiona...
7      e10vk\n pagebreak \n\n begin page width \n xbr...
9      kim20161231_10k.htm\n created by rdg html conv...
10     - --------------------------------------------...
11     html document created with certent disclosure ...
12     form 10-k\n\n   united states  securities and\...
13     html public "-//w3c//dtd html 4.01 transitiona...
14     html public "-//w3c//dtd html 4.01 transitiona...
15     html public "-//w3c//dtd html 4.01 transitiona...
16     e10vk\n pagebreak \n\n begin page width \n xbr...
17     securities and exchange commission\n          ...
18     form 10-k\n\nform 10-k\n

In [0]:
 def scrub_words(text):
    #Replace the "coated links" with space 
    text = re.sub('"', ' ', text)
    
    #Replace non ascii / not words and digits
    text = re.sub("(\\W|\\d)",' ',text)
    
    #Replace new line characters and following text untill space
    text = re.sub('\n(\w*?)[\s]', '', text)
    
    #Remove html markup
    text = re.sub("<.*?>", ' ', text)
    
    #Remove extra spaces from the text
    text = re.sub("\s+", ' ', text)
    
     #Remove single character's from the text
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    return text

In [58]:
%%time
test['link_text'] = [scrub_words(text) for text in test['link_text']]
test['link_text'] [:5]

CPU times: user 48.6 s, sys: 115 ms, total: 48.7 s
Wall time: 48.7 s


In [59]:
%%time
url_reg  = r'[a-z]*[:.]+\S+'
test['link_text'] = [re.sub(url_reg, '', text) for text in test['link_text']]
test['link_text'][0]

CPU times: user 35.2 s, sys: 4.92 ms, total: 35.2 s
Wall time: 35.2 s


In [60]:
%%time
test['link_text'] = [text.replace('"', '') for text in test['link_text']]
test['link_text'][0]

CPU times: user 108 ms, sys: 3 ms, total: 111 ms
Wall time: 110 ms


In [0]:
test['link_text'] = test['link_text'].str.replace(r'\d+','')

In [62]:
%%time
test['word_count'] = [len(text.split(' ')) for text in test['link_text']]
# pd.DataFrame(data['word_count']).describe()
test.head(10)

CPU times: user 2.69 s, sys: 15 ms, total: 2.7 s
Wall time: 2.71 s


In [0]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [64]:
## load spacy's English stopwords as variable called 'stopwords'

stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(stopwords))
print('First ten stop words: %s' % list(stopwords)[:50])
#stopwords.remove('no')
#stopwords.remove('not')

Number of stop words: 326
First ten stop words: ['whole', 'because', 'itself', 'yours', 'used', 'show', 'whatever', 'what', 'all', 'hereby', '’ve', 'whom', 'unless', 'during', 'my', 'never', 'now', 'become', 'really', 'elsewhere', 'above', 'its', 'here', 'call', 'themselves', 'therein', 'upon', 'where', 'behind', 'something', 'does', 'more', 'therefore', 'from', 'am', 'nevertheless', 'side', 'just', 'keep', 'latter', 'has', 'this', 'two', 'along', 'had', 'she', 'ourselves', 'take', 'bottom', 'might']


In [65]:
test.head()

Unnamed: 0,id,ticker,cik,filing_date,10k_link,link_text,word_count
0,0,GD,40533,2/17/2012,https://www.sec.gov/Archives/edgar/data/40533/...,form united states securities and exchange co...,40663
1,1,MTB,36270,3/19/1999,https://www.sec.gov/Archives/edgar/data/36270/...,united states securities and exchange commissi...,38359
2,2,RRC,315852,3/6/2001,https://www.sec.gov/Archives/edgar/data/315852...,range resources corporation form for pagebrea...,29903
3,3,LH,920148,2/26/2009,https://www.sec.gov/Archives/edgar/data/920148...,united states securities and exchange commissi...,43521
4,4,SYY,96021,9/16/2004,https://www.sec.gov/Archives/edgar/data/96021/...,united states securities and exchange commiss...,29013


In [66]:
test.shape

(856, 7)

In [67]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=5000,
                                 min_df=0.2,
                                 use_idf=True, ngram_range=(1,4))

tfidf_matrix = tfidf_vectorizer.fit_transform(test['link_text'])

print(tfidf_matrix.shape)

(856, 5000)
CPU times: user 5min 2s, sys: 4.05 s, total: 5min 6s
Wall time: 5min 6s


In [71]:
print(tfidf_matrix.toarray())

[[0.         0.         0.         ... 0.00559284 0.         0.00194218]
 [0.00424221 0.00107779 0.00108033 ... 0.0033782  0.         0.        ]
 [0.00422969 0.00214921 0.00215429 ... 0.00112274 0.00634216 0.00311909]
 ...
 [0.00274164 0.0069655  0.00698196 ... 0.00582201 0.00513867 0.        ]
 [0.         0.00461202 0.00462291 ... 0.         0.01360971 0.0083666 ]
 [0.00154578 0.01413808 0.01417149 ... 0.         0.         0.00683939]]


In [0]:
rf_sample = rf_clf.predict(test)