In [1]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np

In [22]:
stock_info_df = pd.read_csv('stock_complete_info.csv', index_col=0)

In [3]:
# do text feature extraction
# get bigrams tf_idf

company_summary_text = stock_info_df['longBusinessSummary'].values

In [4]:
# remove special characters
def remove_string_special_characters(s):

    # removes special characters with ' '
    stripped = re.sub('[^a-zA-z\s]', '', s)
    stripped = re.sub('_', '', stripped)

    # Change any white space to one space
    stripped = re.sub('\s+', ' ', stripped)

    # Remove start and end white spaces
    stripped = stripped.strip()
    if stripped != '':
            return stripped.lower()

In [5]:
# Stopword removal and stemmer
stop_words = set(stopwords.words('english'))
overused_words = ['company', 'founded', 'inc', 'provide', 'formerly', 'known', 'offer', 'also', '']

ps = PorterStemmer()
for i, line in enumerate(company_summary_text):
    line = remove_string_special_characters(line)
    company_summary_text[i] = ' '.join([ps.stem(x) for x in word_tokenize(line) if ( x not in stop_words ) and (x not in overused_words)])


In [6]:
# tf idf vectorizer
vectorizer = TfidfVectorizer(ngram_range = (2, 2))
tf_idf_text = vectorizer.fit_transform(company_summary_text)
features = vectorizer.get_feature_names()


In [11]:
# Getting top ranking bigrams
sums = tf_idf_text.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0, col] ))
ranking = pd.DataFrame(data1, columns = ['term', 'tf_idf'])
words = (ranking.sort_values('tf_idf', ascending = True))

term_dict = dict.fromkeys(words['term'].values)
term_dict = {" ".join(sorted(key.split(" "))):term_dict[key] for key in term_dict}
term_rem_dups = list(term_dict.keys())
words = words[words['term'].isin(term_rem_dups)]
words = (ranking.sort_values('tf_idf', ascending = False))

top_500_words = words.head(500)
print ("\n\nWords : \n", top_500_words)


del term_dict
del sums
del data1
del ranking
del words
del term_rem_dups



Words : 
                               term      tf_idf
86978                   real estat  102.480369
111380                  unit state   92.949862
93673                segment offer   70.457489
68056                     natur ga   66.116184
68909                     new york   57.819850
...                            ...         ...
8779                  bank financi    7.111382
18328   clinicalstag biopharmaceut    7.102058
34400                     end user    7.100784
47405                    hold oper    7.097708
28341                deposit offer    7.089645

[500 rows x 2 columns]


In [15]:
# convert to df
tf_idf_df = pd.DataFrame(tf_idf_text.toarray(), columns = features)
del tf_idf_text

In [16]:
# keep only top 500
tf_idf_df = tf_idf_df.loc[:, top_500_words['term'].values]

In [33]:
# drop summary text
stock_info_df.drop(columns = 'longBusinessSummary', inplace=True)

In [30]:
# add bigrams as features
stock_info_df = stock_info_df.reset_index().join(tf_idf_df)

In [32]:
# set index back
stock_info_df = stock_info_df.set_index('index')

In [34]:
# convert to .csv
stock_info_df.to_csv('stock_complete_info_bigrams.csv')

