In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
#cd 10k Scraping

In [3]:
from tika import parser

In [4]:
statements = ['whirlpool_2005.pdf',  'whirlpool_2010.pdf',  'whirlpool_2015.pdf',
'whirlpool_2001.pdf',  'whirlpool_2006.pdf',  'whirlpool_2011.pdf',  'whirlpool_2016.pdf',
'whirlpool_2002.pdf',  'whirlpool_2007.pdf',  'whirlpool_2012.pdf',  'whirlpool_2017.pdf',
'whirlpool_2003.pdf',  'whirlpool_2008.pdf',  'whirlpool_2013.pdf',  'whirlpool_2018.pdf',
'whirlpool_2004.pdf',  'whirlpool_2009.pdf',  'whirlpool_2014.pdf',  'whirlpool_2019.pdf']

In [5]:
raw = []
for k in statements:
    temp = parser.from_file(k)
    temp = temp['content']
    temp = re.sub("\b\d+\b", " ", temp) # remove digits
    temp = re.sub("\.", " ", temp) # remove dots
    temp = re.sub("\([A-Z]\)", " ", temp) # remove (LETTER)
    temp = re.sub("\([0-9]\)", " ", temp) # remove (NUMBER)
    temp = re.sub("\n", " ", temp) # remove "\n"
    raw.append(temp)

In [6]:
raw[1]

'                                   10-K 1 d10k htm FORM 10-K   UNITED STATES SECURITIES AND EXCHANGE  COMMISSION Washington, D C  20549  FORM 10-K (Mark One)     xx  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE       SECURITIES EXCHANGE ACT OF 1934  For the fiscal year ended December 31, 2009  OR     ¨̈  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE       SECURITIES EXCHANGE ACT OF 1934  For the transition period from              to              Commission file number 1-3932  WHIRLPOOL CORPORATION (Exact name of registrant as specified in its charter)     Delaware  38-1490038 (State of Incorporation)  (I R S  Employer Identification No )  2000 North M-63, Benton Harbor, Michigan  49022-2692 (Address of principal executive offices)  (Zip Code)  Registrant’s telephone number, including area code (269) 923-5000  Securities registered pursuant to Section 12(b) of the Act:    Title of each class  Name of each exchange on which registered Common stock, par value $1 00 pe

In [7]:
len(raw)

19

In [8]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

In [9]:
sim_scores = []

counter_a = 0
counter_b = 1

while counter_a != len(raw):
    while counter_b != len(raw):
        sim_scores.append(get_cosine_sim(raw[counter_a],raw[counter_b])[0][1])
        counter_a += 1
        counter_b += 1
    break

In [10]:
sim_scores = pd.Series(sim_scores)

In [11]:
years = np.arange(2002,2020)

In [12]:
years = pd.Series(years)

In [13]:
df = pd.DataFrame({'Year' : years, 'Similiarity Score' : sim_scores})

In [14]:
df = df[['Year', 'Similiarity Score']]

In [15]:
df.head()

Unnamed: 0,Year,Similiarity Score
0,2002,0.93807
1,2003,0.977853
2,2004,0.90377
3,2005,0.931641
4,2006,0.950061


In [16]:
prices = pd.read_csv('whp_closing_prices.csv')

In [17]:
prices = prices.sort_values(by = 'Date')

In [18]:
prices = prices.iloc[1:,:]

In [19]:
prices.head()

Unnamed: 0,Date,Close*
17,1-Jan-02,72.7
16,1-Jan-03,51.97
15,1-Jan-04,75.95
14,1-Jan-05,68.26
13,1-Jan-06,80.68


In [20]:
prices.columns = ['Date', 'Close']

In [21]:
prices.reset_index(inplace = True)

In [22]:
prices.drop('index', axis = 1, inplace = True)

In [23]:
prices.head()

Unnamed: 0,Date,Close
0,1-Jan-02,72.7
1,1-Jan-03,51.97
2,1-Jan-04,75.95
3,1-Jan-05,68.26
4,1-Jan-06,80.68


In [24]:
prices.drop('Date', axis = 1, inplace = True)

In [25]:
prices.head()

Unnamed: 0,Close
0,72.7
1,51.97
2,75.95
3,68.26
4,80.68


In [26]:
df.set_index('Year', inplace = True)

In [27]:
df = df.pct_change()

In [28]:
df = df.iloc[1:,:]

In [29]:
df.reset_index(inplace = True)

In [30]:
df['Similiarity Score'] = df['Similiarity Score'].abs()

In [31]:
df.head()

Unnamed: 0,Year,Similiarity Score
0,2003,0.042409
1,2004,0.07576
2,2005,0.030839
3,2006,0.019771
4,2007,0.029565


In [32]:
prices = prices.pct_change()

In [33]:
prices = prices.iloc[1:,:]

In [34]:
prices.reset_index(inplace = True)

In [35]:
prices.drop('index', axis = 1, inplace = True)

In [36]:
prices['Close'] = prices['Close'].abs()

In [37]:
prices['Close'] = prices['Close'].apply(lambda x: x*100)

In [38]:
df['Close'] = prices

In [39]:
df['Similiarity Score'] = df['Similiarity Score'].apply(lambda x : x*100)

In [40]:
df.head(10)

Unnamed: 0,Year,Similiarity Score,Close
0,2003,4.240911,28.514443
1,2004,7.576023,46.142005
2,2005,3.083872,10.125082
3,2006,1.977116,18.195136
4,2007,2.956528,13.324244
5,2008,17.01844,6.71552
6,2009,3.149251,60.804315
7,2010,15.271058,124.887825
8,2011,1.586299,13.727055
9,2012,7.659905,36.467836


In [41]:
df['Similiarity Score'].corr(df['Close'])

0.14924348501962417