In [4]:
# Required imports
import os
import re
import pandas as pd
from nltk.tokenize import RegexpTokenizer, sent_tokenize
import numpy as np
lp_regex = r"item\s?[^a-zA-Z\n][\d.:]*\s*.\s*legal proceedings.*?\s*item[^a-zA-Z\n]*[4]\s*\.*"

In [5]:
# Function for extracting the required text
def rawdata_extract(path, cikListFile):
    """Extract the "Legal Proceedings" section of every filing listed in a CSV index.

    Parameters
    ----------
    path : str
        Directory that holds the filing text files.
    cikListFile : str
        CSV file (anything ``pandas.read_csv`` accepts) with the columns
        ``cik``, ``company``, ``date`` and ``path``. The fourth
        '/'-separated component of ``path`` is used as the file name to look
        up inside the ``path`` directory.

    Returns
    -------
    pandas.DataFrame
        One row per index entry whose file exists in the directory, with the
        columns ``cik``, ``company``, ``date``, ``path`` and ``lp`` (in that
        order). ``lp`` is the longest match of the module-level ``lp_regex``
        with newline and tab characters removed, or ``""`` when the pattern
        does not match. Index entries without a file on disk are skipped.
    """
    columns = ['cik', 'company', 'date', 'path', 'lp']
    # Compile once instead of handing the pattern string to re.findall per file.
    lp_pattern = re.compile(lp_regex, re.IGNORECASE | re.DOTALL | re.MULTILINE)
    filing_index = pd.read_csv(cikListFile)

    records = []
    for _, row in filing_index.iterrows():
        # Index paths look like 'edgar/data/<cik>/<file>.txt', so component 3
        # is the file name stored in the local directory.
        file_name = row['path'].split('/')[3]
        file_path = os.path.join(path, file_name)

        # Direct existence check instead of rescanning os.listdir(path) for
        # every index row.
        if not os.path.isfile(file_path):
            continue

        # The context manager closes the file; no explicit close() is needed.
        with open(file_path, 'r', encoding='utf-8', errors="replace") as in_file:
            content = in_file.read()

        matches_lp = lp_pattern.findall(content)
        if matches_lp:
            # Several spans can match; keep the longest one.
            lp_text = max(matches_lp, key=len).replace('\n', '').replace('\t', '')
        else:
            lp_text = ""

        records.append({
            'cik': row['cik'],
            'company': row['company'],
            'date': row['date'],
            'path': row['path'],
            'lp': lp_text,
        })

    # Build the frame once: DataFrame.append in a loop is quadratic and no
    # longer exists in pandas >= 2.0. Passing the columns explicitly keeps the
    # column order fixed and keeps the columns present when nothing matched.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Root folder of the project data. The default is the location used when the
# notebook was written; set the SEC_DATA_ROOT environment variable to run it
# against a copy of the data stored elsewhere.
DATA_ROOT = os.environ.get('SEC_DATA_ROOT', '/home/anuj/PycharmProjects/733')

# Directory with the cleaned 10-K text files and the CSV index of filings.
inputDirectory = os.path.join(DATA_ROOT, 'data_08042020', '10-K_clean')
masterFile = os.path.join(DATA_ROOT, 'cikfile_similarity.csv')

# One row per indexed filing found on disk, including its extracted
# "Legal Proceedings" text.
dataList = rawdata_extract(inputDirectory, masterFile)

{'cik': [12927], 'company': ['BOEING CO'], 'date': ['2014-02-14'], 'path': ['edgar/data/12927/0000012927-14-000004.txt'], 'lp': ['item3 . legal proceedingscurrently , involved number legal proceedings . discussion contingencies related legal proceedings , see note 20to consolidated financial statements , hereby incorporated reference.item4 .']}
{'cik': [12927], 'company': ['BOEING CO'], 'date': ['2019-02-08'], 'path': ['edgar/data/12927/0000012927-19-000010.txt'], 'lp': ['item3 . legal proceedingscurrently , involved number legal proceedings . discussion contingencies related legal proceedings , see note 22 consolidated financial statements , hereby incorporated reference.item4 .']}
{'cik': [12927], 'company': ['BOEING CO'], 'date': ['2015-02-12'], 'path': ['edgar/data/12927/0000012927-15-000011.txt'], 'lp': ['item3 . legal proceedingscurrently , involved number legal proceedings . discussion contingencies related legal proceedings , see note 20to consolidated financial statements , he

{'cik': [320193], 'company': ['APPLE INC'], 'date': ['2017-11-03'], 'path': ['edgar/data/320193/0000320193-17-000070.txt'], 'lp': ['item 3. legal proceedingsthe company subject legal proceedings claims fully resolved arisen ordinary course business . opinion management , least reasonable possibility company may incurred material loss , material loss excess recorded accrual , respect loss contingencies asserted legal claims . however , outcome legal proceedings claims brought company subject significant uncertainty . therefore , although management considers likelihood outcome remote , one legal matters resolved company reporting period amounts excess managements expectations , companys consolidated financial statements reporting period could materially adversely affected . see risk factor company could impacted unfavorable results legal proceedings , found infringed intellectual property rights part , item1a form 10-k heading risk factors . company settled certain matters fourth quarte

In [7]:
# Parse the filing dates, then order the filings by company and date so that
# consecutive rows of one company are consecutive filings.
dataList = (
    dataList
    .assign(date=pd.to_datetime(dataList['date']))
    .sort_values(by=['cik', 'date'])
)

In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Shared WordNet lemmatizer used when normalising the extracted text.
lemmatizer = WordNetLemmatizer()

# Score accumulators, initialised empty. In the cells visible here they are
# only referenced from commented-out code.
similarity, soft_similarity = [], []

In [9]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity of two iterables of hashable items.

    Both arguments are converted to sets, so duplicates and order are
    ignored. Note that a string is treated as a collection of characters;
    pass a list of tokens to compare word sets.

    Parameters
    ----------
    query, document : iterable
        The two collections to compare.

    Returns
    -------
    float
        ``len(A & B) / len(A | B)``. Two empty inputs are treated as
        identical and give ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    query_items = set(query)
    document_items = set(document)
    union = query_items | document_items
    if not union:
        # Both inputs are empty: the ratio is 0/0, defined here as 1.0.
        return 1.0
    return len(query_items & document_items) / len(union)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Module-level vectorizer that cosine_generator refits on every call.
# Despite the "count" in its name, this is a TfidfVectorizer.
count_vectorizer1 = TfidfVectorizer()

In [11]:
def cosine_generator(coup_sent):
    """Return the TF-IDF cosine similarity between two documents.

    Parameters
    ----------
    coup_sent : iterable of str
        Exactly two documents. The module-level ``count_vectorizer1`` is
        refitted on them on every call.

    Returns
    -------
    float
        Cosine similarity between the TF-IDF vectors of the two documents.

    Raises
    ------
    ValueError
        If ``coup_sent`` does not contain exactly two documents.
    """
    tfidf_matrix = count_vectorizer1.fit_transform(coup_sent)
    if tfidf_matrix.shape[0] != 2:
        raise ValueError(
            "cosine_generator expects exactly 2 documents, got %d" % tfidf_matrix.shape[0]
        )

    # cosine_similarity accepts the sparse matrix directly, so there is no
    # need to densify it into a DataFrame. That step also relied on
    # get_feature_names(), which newer scikit-learn releases no longer provide.
    pairwise = cosine_similarity(tfidf_matrix)
    return pairwise[0][1]

In [12]:
# GENSIM
import gensim
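# softcossim is only available in gensim 3.x (it was added during the 3.x series and removed in 4.0); install a gensim 3.x release if this import fails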
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess

# Load the 'fasttext-wiki-news-subwords-300' model through the gensim
# downloader API. softcosine_generator uses it to build the term similarity
# matrix.
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

In [13]:
def softcosine_generator(coup_sent):
    """Return the soft cosine similarity between the first two documents.

    The documents in ``coup_sent`` are tokenised with ``simple_preprocess``,
    a gensim ``Dictionary`` is built from all of them, and the term
    similarity matrix is derived from the module-level ``fasttext_model300``.
    The first two documents are then compared as bag-of-words vectors with
    ``softcossim``.

    NOTE(review): ``similarity_matrix`` and ``softcossim`` look like the
    gensim 3.x API; confirm the installed gensim version before upgrading.
    """
    # Tokenise every document once and reuse the tokens below.
    tokenised_docs = [simple_preprocess(text) for text in coup_sent]
    dictionary = corpora.Dictionary(tokenised_docs)

    # Term-to-term similarities taken from the word embeddings.
    term_similarities = fasttext_model300.similarity_matrix(
        dictionary,
        tfidf=None,
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100,
    )

    # Bag-of-words vectors of the two documents being compared.
    first_bow = dictionary.doc2bow(tokenised_docs[0])
    second_bow = dictionary.doc2bow(tokenised_docs[1])

    return softcossim(first_bow, second_bow, term_similarities)

#sentences = [sent_1, sent_2]

In [14]:
# Year-over-year similarity of the "Legal Proceedings" text for selected
# companies. Each filing is compared with the next filing of the same company
# (dataList is sorted by cik and date beforehand).
MAX_PAIRS_PER_COMPANY = 5  # number of consecutive-filing comparisons per company
cik_value = [320193, 19617, 12927]


def _lemmatize_text(text):
    """Tokenize ``text`` with NLTK and join the lemmatized tokens with spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))


similarity_records = []
for cik in cik_value:
    company_filings = dataList[dataList['cik'] == cik]

    # Never index past the last filing: a company with fewer than
    # MAX_PAIRS_PER_COMPANY + 1 filings yields fewer pairs instead of raising
    # IndexError.
    pair_count = max(0, min(MAX_PAIRS_PER_COMPANY, len(company_filings) - 1))

    for pos in range(pair_count):
        # Columns are addressed by name rather than by position, so the code
        # does not depend on the column order of dataList.
        coup_sent = [
            _lemmatize_text(company_filings['lp'].iloc[pos]),
            _lemmatize_text(company_filings['lp'].iloc[pos + 1]),
        ]
        similarity_records.append({
            'cik': company_filings['cik'].iloc[pos],
            'company': company_filings['company'].iloc[pos],
            # Each score is reported under the date of the later filing.
            'date': company_filings['date'].iloc[pos + 1],
            'cosine_similar': cosine_generator(coup_sent),
            'gensim_similar': softcosine_generator(coup_sent),
        })

# Build the frame once: DataFrame.append in a loop is quadratic and no longer
# exists in pandas >= 2.0.
df_final_similarity = pd.DataFrame(
    similarity_records,
    columns=['cik', 'company', 'date', 'cosine_similar', 'gensim_similar'],
)

  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()
  """
  # This is added back by InteractiveShellApp.init_path()


In [15]:
# Show the similarity table; a bare last expression in a cell is displayed.
df_final_similarity

Unnamed: 0,cik,company,date,cosine_similar,gensim_similar
0,320193,APPLE INC,2015-10-28,0.831597,0.924039
1,320193,APPLE INC,2016-10-26,0.576331,0.770554
2,320193,APPLE INC,2017-11-03,0.986393,0.998828
3,320193,APPLE INC,2018-11-05,0.850658,0.941192
4,320193,APPLE INC,2019-10-31,0.945972,0.985818
5,19617,JPMORGAN CHASE & CO,2015-02-24,0.895532,0.969658
6,19617,JPMORGAN CHASE & CO,2016-02-23,1.0,1.0
7,19617,JPMORGAN CHASE & CO,2017-02-28,0.836315,0.912573
8,19617,JPMORGAN CHASE & CO,2018-02-27,0.890109,1.0
9,19617,JPMORGAN CHASE & CO,2019-02-26,0.890109,0.964975
