In [None]:
import os 
# Placeholder path: replace the string with the directory that contains the
# agreement text files. The code below lists and opens files relative to the
# current working directory, so this call decides which files get processed.
os.chdir('input your path to dataset')

In [8]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import re

import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK (needs the NLTK "stopwords" corpus to be
# available locally). NOTE(review): no active code in this section reads
# `stop`; confirm whether it is still needed.
stop = stopwords.words('english')


def ie_preprocess(document):
    """Split *document* into sentences and part-of-speech tag every token.

    Args:
        document: Raw text to analyse.

    Returns:
        A list with one entry per sentence; each entry is the list of
        ``(token, tag)`` pairs returned by ``nltk.pos_tag``.
    """
    tagged_sentences = []
    for sentence in nltk.sent_tokenize(document):
        tokens = nltk.word_tokenize(sentence)
        tagged_sentences.append(nltk.pos_tag(tokens))
    return tagged_sentences


def extract_names(document):
    """Return the distinct multi-word PERSON entities found in *document*.

    The text is sentence-split and POS-tagged by ``ie_preprocess`` and then
    run through NLTK's named-entity chunker. Only chunks labelled ``PERSON``
    are kept.

    Args:
        document: Raw text to analyse.

    Returns:
        A list of name strings in first-seen order, without duplicates.
        Single-word names are discarded.
    """
    candidates = []
    for tagged_sentence in ie_preprocess(document):
        for chunk in nltk.ne_chunk(tagged_sentence, binary=False):
            # Named entities come back as subtrees; ordinary tokens are plain
            # (word, tag) tuples and are skipped by the isinstance check.
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'PERSON':
                candidates.append(' '.join(leaf[0] for leaf in chunk))

    # dict.fromkeys de-duplicates in linear time while keeping first-seen
    # order, replacing the quadratic "if name in list" scan.
    return [
        str(name)
        for name in dict.fromkeys(candidates)
        if len(name.split()) >= 2
    ]


def extract_first_name(document):
    """Return the word that follows the first "Dear" or "Mr." salutation.

    The salutation must start at a word boundary and "Mr." must end in a
    literal period, so look-alike text such as "Mrz Alpha" is not treated as
    a salutation.

    Args:
        document: Raw text to search.

    Returns:
        The single word following the salutation, or ``'unavailable'`` when
        the text contains no such salutation.
    """
    salutation = re.search(r"\b(?:Dear|Mr\.)\s(\w+)", document)
    if salutation is None:
        return 'unavailable'
    return salutation.group(1)

    
def extract_full_name(document):
    """Pick the person names in *document* that best identify the employee.

    When ``extract_first_name`` finds a salutation, only the names from
    ``extract_names`` in which that first name occurs are returned.
    Otherwise every extracted name is returned except those containing
    "Corporation" or "Inc".

    Args:
        document: Raw text of the agreement.

    Returns:
        A list of matching name strings (possibly empty).
    """
    first_name = extract_first_name(document)
    candidates = extract_names(document)

    if first_name != 'unavailable':
        # The first name is used as a regex pattern, as in the original.
        return [
            candidate
            for candidate in candidates
            if re.search(first_name, candidate)
        ]

    return [
        candidate
        for candidate in candidates
        if not re.search("Corporation|Inc", candidate)
    ]

def extract_organization(document):
    """Return the first "... corporation" / "... inc." phrase in *document*.

    The text is lower-cased, whitespace runs are collapsed to single spaces
    and typographic apostrophes are removed before matching, so the result
    is always lower-case.

    Args:
        document: Raw text of the agreement.

    Returns:
        The first matching organization phrase, or ``'unavailable'`` when
        nothing matches.
    """
    normalized = " ".join(document.lower().split()).replace("’", "")

    # Every alternative is lower-case because the text was lower-cased above.
    # The one-word "x corporation" form used to be spelled with a capital C
    # and therefore could never match.
    organization = re.search(
        r"[a-z]+\s[a-z]+\scorporation"
        r"|[a-z]+\scorporation"
        r"|[a-z]+\s[a-z]+[,]*\s*inc[.]"
        r"|[a-z]+[,]*\s*inc[.]",
        normalized,
    )
    if organization is None:
        return 'unavailable'
    return organization.group(0)
    
def extract_base_salary(string):
    """Extract a salary figure from *string*.

    Dollar-prefixed amounts take priority: when any are present the largest
    one is returned as a ``'$<digits>'`` string. Otherwise the first
    comma-grouped number without a dollar sign is returned as an ``int``.

    Args:
        string: Raw text of the agreement.

    Returns:
        ``'$<digits>'`` (str), an ``int``, or the string
        ``'unavailable or in Text Format'`` when no amount is found.
    """
    def _as_int(amount):
        # Strip the currency sign and thousands separators before converting.
        return int(amount.replace('$', "").replace(',', ""))

    dollar_amounts = [
        _as_int(found)
        for found in re.findall(r"(\$\d*[,]*\d*[,]*\d{3})", string)
    ]
    if dollar_amounts:
        return '$' + str(max(dollar_amounts))

    plain_amounts = [
        _as_int(found)
        for found in re.findall(r"(\d*[,]*\d+[,]\d{3})", string)
    ]
    if plain_amounts:
        return plain_amounts[0]
    return 'unavailable or in Text Format'

    
def extract_agreement_date(document):
    """Return the first "<word> <digits>, <digits>" phrase in *document*.

    The text is lower-cased and whitespace runs are collapsed before
    matching, so a date such as "January 1, 2020" comes back as
    "january 1, 2020". The leading word is not checked against month names.

    Args:
        document: Raw text of the agreement.

    Returns:
        The first date-like phrase, or ``'unavailable'`` when none is found.
        Previously the no-match case fell through and returned ``None``.
    """
    normalized = " ".join(document.lower().split())
    first_date = re.search(r"(\w+\s[0-9]+[,]\s[0-9]+)", normalized)
    if first_date is None:
        return 'unavailable'
    return first_date.group(1)


def extract_role(document):
    """Return up to two distinct job titles mentioned in *document*.

    The text is lower-cased and whitespace runs are collapsed before
    matching against a fixed list of titles. Several titles are matched
    together with their leading (and for "vp" also trailing) space, and that
    space is part of the returned string.

    Args:
        document: Raw text of the agreement.

    Returns:
        A list holding the first two distinct titles in order of
        appearance; an empty list when no title is found.
    """
    normalized = " ".join(document.lower().split())

    # All titles are lower-case because the text was lower-cased above; the
    # "chief financial officer" and "chief operating officer" entries used to
    # contain a capital O and therefore could never match. Order matters:
    # earlier alternatives win at the same position.
    titles = (
        "chief financial officer",
        "chief executive officer",
        "chief operations officer",
        "senior vice president",
        "chief operating officer",
        "manager",
        " strategic marketing & development advisor",
        " chief procurement officer",
        " vp ",
        " vice president",
        " business development",
    )
    found = re.findall("|".join(titles), normalized)

    # dict.fromkeys removes repeats while keeping first-seen order.
    return list(dict.fromkeys(found))[:2]

def file_segregation():
    """List the entries of the working directory that are not amendments.

    An entry counts as an amendment when its name matches the amendment
    pattern below. NOTE(review): the trailing "AM" alternative matches any
    name containing the upper-case letters "AM"; confirm that is intended.

    Returns:
        The names from ``os.listdir()`` that do not match the pattern, in
        listing order.
    """
    amendment_pattern = "AMENDMENT|AMENDED|AMEND.|AMENDED|ADDENDUM|AMENDMEDNT|AM"
    return [
        entry
        for entry in os.listdir()
        if re.search(amendment_pattern, entry) is None
    ]

        
        
        
# exporting everything to CSV format


if __name__ == "__main__":
    # Extract one row of features per employment letter found in the working
    # directory and write the table to "Features_employment_agreement.csv".

    column = ['agreement', 'employee_name', 'employer', 'base_salary', 'agreement_date', 'role']

    employment_letter = file_segregation()

    rows = []
    for letters in employment_letter:
        # The context manager closes each file even if reading fails; the
        # handle was previously left open for every letter.
        with open(letters, "r", encoding="utf8") as text_file:
            document = text_file.read()

        rows.append({
            'agreement': letters,
            'employee_name': extract_full_name(document),
            'employer': extract_organization(document),
            'base_salary': extract_base_salary(document),
            'agreement_date': extract_agreement_date(document),
            'role': extract_role(document),
        })

    # Build the frame in one step instead of assigning cell by cell through
    # chained indexing (df[col].loc[i] = ...), which pandas does not
    # guarantee to write back to the frame. The default index is 0..n-1.
    employment_agreement = pd.DataFrame(rows, columns=column)

    employment_agreement.to_csv("Features_employment_agreement.csv")
