In [1]:
# importing required packages
from data_extraction import extract_url_data # script to crawl data from url
import pandas as pd
import itertools
from collections import ChainMap
from bs4 import BeautifulSoup
import requests
import re
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options 
from selenium import webdriver
import json
import pycountry
from geotext import GeoText


In [2]:
# importing required data from ISR for automation
tool_df = pd.read_csv(r"D:\Threat_intel_services\TIP_project\ISR\tool_data.csv")
malware_df = pd.read_csv(r"D:\Threat_intel_services\TIP_project\ISR\malware_data.csv")
techniques = pd.read_csv(r"D:\Threat_intel_services\TIP_project\ISR\techniques_data.csv")
groups = pd.read_csv(r"D:\Threat_intel_services\TIP_project\ISR\groups_data.csv")
groups['Name'] = groups['Name'].str.lower()
#importing csv file which contains information about the external references
threat_actor_external_link_data=pd.read_csv(r"D:\Threat_intel_services\TIP_project\ISR\threat_actor_ext_links_data_raw.csv")

In [3]:
#below script is used to find the attack patterns,softwares,tool related to each and every threat group
associated_group=pd.DataFrame()
attacks=pd.DataFrame()
softwares=pd.DataFrame()
for index,row in groups.iterrows():
    
    tables=pd.read_html(row['link']) # crawling tables from the threat group url
    if len(tables) > 0: # if any tables found proceed
        
            r = requests.get(row['link'])
            soup = BeautifulSoup(r.text, 'html.parser')
            table_name = soup.findAll('h2',class_='pt-3')
            table_name = [i.text for i in table_name[:-1]]
            
            if len(tables) == len(table_name):
                
                if 'Techniques Used' in table_name:
                    df_1=tables[table_name.index('Techniques Used')]
                    df_1['group']=row['Name']
                    attacks=attacks.append(df_1)
                if  'Software' in table_name:
                    df_2=tables[table_name.index('Software')]
                    df_2['group']=row['Name']
                    softwares=softwares.append(df_2)
                    
               

In [4]:
# below is the function to find the deatils about particular group in mitre by using above dataframes related to each group
def details_group(group_name):
    
    group_technique=attacks[attacks['group']==group_name]
    group_softwares=softwares[softwares['group']==group_name]
    ids=list(group_technique['ID'])
    name=list(group_softwares['Name'])

    
    group_techniques=techniques[techniques['ID'].isin(ids)].reset_index().drop('index',axis=1)
    group_malware = malware_df[malware_df['Name'].isin(name)].reset_index().drop('index',axis=1)
    group_tool = tool_df[tool_df['Name'].isin(name)].reset_index().drop('index',axis=1)
    
    
    return group_techniques,group_malware,group_tool   

In [5]:
selected_groups = ['APT1','APT12','BlackOasis','Cobalt Group', 'APT28','apt33','sowbug','fin4']
selected_groups = [i.lower() for i in selected_groups]

In [6]:
# below funtion is used to find the hash values in a text by regex patterns
def extract_hashes(text):
    regex_list = {

    'wordpress_md5': '\$P\$[\w\d./]+',
    'phpBB3_md5': '\$H\$[\w\d./]+',
    'sha1':  '(?<!\w)[a-fA-F\d]{40}(?!\w)',
    'md5':  '(?<!\w)[a-fA-F\d]{32}(?!\w)',
    'sha256':  '(?<!\w)[a-fA-F\d]{64}(?!\w)',
    'sha512':  '(?<!\w)[a-fA-F\d]{128}(?!\w)',
    'mysql':  '(?<!\w)[a-fA-F\d]{16}(?!\w)',
    'mysql5': '\*[A-F\d]{40}'

    }

    result = {}

    for format in regex_list.keys():
        hashes = []
        regex = re.compile(regex_list[format])
        hashes = regex.findall(text,re.I)
        if hashes:
            result[format] = hashes
    all_hashes=[]
    for i in result.keys():
        hash=['File-'+i.upper()+'--'+j for j in result[i]]
        all_hashes.append(hash)

    all_hashes=list(itertools.chain(*all_hashes))
    return all_hashes

In [7]:
def extract_ip(text):
    ip=[]
    p= re.compile("(\d{1,3}(\[\.\]|\.)\d{1,3}(\[\.\]|\.)\d{1,3}(\[\.\]|\.)\d{1,3})")
    result = p.findall(text)
    for i in result:
        ip.append(i[0].replace('[','').replace(']',''))
    return ip

In [8]:
# below function is used to find the cve_id in text
def CVE_ID(text):

    pattern= 'CVE-\d{4}-\d{4,7}'
    result = re.findall(pattern, text ,re.I)
    result=set(result)
    return (result)

In [9]:
# below function acts as API and crawls the cve_id description from NVD database 
# this function expects cve_id in a list
def cve_id_details(cve):
    details=dict()
    if len(cve) > 0: # checking for CVE id it proceeds if only hashes present
        for cve_id in cve:
            link = 'https://nvd.nist.gov/vuln/detail/'+cve_id.upper()
            response = requests.get(link)
            html = BeautifulSoup(response.text,'html.parser')
            try:
                description =html.find('p',attrs={'data-testid':'vuln-description'}).text
            except AttributeError:
                description =None
            details[cve_id.upper()]=description
        df=pd.DataFrame(details.items(),columns=['name','description'])
        df.dropna(inplace=True)
    else:
        df=pd.DataFrame(data=None,columns=['name','description'],index=[0])
    return df
        

In [10]:
# below function to create a DF of indicators 
# this function expects hashes as 
def observables(ioc):
    if len(ioc)>0: # checking for hashes it proceeds if only hashes present
        df = pd.DataFrame(ioc)
        df[0]=df[0].str.lower()
        df[['type','observable_value']] = df[0].str.split("--",expand=True) 
        df.drop(columns=0,axis=1,inplace=True)
    else:
        df=pd.DataFrame(data=None,columns=['type','observable_value'],index=[0])
    return df

In [11]:
# below function to create a DF of indicators 
# this function expects hashes as 
def identity(identities):
    if len(identities)>0: # checking for hashes it proceeds if only hashes present
        df = pd.DataFrame(identities)
        df[0]=df[0].str.lower()
        df[['type','name']] = df[0].str.split("-",expand=True) 
        df.drop(columns=0,axis=1,inplace=True)
    else:
        df=pd.DataFrame(data=None,columns=['type','name'],index=[0])
    return df

In [12]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


nlp = spacy.load('en_core_web_sm')
new_ruler = EntityRuler(nlp).from_disk(r"spacy/sector_corpus.jsonl")
nlp.add_pipe(new_ruler)

In [21]:
selected_groups = ['naikon']

In [35]:
tool_df_col=malware_df.columns 
tech_df_col= techniques.columns

# headings dictionary according to opencti requirements
headings = {'Name':'name','Description':'description','Associated Software':'alias','Associated Groups':'alias','ID':'external_id','External_references':'external_references'}
not_cities=['Most','Police','March','Of','Mobile','city-Of','Same','Manage','Best','University','Enterprise','Along','Date','Buy']
# specifying directory to store extracted details of a threat group
parent_dir=r"D:\Threat_intel_services\TIP_project\ISR\automated_collected_mitre_data"
# iterating groups dataframe

report_col=['name','description','published','Internal Report','object_status','source_confidence_level','marking_definitions','created_by_ref']


for index,row in groups.iterrows():

    cve_details=[]
    all_hashes=[]
    all_ip=[]
    countries=[]
    sectors=[]
    org=[]
    cities=[]
    if row['Name'] in selected_groups: # checking for selected threat groups
        
        report_df=pd.DataFrame(data=None,columns=report_col,index=[0])
        print(report_df['name'][0])
        report_df['name']='fake_'+row['Name']
        report_df['description'][0]='Sample description regrding '+str(row['Name'])
        report_df['Internal Report'][0] ='Internal Report'
        report_df['object_status'][0] =0
        report_df['source_confidence_level'][0] =1
        
        threat_Actor=groups[groups['Name']==row['Name']]
        tech,malware,tool=details_group(group_name=row['Name']) # this function returns techniques,tool and malware for particualr group
       
        if len(malware) !=0: # if malwares details present write to DF else write to empty DF
            mal_data=malware
        elif len(malware)==0:          
            mal_data=pd.DataFrame(data=None,columns=tool_df_col,index=[0])

        if len(tool) !=0: # if tools details present write to DF else write to empty DF
            tool_data=tool
        elif len(tool)==0:
            tool_data=pd.DataFrame(data=None,columns=tool_df_col,index=[0])

        if len(tech) !=0: # if techniques details present write to DF else write to empty DF
            tech_data=tech
        elif len(tech)==0:
            tech_data=pd.DataFrame(data=None,columns=tech_df_col,index=[0])
        
        
        directory=row['Name']
        path = os.path.join(parent_dir, directory) # joining the path to be stored and new group name folder to store details
        if  not os.path.isdir(path): # creates a path if not exists
            os.mkdir(path)
        
        links =row['external_references'].split('\n') # splitting of rows
        initial = True
        for link in links:
            # importing the data for particular link in threat_actor_external_link_data
            if initial:
                text = row['Description']
                print(text)
                initial = False
            else:
                if link in list(threat_actor_external_link_data['url']):
                    text = list(threat_actor_external_link_data['data'][threat_actor_external_link_data['url']==link])[0]
                
            if str(type(text))!= '<class \'float\'>':
                
                doc=nlp(text)
                for ent in doc.ents:
                        if ent.label_=='ORG':
                            org.append(ent.text.lower())
                        if ent.label_=='sector':
                            sectors.append(ent.text.lower())
                        
                        countries.append(GeoText(text).countries)
                        cities.append(GeoText(text).cities)
                 # if data present  for particular link
                
                temp_cve=CVE_ID(text) # extracting cve_id in text
                d=extract_hashes(text) # extracting hashes in text
                ip = extract_ip(text)
                all_hashes.append(d)
                cve_details.append(temp_cve)
                all_ip.append(ip)
                
        cve_details= list(itertools.chain(*cve_details))
        cve_details=[i.upper() for i in cve_details if len(i)>0]
        
        all_hashes= list(itertools.chain(*all_hashes))
        all_hashes=[i for i in all_hashes if len(i)>0]
        
        all_ip = list(itertools.chain(*all_ip))
        all_ip=['IP--'+str(i) for i in all_ip if len(i)>0]
        
        cve_details = list(set(cve_details))
        all_hashes = list(set(all_hashes))
        all_ip = list(set(all_ip))
        
        
        vulnerability = cve_id_details(cve_details) # crawls cve details from NVD database and create a DF
        indicators = observables(all_hashes) # create  a indiator DF
        indicators = indicators.append(observables(all_ip))
        
        countries = list(itertools.chain(*countries))
        countries=['country-'+str(i).replace('-','') for i in set(countries)]
        cities = list(itertools.chain(*cities))
        cities=['city-'+str(i).replace('-','') for i in set(cities) if i not in not_cities]
        sectors=['sector-'+str(i).replace('-','') for i in set(sectors)]
        org=['organization-'+str(i).replace('-','') for i in set(org)]
        identities = identity(sectors)
        identities = identities.append(identity(countries))
        identities = identities.append(identity(cities))
        
        
        report_df.to_csv(path+os.sep+'report.csv',index=False)
        
        TA=pd.DataFrame(data=None,columns=['name','description','alias','goal','sophistication','resource_level','primary_motivation','secondary_motivation','personal_motivation','created_by_ref','marking_definitions'])
        threat_Actor.rename(columns=headings,inplace=True)
        threat_Actor = TA.append(threat_Actor)
        threat_Actor.to_csv(path+os.sep+'threat_actor.csv',index=False)
        
        AP=pd.DataFrame(data=None,columns=['name','description','alias','platform','required_permission','external_id','created_by_ref','marking_definitions'])
        tech_data.rename(columns=headings,inplace=True)
        tech_data = AP.append(tech_data)
        tech_data.to_csv(path+os.sep+'attack_pattern.csv',index=False)
        
        malware = pd.DataFrame(data=None,columns=['name','description','alias','created_by_ref','marking_definitions'])
        mal_data.rename(columns=headings,inplace=True)
        mal_data = malware.append(mal_data)
        mal_data.to_csv(path+os.sep+'malwares.csv',index=False)
        
        tools = pd.DataFrame(data=None,columns=['name','description','alias','created_by_ref','marking_definitions'])
        tool_data.rename(columns=headings,inplace=True)
        tool_data = tools.append(tool_data)
        tool_data.to_csv(path+os.sep+'tools.csv',index=False)
        
        vulnerability['alias']=None
        vulnerability['marking_definitions']=None
        vulnerability['external_references']=None
        vulnerability.to_csv(path+os.sep+'vulnearbility.csv',index=False)
        
        indicators.to_csv(path+os.sep+'indicators.csv',index=False)
        
        #print(countries)
        print(cities)
        Identity = pd.DataFrame(data=None,columns=['type','name','description','alias','created_by_ref','marking_definitions'])
        Identity = Identity.append(identities)
        Identity.to_csv(path+os.sep+'identity.csv',index=False)

nan
Naikon is a threat group that has focused on targets around the South China Sea. The group has been attributed to the Chinese People’s Liberation Army’s (PLA) Chengdu Military Region Second Technical Reconnaissance Bureau (Military Unit Cover Designator 78020). While Naikon shares some characteristics with APT30, the two groups do not appear to be exact matches.
['city-Zhongxing', 'city-Yangon', 'city-Beijing', 'city-Obama', 'city-Shenyang', 'city-Bangkok', 'city-Denver', 'city-Santa Ana', 'city-ShangriLa', 'city-Lanzhou', 'city-Phnom Penh', 'city-Bandung', 'city-Jakarta', 'city-Washington', 'city-Worms', 'city-Seoul', 'city-Putrajaya', 'city-Jinan', 'city-Tokyo', 'city-Guangzhou', 'city-Chengdu', 'city-Asia', 'city-Kunming', 'city-Nanjing', 'city-New York', 'city-Hanoi']


In [None]:
not_cities=['Most','Police','March','Of','Mobile','city-Of','Same','Manage','Best','University','Enterprise','Along','Date','Buy']

In [36]:
mal_data=pd.read_csv(r"D:\Threat_intel_services\TIP_project\ISR\malware_pdf\malware_raw_data_1.1.csv")

In [39]:
def extract_indicator_and_vulnerability(df,data_df):
    
    headings = {'Name':'name','Description':'description','Associated Software':'alias','Associated Groups':'alias','ID':'external_id','External_references':'external_references'}
    df.rename(columns=headings,inplace=True)
    data_heading ={'0':'url','1':'data'}
    data_df.rename(columns=data_heading,inplace=True)
    
    df['CVE']=None
    df['ip']=None
    df['Hash']=None
    for index,row in df.iterrows():
#         if count<1:
            links=row['external_references'].split('\n')
            initial=True
            cve=[]
            ip=[]
            all_hashes=[]
            for link in links:
                # importing the data for particular link in threat_actor_external_link_data
                if initial:
                    text = row['description']
                    initial = False
                else:
                    if link in list(data_df['url']):
                        text = list(data_df['data'][data_df['url']==link])[0]

                if str(type(text))!= '<class \'float\'>':
                        cve_details=CVE_ID(text)
                        ip_details = extract_ip(text)
                        hashes = extract_hashes(text)
                        if len(cve_details) >0:
                            cve.extend(list(cve_details))
                        if len(ip_details) >0:
                            ip.extend(list(ip_details))
                        if len(hashes) >0:
                            all_hashes.extend(list(hashes))
                            
            if len(cve)>0:
                df['CVE'][index]=str(list(set(cve))).replace('[','').replace(']','')
          
            if len(ip)>0:
                df['ip'][index]=str(list(set(ip))).replace('[','').replace(']','')
            
            if len(all_hashes)>0:
                df['Hash'][index]=str(list(set(all_hashes))).replace('[','').replace(']','')

    return df

In [40]:
a= extract_indicator_and_vulnerability(malware_df,mal_data)

In [44]:
a[a['CVE'].notnull()]

Unnamed: 0,name,alias,description,type,external_references,link,CVE,ip,Hash
4,ADVSTORESHELL,"AZZY, EVILTOSS, NETUI, Sedreco",ADVSTORESHELL is a spying backdoor that has be...,MALWARE,https://securelist.com/sofacy-apt-hits-high-pr...,https://attack.mitre.org/software/S0045/,"'CVE-2017-0262', 'CVE-2017-0263'","'80.255.3.93', '10.30.0.47', '31.220.43.99', '...","'File-MD5--237e6dcbc6af50ef5f5211818522c463', ..."
14,AutoIt backdoor,,AutoIt backdoor is malware that has been used ...,MALWARE,https://www.forcepoint.com/sites/default/files...,https://attack.mitre.org/software/S0129/,"'CVE-2014-6352', 'CVE-2018-20250'","'192.119.15.42', '91.230.121.143', '8.26.21.22...",'File-SHA256--ae1d75a5f87421953372e79c081e4b0a...
15,Azorult,,Azorult is a commercial Trojan that is used to...,MALWARE,https://researchcenter.paloaltonetworks.com/20...,https://attack.mitre.org/software/S0344/,'CVE-2018-4878','205.185.121.209','File-SHA256--6071511eea15d5b1d9d8bf9803ad71b3...
16,BabyShark,,BabyShark is a Microsoft Visual Basic (VB) scr...,MALWARE,https://unit42.paloaltonetworks.com/new-babysh...,https://attack.mitre.org/software/S0414/,'CVE-2018-8174','173.248.170.149','File-SHA256--d50a0980da6297b8e4cec5db0a877363...
20,BADNEWS,,BADNEWS is malware that has been used by the a...,MALWARE,https://www.forcepoint.com/sites/default/files...,https://attack.mitre.org/software/S0128/,"'CVE-2015-2545', 'CVE-2017-0261'","'185.203.118.115', '192.168.217.141', '94.156....","'File-MD5--e3e7e71a0b28b5e96cc492e636722f73', ..."
27,BLACKCOFFEE,,BLACKCOFFEE is malware that has been used by s...,MALWARE,https://www2.fireeye.com/rs/fireye/images/APT1...,https://attack.mitre.org/software/S0069/,'CVE-2017-11882',,"'File-MD5--bd9e4c82bf12c4e7a58221fc52fed705', ..."
28,BlackEnergy,Black Energy,BlackEnergy is a malware toolkit that has been...,MALWARE,https://www.f-secure.com/documents/996508/1030...,https://attack.mitre.org/software/S0089/,'CVE-2014-4114',"'78.46.40.239', '212.124.110.62', '146.0.74.7'...","'File-MD5--8a7c30a7a105bd62ee71214d268865e3', ..."
50,China Chopper,,China Chopper is a Web Shell hosted on Web ser...,MALWARE,https://www.fireeye.com/blog/threat-research/2...,https://attack.mitre.org/software/S0020/,"'CVE-2017-0144', 'CVE-2017-11882', 'CVE-2019-0...","'10.0.0.0', '185.12.45.134'","'File-MD5--bd9e4c82bf12c4e7a58221fc52fed705', ..."
51,CHOPSTICK,"Backdoor.SofacyX, SPLM, Xagent, X-Agent, webhp",CHOPSTICK is a malware family of modular backd...,MALWARE,https://www.fireeye.com/content/dam/fireeye-ww...,https://attack.mitre.org/software/S0023/,"'CVE-2015-1701', 'CVE-2015-5119', 'CVE-2016-78...","'191.101.31.96', '89.187.150.44', '80.255.3.93...","'File-MD5--8c4d896957c36ec4abeb07b2802268b9', ..."
58,CORESHELL,"Sofacy, SOURFACE",CORESHELL is a downloader used by APT28. The o...,MALWARE,https://www.fireeye.com/content/dam/fireeye-ww...,https://attack.mitre.org/software/S0137/,"'CVE-2015-1701', 'CVE-2015-5119', 'CVE-2016-78...",,"'File-MD5--504182aaa5575bb38bf584839beb6d51', ..."


In [None]:
b=extract_indicator_and_vulnerability(groups,threat_actor_external_link_data)