# Cyber Security Tweets Preprocessing

In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
sns.set(rc={'figure.figsize':(6,8)}) 

import warnings
warnings.simplefilter("ignore")

%matplotlib inline 

from sklearn.preprocessing import LabelEncoder 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.metrics import SCORERS
import re

import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.probability import FreqDist

## Functions

In [2]:
def cleanTxt(txt):
    if not isinstance(txt, str):
        txt = str(txt)
    txt = re.sub(r'[^\w\s]','', txt) #remove everything except words and space
    txt = txt.lower()
    txt = re.sub(r'RT[\s]+', '', txt) #removing RT
    txt = re.sub(r'_','', txt) #how to remove underscore as well
    if 'https' in txt: # removing hyperlinks 
        pos = txt.find('https')
        txt = txt[:pos]
    return txt

In [3]:
wnet = WordNetLemmatizer()

def stopWords(txt): 
    txt_lst = txt.split(" ")
    txt_lst = [word for word in txt_lst if word not in stopwords.words('english')]
    txt_lst = [wnet.lemmatize(word) for word in txt_lst]
    txt_lst = [word for word in txt_lst if word != '']
    txt_lst = [word for word in txt_lst if word != '\r\n\r\n'] # indicates space and indexing
    txt_lst = [word for word in txt_lst if word != 'u0001f449'] # indicates back hand emoji pointing right
    txt_lst = [word for word in txt_lst if word != 'cybersecurity']
    txt_lst = [word for word in txt_lst if word != 'cyber']
    txt_lst = [word for word in txt_lst if word != 'security']
    return " ".join(txt_lst)

In [96]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
geolocator = Nominatim(user_agent = "geoapiExercises")
geocode = RateLimiter(geolocator.geocode, max_retries=2, swallow_exceptions=True, return_value_on_exception=None)
def findCountry(txt):
    if txt is np.nan:
        return 'Unknown'
    txt = re.sub(r'[^\w\s]','', txt) #remove everything except words and space
    if 'global' in txt.lower() or 'worldwide' in txt.lower(): # over 100 entries with global or worldwide as location
        return 'Global'
    txt_lst = txt.split(" ")
    for word in txt_lst:  # a few entries that return unknown when a number is present (regex doesn't remove it for some reason)
        if any(ch.isdigit() for ch in word):
            txt_lst.remove(word)
    txt = " ".join(txt_lst)
    try:
        x = geolocator.geocode(txt, )
        if x is None:
            return 'Unknown'
        loc = x.address.split(',')[-1]
        if loc[0] == " ":
            loc = loc[1:]
        return loc
    except:
        return 'Unknown'

In [119]:
re.sub(r'[^\w\s]','', 'Brussels, Belgique <U+0001F1E7><U+0001F1EA><U+...')
findCountry('BC Canada <U+0001F341>')

'Canada'

In [7]:
def splitData(data, len_of_frame):
    n = len(data.index) 
    split_n = round(n/len_of_frame, 0)
    print(split_n)
    splits = [int(x) for x in range(int(split_n))]
    lst = []
    for splt in splits:
        if splt == splits[-1]:
            lst.append(data[int(splt * len_of_frame):])
        else:
            lst.append(data[int(splt * len_of_frame):int(len_of_frame * (1 + splt))])
    return lst

## Data

In [8]:
data = pd.read_csv('cybersecurity_tweets.csv')

In [9]:
data = data.drop_duplicates('text')

In [10]:
data.shape

(17656, 90)

In [141]:
data.tail()

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url,day_post_created,year_account_created
17949,x14478142,x1403323185206992900,2021-06-11 12:08:14,pankajparikh,mahekmshah doubt even understand meaning cyber...,Twitter for Android,275,x1403268597913329664,x72792070,MahekMShah,...,2008-04-22 18:20:52,False,https://t.co/CNV9mneQ38,http://www.Flamboyz.com,,https://pbs.twimg.com/profile_banners/14478142...,http://abs.twimg.com/images/themes/theme9/bg.gif,http://pbs.twimg.com/profile_images/1141389628...,11,2008
17950,x2491363200,x1403323018592542724,2021-06-11 12:07:34,ProsperoEvents,klaus mochalski ceo rheboiiot explains energy ...,HubSpot,229,,,,...,2014-05-12 12:23:08,False,https://t.co/z0yAE7I3aW,https://www.prosperoevents.com/,,https://pbs.twimg.com/profile_banners/24913632...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/5677074432...,11,2014
17951,x928636804487622656,x1403323000762548224,2021-06-11 12:07:30,Dr_Alisherbaz,university gloucestershire interested computin...,Twitter for iPhone,278,,,,...,2017-11-09 14:54:02,False,https://t.co/3npWvhJfjm,https://www.glos.ac.uk/visit/pages/staff-conta...,,https://pbs.twimg.com/profile_banners/92863680...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1135857707...,11,2017
17952,x380464757,x1403322980302831620,2021-06-11 12:07:25,primekeyPKI,starting venture back headquarters vaccine goo...,Twitter Web App,126,,,,...,2011-09-26 17:59:41,False,https://t.co/2umI6kZFXp,https://www.primekey.com,,https://pbs.twimg.com/profile_banners/38046475...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/8575896155...,11,2011
17953,x176452458,x1403322975978459136,2021-06-11 12:07:24,WorkplacebyOS33,morley ivers blog look threat personal liabili...,HubSpot,280,,,,...,2010-08-09 15:41:44,False,https://t.co/giwOWea4HY,http://www.OS33.com,,https://pbs.twimg.com/profile_banners/17645245...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1207397056...,11,2010


In [12]:
len(data.user_id.unique()) # number of unique individuals

7635

In [147]:
data[data.user_id == 'Marky_Sparky_Twerpy_action']

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url,day_post_created,year_account_created


## Data Cleaning

### Text Data

In [13]:
%%time 
data['text'] = data['text'].apply(cleanTxt)
data['text'] = data['text'].apply(stopWords)
data[['text']].head()
# 44.5s

CPU times: user 35.7 s, sys: 8.67 s, total: 44.3 s
Wall time: 44.5 s


Unnamed: 0,text
0,sign cause solution network data breach cyber...
1,battling chinese big tech encroachment india ...
2,ask dr jeanette success way devil choke chain ...
3,threat digital pakistan cyberattack
4,pakistan attack news pakistan army claim major...


### Description Data

In [14]:
%%time
data['description'] = data['description'].apply(cleanTxt)
data['description'] = data['description'].apply(stopWords)
data[['description']].head()
 # 29.1s

CPU times: user 23.4 s, sys: 5.66 s, total: 29.1 s
Wall time: 29.1 s


Unnamed: 0,description
0,leader news hacker hacking techjobs nationalcy...
1,leader news hacker hacking techjobs nationalcy...
2,leader news hacker hacking techjobs nationalcy...
3,leader news hacker hacking techjobs nationalcy...
4,leader news hacker hacking techjobs nationalcy...


### Location to Country

In [15]:
# I made a function to split the data into smaller dataframes
# It isn't going to run faster, but at least we can run it in
# chunks

In [16]:
df1 = splitData(data, 1700)[0]
df2 = splitData(data, 1700)[1]
df3 = splitData(data, 1700)[2]
df4 = splitData(data, 1700)[3]
df5 = splitData(data, 1700)[4]
df6 = splitData(data, 1700)[5]
df7 = splitData(data, 1700)[6]
df8 = splitData(data, 1700)[7]
df9 = splitData(data, 1700)[8]
df10 = splitData(data, 1700)[9]
# the numbers just indicates how many data frames there are
# it gets printed everything the function is run

10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0


In [18]:
df4[['location']].iloc[25:].head()

Unnamed: 0,location
5296,Montréal (Québec)
5297,Montréal (Québec)
5298,"Arlington, VA"
5299,"Arlington, VA"
5300,127.0.0.1


In [19]:
df6.shape

(1700, 90)

In [20]:
%%time 
df1['country'] = df1['location'].apply(findCountry)
# 13min 5s

CPU times: user 4.5 s, sys: 480 ms, total: 4.98 s
Wall time: 13min 5s


In [21]:
%%time 
df2['country'] = df2['location'].apply(findCountry)
# 7min 34s

CPU times: user 2.6 s, sys: 280 ms, total: 2.88 s
Wall time: 7min 34s


In [22]:
%%time 
df3['country'] = df3['location'].apply(findCountry)
# 10min 55s

CPU times: user 3.82 s, sys: 408 ms, total: 4.23 s
Wall time: 10min 55s


In [99]:
%%time
df4['country'] = df4['location'].apply(findCountry)
# 11min 42s

CPU times: user 3.42 s, sys: 356 ms, total: 3.78 s
Wall time: 10min 44s


In [100]:
%%time
df5['country'] = df5['location'].apply(findCountry)
# 11min 5s

CPU times: user 3.63 s, sys: 367 ms, total: 4 s
Wall time: 11min 1s


In [101]:
%%time
df6['country'] = df6['location'].apply(findCountry)
# 10min 3s

CPU times: user 3.28 s, sys: 329 ms, total: 3.61 s
Wall time: 9min 45s


In [102]:
%%time
df7['country'] = df7['location'].apply(findCountry)
# 11min 50s

CPU times: user 3.62 s, sys: 368 ms, total: 3.99 s
Wall time: 10min 54s


In [103]:
%%time
df8['country'] = df8['location'].apply(findCountry)
# 12min 9s

CPU times: user 3.77 s, sys: 376 ms, total: 4.14 s
Wall time: 11min 23s


In [104]:
%%time
df9['country'] = df9['location'].apply(findCountry)
# 11min 1s

CPU times: user 3.81 s, sys: 380 ms, total: 4.19 s
Wall time: 11min 13s


In [105]:
%%time
df10['country'] = df10['location'].apply(findCountry)
# 16min 12s - there's more rows 

CPU times: user 5.62 s, sys: 551 ms, total: 6.17 s
Wall time: 16min 12s


In [32]:
# it runs faster when there are more NA values

In [106]:
data = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10])
data[['country']].head()

Unnamed: 0,country
0,United States
1,United States
2,United States
3,United States
4,United States


In [157]:
data[data.location.isna()].shape

(3905, 92)

In [117]:
pd.set_option('display.max_rows', None)

# issues
1. 'Richmond, VA' classified as canada
2. 'the ghetto' classifed as italy
3. 

### Date and Time Transformation

In [124]:
data[["created_at"]] = data[["created_at"]].apply(pd.to_datetime)
data['day_post_created'] = pd.DatetimeIndex(data["created_at"]).day
data['year_account_created'] = pd.DatetimeIndex(data["account_created_at"]).year
data['year_account_created'] = data["year_account_created"].astype("category")

In [125]:
data[['created_at', 'day_post_created', 'year_account_created']].head()

Unnamed: 0,created_at,day_post_created,year_account_created
0,2021-06-14 17:20:28,14,2010
1,2021-06-12 08:47:43,12,2010
2,2021-06-12 16:16:03,12,2010
3,2021-06-13 10:24:28,13,2010
4,2021-06-13 12:28:37,13,2010


### Saving Data Frame

In [None]:
data_processed = data.copy()

In [154]:
data_processed = data.copy()
data_processed.to_csv('cybersec_processed.csv', index = True)

In [143]:
data_processed.shape

(17656, 92)

##  Vectorizing Data

Vectorizing using Binary Term Frequency ([link](https://towardsdatascience.com/getting-started-with-text-vectorization-2f2efbec6685)). Binary Term Frequency captures presence (1) or absence (0) of term in document. For this part, under TfidfVectorizer, we set binary parameter equal to true so that it can show just presence (1) or absence (0) and norm parameter equal to false.



In [128]:
tv = TfidfVectorizer(
binary = True, norm = None,
use_idf = False, smooth_idf = False,
lowercase = True, stop_words = None,
min_df = 1, max_df = 1.0, max_features = None, ngram_range = (1,1))

### Text Data Vectorization

In [129]:
text_lst = []
for row in data['text']:
    text_lst.append(row)

text_lst[:5] # need to make into a list so the vectorizer will work

['sign cause solution network data breach \xa0cyberattack',
 'battling chinese big tech encroachment india \xa0cyberattack',
 'ask dr jeanette success way devil choke chain gonna quick gonna sure gonna final los angeles sentinel los angeles sentinel \xa0cyberattack',
 'threat digital pakistan \xa0cyberattack',
 'pakistan attack news pakistan army claim major attack indian intel world news \xa0cyberattack']

In [130]:
text_vec = pd.DataFrame(tv.fit_transform(text_lst).toarray(), columns = tv.get_feature_names())
text_vec.head()

Unnamed: 0,000,002002,0027,003,004,004002,006,00b328bc1cf5221fc6efb560c6bb764922c31e002179799da67f1f68a503ca41,00bytes,01,...,zta,ztna,zuckerbergstyle,zum,zumbullaw,zurich,zwatest,zwijberg,zyxel,zyz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
text_vec.to_csv('cybersec_text_vectorized.csv') 

### Description Data Vectorization

In [137]:
desc_lst = []
for row in data['description']:
    desc_lst.append(row)

desc_lst[:5] # need to make into a list so the vectorizer will work

['leader news hacker hacking techjobs nationalcybersecurity download app national 50',
 'leader news hacker hacking techjobs nationalcybersecurity download app national 50',
 'leader news hacker hacking techjobs nationalcybersecurity download app national 50',
 'leader news hacker hacking techjobs nationalcybersecurity download app national 50',
 'leader news hacker hacking techjobs nationalcybersecurity download app national 50']

In [138]:
desc_vec = pd.DataFrame(tv.fit_transform(desc_lst).toarray(), columns = tv.get_feature_names())
desc_vec.head()

Unnamed: 0,001,0099,01,01100101,01100111,01101000,01101111,01110010,01110100,0121,...,àrea,área,économique,énfasis,ético,ïgh,ïñ,ñuñoando,österreich,þlåïñ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
desc_vec.to_csv('cybersec_desc_vectorized.csv') 