# Cyber Security Tweets Preprocessing

In [4]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
sns.set(rc={'figure.figsize':(6,8)}) 

import warnings
warnings.simplefilter("ignore")

%matplotlib inline 

from sklearn.preprocessing import LabelEncoder 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.metrics import SCORERS
import re

import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.probability import FreqDist

## Functions

In [5]:
def cleanTxt(txt):
    if not isinstance(txt, str):
        txt = str(txt)
    txt = re.sub(r'[^\w\s]','', txt) #remove everything except words and space
    txt = txt.lower()
    txt = re.sub(r'RT[\s]+', '', txt) #removing RT
    txt = re.sub(r'_','', txt) #how to remove underscore as well
    if 'https' in txt: # removing hyperlinks 
        pos = txt.find('https')
        txt = txt[:pos]
    return txt

In [6]:
wnet = WordNetLemmatizer()

def stopWords(txt): 
    txt_lst = txt.split(" ")
    txt_lst = [word for word in txt_lst if word not in stopwords.words('english')]
    txt_lst = [wnet.lemmatize(word) for word in txt_lst]
    txt_lst = [word for word in txt_lst if word != '']
    txt_lst = [word for word in txt_lst if word != '\r\n\r\n'] # indicates space and indexing
    txt_lst = [word for word in txt_lst if word != 'u0001f449'] # indicates back hand emoji pointing right
    txt_lst = [word for word in txt_lst if word != 'cybersecurity']
    txt_lst = [word for word in txt_lst if word != 'cyber']
    txt_lst = [word for word in txt_lst if word != 'security']
    return " ".join(txt_lst)

In [52]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
geolocator = Nominatim(user_agent = "geoapiExercises")
geocode = RateLimiter(geolocator.geocode, max_retries=10000, swallow_exceptions=True, return_value_on_exception=None, min_delay_seconds=1)
def findCountry(txt):
    if txt is np.nan:
        return 'Unknown'
    if txt.lower() == 'global':
        return 'Global'
    try:
        x = geolocator.geocode(txt, )
        if x is None:
            return 'Unknown'
        loc = x.address.split(',')[-1]
        if loc[0] == " ":
            loc = loc[1:]
        return loc
    except:
        return 'Unknown'

In [8]:
def splitData(data, len_of_frame):
    n = len(data.index) 
    split_n = round(n/len_of_frame, 0)
    print(split_n)
    splits = [int(x) for x in range(int(split_n))]
    lst = []
    for splt in splits:
        if splt == splits[-1]:
            lst.append(data[int(splt * len_of_frame):])
        else:
            lst.append(data[int(splt * len_of_frame):int(len_of_frame * (1 + splt))])
    return lst

## Data

In [9]:
data = pd.read_csv('cybersecurity_tweets.csv')

In [10]:
data = data.drop_duplicates('text')

In [11]:
data.shape

(17656, 90)

In [12]:
data.head()

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
0,x109082290,x1404488925935177737,2021-06-14 17:20:28,NcsVentures,"Signs, Causes and Solutions for Network Data B...",WordPress.com,109,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
1,x109082290,x1403635112613494786,2021-06-12 08:47:43,NcsVentures,Battling Chinese Big Tech encroachment in Indi...,WordPress.com,103,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
2,x109082290,x1403747937818120198,2021-06-12 16:16:03,NcsVentures,Ask Dr. Jeanette® Success On “The Way” Devil O...,WordPress.com,226,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
3,x109082290,x1404021845913899009,2021-06-13 10:24:28,NcsVentures,Cyber threats to Digital Pakistan | #cybersecu...,WordPress.com,89,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
4,x109082290,x1404053089804554241,2021-06-13 12:28:37,NcsVentures,Pakistan Cyber Attack News: Pakistan army clai...,WordPress.com,152,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...


In [13]:
len(data.user_id.unique()) # number of unique individuals

7635

## Data Cleaning

### Text Data

In [16]:
%%time 
data['text'] = data['text'].apply(cleanTxt)
data['text'] = data['text'].apply(stopWords)
data[['text']].head()
# 46.2s

CPU times: user 34.6 s, sys: 8.16 s, total: 42.7 s
Wall time: 42.8 s


Unnamed: 0,text
0,sign cause solution network data breach cyber...
1,battling chinese big tech encroachment india ...
2,ask dr jeanette success way devil choke chain ...
3,threat digital pakistan cyberattack
4,pakistan attack news pakistan army claim major...


### Description Data

In [17]:
%%time
data['description'] = data['description'].apply(cleanTxt)
data['description'] = data['description'].apply(stopWords)
data[['description']].head()
 # 32.4s

CPU times: user 23.7 s, sys: 5.67 s, total: 29.3 s
Wall time: 29.4 s


Unnamed: 0,description
0,leader news hacker hacking techjobs nationalcy...
1,leader news hacker hacking techjobs nationalcy...
2,leader news hacker hacking techjobs nationalcy...
3,leader news hacker hacking techjobs nationalcy...
4,leader news hacker hacking techjobs nationalcy...


### Location to Country

In [18]:
# I made a function to split the data into smaller dataframes
# It isn't going to run faster, but at least we can run it in
# chunks

In [33]:
df1 = splitData(data, 1700)[0]
df2 = splitData(data, 1700)[1]
df3 = splitData(data, 1700)[2]
df4 = splitData(data, 1700)[3]
df5 = splitData(data, 1700)[4]
df6 = splitData(data, 1700)[5]
df7 = splitData(data, 1700)[6]
df8 = splitData(data, 1700)[7]
df9 = splitData(data, 1700)[8]
df10 = splitData(data, 1700)[9]
# the numbers just indicates how many data frames there are
# it gets printed everything the function is run

10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0
10.0


In [34]:
df4[['location']].iloc[25:]

Unnamed: 0,location
5296,Montréal (Québec)
5297,Montréal (Québec)
5298,"Arlington, VA"
5299,"Arlington, VA"
5300,127.0.0.1
...,...
7001,"England, United Kingdom"
7002,"Gulf Breeze, Florida"
7003,"Gulf Breeze, Florida"
7004,"Gulf Breeze, Florida"


In [35]:
df6.shape

(1700, 90)

In [54]:
%%time 
df1['country'] = df1['location'].apply(findCountry)
# 13min 6s

CPU times: user 1.65 s, sys: 517 ms, total: 2.16 s
Wall time: 3.05 s


In [55]:
df1[['country']]

Unnamed: 0,country
0,Unknown
1,Unknown
2,Unknown
3,Unknown
4,Unknown
...,...
1695,Unknown
1696,Unknown
1697,Unknown
1698,Unknown


In [53]:
%%time 
df2['country'] = df2['location'].apply(findCountry)
# 7min 35s

CPU times: user 927 ms, sys: 297 ms, total: 1.22 s
Wall time: 1.72 s


In [38]:
%%time 
df3['country'] = df3['location'].apply(findCountry)
# 10min 45s

CPU times: user 1.47 s, sys: 455 ms, total: 1.92 s
Wall time: 2.76 s


In [39]:
%%time
df4['country'] = df4['location'].apply(findCountry)
# 11min 41s

CPU times: user 1.41 s, sys: 447 ms, total: 1.86 s
Wall time: 2.62 s


In [40]:
%%time
df5['country'] = df5['location'].apply(findCountry)
# 11min 5s

CPU times: user 1.44 s, sys: 430 ms, total: 1.87 s
Wall time: 2.65 s


In [41]:
%%time
df6['country'] = df6['location'].apply(findCountry)
# 10min 2s

CPU times: user 1.21 s, sys: 389 ms, total: 1.6 s
Wall time: 2.24 s


In [42]:
%%time
df7['country'] = df7['location'].apply(findCountry)
# 11min 52s

CPU times: user 1.37 s, sys: 433 ms, total: 1.8 s
Wall time: 2.53 s


In [43]:
%%time
df8['country'] = df8['location'].apply(findCountry)
# 12min 9s

CPU times: user 1.39 s, sys: 434 ms, total: 1.82 s
Wall time: 2.57 s


In [44]:
%%time
df9['country'] = df9['location'].apply(findCountry)

CPU times: user 1.34 s, sys: 442 ms, total: 1.78 s
Wall time: 2.48 s


In [45]:
%%time
df10['country'] = df10['location'].apply(findCountry)

CPU times: user 1.92 s, sys: 612 ms, total: 2.53 s
Wall time: 3.55 s


In [46]:
df10[['country']].tail()

Unnamed: 0,country
17949,Unknown
17950,Unknown
17951,Unknown
17952,Unknown
17953,Unknown


### Date and Time Transformation

In [None]:
data[["created_at"]] = data[["created_at"]].apply(pd.to_datetime)
data['day_post_created'] = pd.DatetimeIndex(ident["created_at"]).day
data['year_account_created'] = pd.DatetimeIndex(ident["account_created_at"]).year
data['year_account_created'] = ident["year_account_created"].astype("category")

In [None]:
data[['created_at', 'day_post_created', 'year_account_created']].head()

### Saving Data Frame

In [None]:
data.to_csv('')

##  Vectorizing Data

Vectorizing using Binary Term Frequency ([link](https://towardsdatascience.com/getting-started-with-text-vectorization-2f2efbec6685)). Binary Term Frequency captures presence (1) or absence (0) of term in document. For this part, under TfidfVectorizer, we set binary parameter equal to true so that it can show just presence (1) or absence (0) and norm parameter equal to false.



In [None]:
tv = TfidfVectorizer(
binary = True, norm = None,
use_idf = False, smooth_idf = False,
lowercase = True, stop_words = None,
min_df = 1, max_df = 1.0, max_features = None, ngram_range = (1,1))

### Text Data Vectorization

In [127]:
text_lst = []
for row in data['text']:
    text_lst.append(row)

text_lst[:5] # need to make into a list so the vectorizer will work

['sign cause solution network data breach \xa0cyberattack',
 'battling chinese big tech encroachment india \xa0cyberattack',
 'ask dr jeanette success way devil choke chain gonna quick gonna sure gonna final los angeles sentinel los angeles sentinel \xa0cyberattack',
 'threat digital pakistan \xa0cyberattack',
 'pakistan attack news pakistan army claim major attack indian intel world news \xa0cyberattack']

In [129]:
text_vec = pd.DataFrame(tv.fit_transform(text_lst).toarray(), columns = tv.get_feature_names())
text_vec.head()

Unnamed: 0,000,002002,0027,003,004,004002,006,00b328bc1cf5221fc6efb560c6bb764922c31e002179799da67f1f68a503ca41,00bytes,01,...,zta,ztna,zuckerbergstyle,zum,zumbullaw,zurich,zwatest,zwijberg,zyxel,zyz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
text_vec.to_csv('') # i forgot how to do this lol`

### Description Data Vectorization

In [None]:
desc_lst = []
for row in data['description']:
    desc_lst.append(row)

desc_lst[:5] # need to make into a list so the vectorizer will work

In [None]:
desc_vec = pd.DataFrame(tv.fit_transform(desc_lst).toarray(), columns = tv.get_feature_names())
desc_vec.head()

In [None]:
desc_vec.to_csv('') 