# Cyber Security Tweets Preprocessing

In [1]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
sns.set(rc={'figure.figsize':(6,8)}) 

import warnings
warnings.simplefilter("ignore")

%matplotlib inline 

from sklearn.preprocessing import LabelEncoder 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

from sklearn.metrics import SCORERS
import re

import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.probability import FreqDist

## Functions

In [43]:
def cleanTxt(txt):
    if not isinstance(txt, str):
        txt = str(txt)
    txt = re.sub(r'[^\w\s]','', txt) #remove everything except words and space
    txt = txt.lower()
    txt = re.sub(r'RT[\s]+', '', txt) #removing RT
    txt = re.sub(r'_','', txt) #how to remove underscore as well
    if 'https' in txt: # removing hyperlinks 
        pos = txt.find('https')
        txt = txt[:pos]
    return txt

In [3]:
wnet = WordNetLemmatizer()

def stopWords(txt): 
    txt_lst = txt.split(" ")
    txt_lst = [word for word in txt_lst if word not in stopwords.words('english')]
    txt_lst = [wnet.lemmatize(word) for word in txt_lst]
    txt_lst = [word for word in txt_lst if word != '']
    txt_lst = [word for word in txt_lst if word != '\r\n\r\n'] # indicates space and indexing
    txt_lst = [word for word in txt_lst if word != 'u0001f449'] # indicates back hand emoji pointing right
    txt_lst = [word for word in txt_lst if word != 'cybersecurity']
    txt_lst = [word for word in txt_lst if word != 'cyber']
    txt_lst = [word for word in txt_lst if word != 'security']
    return " ".join(txt_lst)

In [4]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "geoapiExercises")
def findCountry(txt):
    if txt is np.nan:
        return 'Unknown'
    x = geolocator.geocode(txt)
    if x is None:
        return 'Unknown'
    loc = x.address.split(',')[-1]
    if loc[0] == " ":
        loc = loc[1:]
    return loc

In [5]:
def splitData(data, len_of_frame):
    n = len(data.index) 
    split_n = round(n/len_of_frame, 0)
    print(split_n)
    splits = [int(x) for x in range(int(split_n))]
    lst = []
    for splt in splits:
        if splt == splits[-1]:
            lst.append(data[int(splt * len_of_frame):])
        else:
            lst.append(data[int(splt * len_of_frame):int(len_of_frame * (1 + splt))])
    return lst

## Data

In [21]:
data = pd.read_csv('cybersecurity_tweets.csv')

In [7]:
data = data.drop_duplicates('text')

In [8]:
data.shape

(17656, 90)

In [9]:
data.head()

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
0,x109082290,x1404488925935177737,2021-06-14 17:20:28,NcsVentures,"Signs, Causes and Solutions for Network Data B...",WordPress.com,109,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
1,x109082290,x1403635112613494786,2021-06-12 08:47:43,NcsVentures,Battling Chinese Big Tech encroachment in Indi...,WordPress.com,103,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
2,x109082290,x1403747937818120198,2021-06-12 16:16:03,NcsVentures,Ask Dr. Jeanette® Success On “The Way” Devil O...,WordPress.com,226,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
3,x109082290,x1404021845913899009,2021-06-13 10:24:28,NcsVentures,Cyber threats to Digital Pakistan | #cybersecu...,WordPress.com,89,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...
4,x109082290,x1404053089804554241,2021-06-13 12:28:37,NcsVentures,Pakistan Cyber Attack News: Pakistan army clai...,WordPress.com,152,,,,...,260725,0,2010-01-27 22:48:24,False,https://t.co/E0XpJWVD52,http://www.nationalcybersecurity.com/,,https://pbs.twimg.com/profile_banners/10908229...,http://abs.twimg.com/images/themes/theme15/bg.png,http://pbs.twimg.com/profile_images/715854922/...


In [181]:
len(data.user_id.unique()) # number of unique individuals

7635

## Data Cleaning

### Text Data

In [45]:
%%time
data['text'] = data['text'].apply(cleanTxt)
data['text'] = data['text'].apply(stopWords)
data[['text']].head()

CPU times: user 35.7 s, sys: 8.74 s, total: 44.4 s
Wall time: 44.7 s


Unnamed: 0,text
0,sign cause solution network data breach cyber...
1,battling chinese big tech encroachment india ...
2,ask dr jeanette success way devil choke chain ...
3,threat digital pakistan cyberattack
4,pakistan attack news pakistan army claim major...


### Description Data

In [46]:
%%time
data['description'] = data['description'].apply(cleanTxt)
data['description'] = data['description'].apply(stopWords)
data[['description']].head()

CPU times: user 17.4 s, sys: 4.2 s, total: 21.6 s
Wall time: 21.7 s


Unnamed: 0,description
0,leader news hacker hacking techjobs nationalcy...
1,leader news hacker hacking techjobs nationalcy...
2,leader news hacker hacking techjobs nationalcy...
3,leader news hacker hacking techjobs nationalcy...
4,leader news hacker hacking techjobs nationalcy...


### Location to Country

In [None]:
# I made a function to split the data into smaller dataframes
# It isn't going to run faster, but at least we can run it in
# chunks

In [48]:
df1 = splitData(data, 3000)[0]
df2 = splitData(data, 3000)[1]
df3 = splitData(data, 3000)[2]
df4 = splitData(data, 3000)[3]
df5 = splitData(data, 3000)[4]
df6 = splitData(data, 3000)[5]
# the numbers just indicates how many data frames there are
# it gets printed everything the function is run

6.0
6.0
6.0
6.0
6.0
6.0


In [51]:
df3.head()

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
6000,x3429955204,x1404362978254680064,2021-06-14 09:00:00,DrJDrooghaag,u0001f535 internet safety kid episode 1 update...,TweetDeck,206,,,,...,140586,272210,2015-08-18 13:57:19,False,https://t.co/QAZFyYjvdR,https://johannesdrooghaag.com,,https://pbs.twimg.com/profile_banners/34299552...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1381516661...
6001,x3429955204,x1404053437545795584,2021-06-13 12:30:00,DrJDrooghaag,u0001f535 let u conversation opportunity chall...,TweetDeck,201,,,,...,140586,272210,2015-08-18 13:57:19,False,https://t.co/QAZFyYjvdR,https://johannesdrooghaag.com,,https://pbs.twimg.com/profile_banners/34299552...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1381516661...
6002,x3429955204,x1404378401625055234,2021-06-14 10:01:17,DrJDrooghaag,dcallahan2 billmew guardian dezblanchfield rob...,Twitter Web App,104,x1404378115430825984,x304975520,dcallahan2,...,140586,272210,2015-08-18 13:57:19,False,https://t.co/QAZFyYjvdR,https://johannesdrooghaag.com,,https://pbs.twimg.com/profile_banners/34299552...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1381516661...
6003,x3429955204,x1404224215792955400,2021-06-13 23:48:36,DrJDrooghaag,awareness month tip 26 \r\nby drjdrooghaag,Revive Social App,127,,,,...,140586,272210,2015-08-18 13:57:19,False,https://t.co/QAZFyYjvdR,https://johannesdrooghaag.com,,https://pbs.twimg.com/profile_banners/34299552...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1381516661...
6004,x3429955204,x1404060993957761025,2021-06-13 13:00:01,DrJDrooghaag,u0001f535 soon crucial usp business b2b b2c ma...,TweetDeck,214,,,,...,140586,272210,2015-08-18 13:57:19,False,https://t.co/QAZFyYjvdR,https://johannesdrooghaag.com,,https://pbs.twimg.com/profile_banners/34299552...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1381516661...


In [52]:
df3.shape

(3000, 90)

In [62]:
%%time
df1['country'] = df1['location'].apply(findCountry)

CPU times: user 1.87 ms, sys: 2.41 ms, total: 4.28 ms
Wall time: 4.12 ms


Unnamed: 0,country
0,United States
1,United States
2,United States
3,United States
4,United States
...,...
2995,United States
2996,United States
2997,United States
2998,United States


In [68]:
%%time
df2['country'] = df2['location'].apply(findCountry)

GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=127.0.0.1&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

In [64]:
%%time
#df3['country'] = df3['location'].apply(findCountry)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


In [65]:
%%time
#df4['country'] = df4['location'].apply(findCountry)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs


In [66]:
%%time
#df5['country'] = df5['location'].apply(findCountry)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs


In [67]:
%%time
#df6['country'] = df6['location'].apply(findCountry)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs


### Date and Time Transformation

In [None]:
data[["created_at"]] = data[["created_at"]].apply(pd.to_datetime)
data['day_post_created'] = pd.DatetimeIndex(ident["created_at"]).day
data['year_account_created'] = pd.DatetimeIndex(ident["account_created_at"]).year
data['year_account_created'] = ident["year_account_created"].astype("category")

In [None]:
data[['created_at', 'day_post_created', 'year_account_created']].head()

### Saving Data Frame

In [None]:
data.to_csv('')

##  Vectorizing Data

Vectorizing using Binary Term Frequency ([link](https://towardsdatascience.com/getting-started-with-text-vectorization-2f2efbec6685)). Binary Term Frequency captures presence (1) or absence (0) of term in document. For this part, under TfidfVectorizer, we set binary parameter equal to true so that it can show just presence (1) or absence (0) and norm parameter equal to false.



In [None]:
tv = TfidfVectorizer(
binary = True, norm = None,
use_idf = False, smooth_idf = False,
lowercase = True, stop_words = None,
min_df = 1, max_df = 1.0, max_features = None, ngram_range = (1,1))

### Text Data Vectorization

In [127]:
text_lst = []
for row in data['text']:
    text_lst.append(row)

text_lst[:5] # need to make into a list so the vectorizer will work

['sign cause solution network data breach \xa0cyberattack',
 'battling chinese big tech encroachment india \xa0cyberattack',
 'ask dr jeanette success way devil choke chain gonna quick gonna sure gonna final los angeles sentinel los angeles sentinel \xa0cyberattack',
 'threat digital pakistan \xa0cyberattack',
 'pakistan attack news pakistan army claim major attack indian intel world news \xa0cyberattack']

In [129]:
text_vec = pd.DataFrame(tv.fit_transform(text_lst).toarray(), columns = tv.get_feature_names())
text_vec.head()

Unnamed: 0,000,002002,0027,003,004,004002,006,00b328bc1cf5221fc6efb560c6bb764922c31e002179799da67f1f68a503ca41,00bytes,01,...,zta,ztna,zuckerbergstyle,zum,zumbullaw,zurich,zwatest,zwijberg,zyxel,zyz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
text_vec.to_csv('') # i forgot how to do this lol`

### Description Data Vectorization

In [None]:
desc_lst = []
for row in data['description']:
    desc_lst.append(row)

desc_lst[:5] # need to make into a list so the vectorizer will work

In [None]:
desc_vec = pd.DataFrame(tv.fit_transform(desc_lst).toarray(), columns = tv.get_feature_names())
desc_vec.head()

In [None]:
desc_vec.to_csv('') 