In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import collections

from nltk.tokenize import word_tokenize
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
#nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.probability import FreqDist
nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics

%matplotlib inline

[nltk_data] Downloading package wordnet to c:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to c:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     c:\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
dataset = pd.read_csv('CyberCrimeArticles-3.csv')
dataset.head()

Unnamed: 0,Image_Description,Link,Article_Title,Article_URL,Article_Text
0,Rapid delivery's rapid rise has led to tension...,https://static.euronews.com/articles/stories/0...,Dutch cities temporarily banned 10-minute deli...,https://www.euronews.com/next/2022/02/24/dutch...,"['Noise, reckless cycling and blacked-out wind..."
1,Start-up conference Step in Dubai Internet City.,https://static.euronews.com/articles/stories/0...,How Dubai Internet City is becoming a hub for ...,https://www.euronews.com/next/2022/02/23/how-d...,"['Khazna, one of the largest data centre infra..."
2,Meta CEO Mark Zuckerberg said the company's co...,https://static.euronews.com/articles/stories/0...,"Ahoy, Metamates! Meta continues rebranding eff...",https://www.euronews.com/next/2022/02/16/ahoy-...,"[""Employees at Facebook's parent company Meta ..."
3,Carolyn Kaster/AP Photo,https://static.euronews.com/articles/stories/0...,"Ahoy, Metamates! Meta continues rebranding eff...",https://www.euronews.com/next/2022/02/16/ahoy-...,"[""Employees at Facebook's parent company Meta ..."
4,Choosing a green pension could be better for t...,https://static.euronews.com/articles/stories/0...,Green pensions could be the ‘most powerful wea...,https://www.euronews.com/green/2022/02/15/gree...,['Switching to a sustainable pension could be ...


In [5]:
dataset["Article_URL"][0]

'https://www.euronews.com/next/2022/02/24/dutch-cities-temporarily-banned-10-minute-delivery-dark-stores-then-one-company-threatened'

In [6]:
dataset["Article_Text"][0]

'[\'Noise, reckless cycling and blacked-out windows are some of the reasons rapid delivery services like Gorillas, Getir, Flink and Zapp have clashed with residents in cities across the Netherlands.\', \'Now, one rapid delivery service has accused Dutch city officials of breaking EU law after they imposed a year-long ban on new "dark stores" - the virtual supermarket warehouses that allow high-speed delivery of groceries and other household goods.\', \'Last month, city councillors in Amsterdam and Rotterdam voted to freeze development of dark stores following complaints prompted by the speed of growth of the rapid delivery services during the pandemic.\', \'Now, other Dutch cities are set to follow their example, with moves to restrict the delivery apps underway in 23 other municipalities, according to the Dutch newspaper NRC.\', \'Berlin-based Flink told Euronews Next it was "evaluating all options, including legal action" in response to the freeze.\', \'Amsterdam and Rotterdam\\\'s "

In [7]:
def lemmatizeList( li ):
    lemmatizer = WordNetLemmatizer()
    res = []
    for theWord, theTag in nltk.pos_tag(li):
        if theTag.startswith("VB"):
            res.append( lemmatizer.lemmatize(theWord, pos = 'v') )
        elif theTag.startswith("NN"):
            res.append( lemmatizer.lemmatize(theWord, pos = 'n') )
        elif theTag.startswith("RB"):
            res.append( lemmatizer.lemmatize(theWord, pos = 'r') )
        elif theTag.startswith("JJ"):
            res.append( lemmatizer.lemmatize(theWord, pos = 'a') )
    return res
        

In [33]:
def getNouns( sent ): 
    sent = [word for word in word_tokenize(sent.lower())]
    res = []
    for theWord, theTag in nltk.pos_tag(sent):
        if theTag.startswith("NN"):
            res.append(theWord)
    return " ".join(res)

In [45]:
stop_words = set(stopwords.words('english'))
sent = dataset.loc[0,'Article_Text']

# Use a regex to catch all single letter words, html tags or other non-alphanumeric characters then substitute them with a space
cleaned = re.sub(r"(?:(?:^|\s+)[a-zA-Z]\s+)|<.*?>|\W", ' ', sent)

# Tokenize the sentence using word_tokenize method
tokenized_sent = [word for word in word_tokenize(cleaned.lower())]

# lemmatize sentence then append to the list containing lemmatized reviews without punctuation 
lemmatized_words = lemmatizeList( tokenized_sent )

# remove stopwords then append to the list containing lemmatized reviews without punctuation or stopwords
lemmaNoStop = [word for word in lemmatized_words if word not in stop_words]
tem = [" ".join(lemmaNoStop) ]
#tem[0] = getNouns(tem[0])
print(tem)

['noise reckless cycling black window reason rapid delivery service gorilla getir flink zapp clash resident city netherlands rapid delivery service accuse dutch city official break eu law impose year long ban new dark store virtual supermarket warehouse allow high speed delivery grocery household good last month city councillor amsterdam rotterdam vote freeze development dark store follow complaint prompt speed growth rapid delivery service pandemic dutch city set follow example move restrict delivery apps underway municipality accord dutch newspaper nrc berlin base flink tell euronews evaluate option include legal action response freeze amsterdam rotterdam restriction freedom service meet condition non discrimination necessity proportionality opinion company say year ago rapid delivery service netherlands german startup gorilla enter dutch market december follow competitor getir flink zapp company compete deliver food item customer minute balloon number rapid delivery customer early a

In [46]:
tfidf_vectorizer = TfidfVectorizer(min_df = 0, max_df = 1, ngram_range=(1,3))

vect = tfidf_vectorizer.fit_transform(tem)

In [47]:
feature_array = np.array(tfidf_vectorizer.get_feature_names())
feature_array

array(['able', 'able run', 'able run pop', ..., 'zone plan regulate',
       'zone retail', 'zone retail us'], dtype='<U40')

In [48]:
tfidf_sorting = np.argsort(vect.toarray()).flatten()[::-1]

n = 15
top_n = feature_array[tfidf_sorting][:n]
top_n

array(['store', 'dark', 'dark store', 'say', 'delivery', 'rotterdam',
       'freeze', 'rapid', 'rapid delivery', 'new', 'municipality', 'shop',
       'service', 'company', 'city'], dtype='<U40')

In [50]:
getNouns(" ".join(top_n))

'store dark store delivery rotterdam freeze delivery municipality shop service company city'

In [53]:
topRel = []

# initializing a set containing english stopwords
stop_words = set(stopwords.words('english'))


for i in range(0, len(dataset['Article_Text'])):
    sent = dataset.loc[i,'Article_Text']
    
    # Use a regex to catch all single letter words, html tags or other non-alphanumeric characters then substitute them with a space
    cleaned = re.sub(r"(?:(?:^|\s+)[a-zA-Z]\s+)|<.*?>|\W", ' ', sent)
    
    # Tokenize the sentence using word_tokenize method
    tokenized_sent = [word for word in word_tokenize(cleaned.lower())]

    # lemmatize sentence then append to the list containing lemmatized reviews without punctuation 
    lemmatized_words = lemmatizeList( tokenized_sent )

    # remove stopwords then append to the list containing lemmatized reviews without punctuation or stopwords
    lemmaNoStop = [word for word in lemmatized_words if word not in stop_words]
    tem = [ " ".join(lemmaNoStop)]
    #cleanedData.append( tem[0])
    tfidf_vectorizer = TfidfVectorizer(min_df = 0, max_df = 1, ngram_range=(1,3))
    if(len(tem[0].strip())):
        vect = tfidf_vectorizer.fit_transform(tem)
        feature_array = np.array(tfidf_vectorizer.get_feature_names())
        tfidf_sorting = np.argsort(vect.toarray()).flatten()[::-1]

        n = 15
        top_n = feature_array[tfidf_sorting][:n]
        top_n = [w for w in word_tokenize(getNouns(" ".join(top_n)))]
        top_n = set(top_n)
        topRel.append(list(top_n))
    else:
        topRel.append("N/A")

In [54]:
topRel

[['company',
  'shop',
  'freeze',
  'rotterdam',
  'dark',
  'store',
  'delivery',
  'municipality',
  'service',
  'city'],
 ['khazna',
  'centre',
  'internet',
  'data',
  'increase',
  'dubai',
  'support',
  'strategy',
  'city'],
 ['employee',
  'company',
  'privacy',
  'use',
  'meta',
  'tuesday',
  'facebook',
  'zuckerberg',
  'lawsuit',
  'value'],
 ['employee',
  'company',
  'privacy',
  'use',
  'meta',
  'tuesday',
  'facebook',
  'zuckerberg',
  'lawsuit',
  'value'],
 ['company', 'invest', 'use', 'matter', 'pension', 'money', 'study', 'carbon'],
 ['company',
  'shop',
  'freeze',
  'rotterdam',
  'dark',
  'store',
  'delivery',
  'municipality',
  'service',
  'city'],
 ['ikea', 'release', 'streetwear', 'post', 'share', 'greggs'],
 ['ikea', 'release', 'streetwear', 'post', 'share', 'greggs'],
 ['woman',
  'entrepreneur',
  'founder',
  'fashion',
  'male',
  'business',
  'support',
  'year'],
 ['woman',
  'entrepreneur',
  'founder',
  'fashion',
  'male',
  'busi

In [60]:
import csv

with open('CyberCrimeArticles-3.csv','r',encoding='utf-8') as csvinput:
    with open('week5.csv', 'w',encoding='utf-8') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        reader = csv.reader(csvinput)

        everything = []
        row = next(reader)
        row.append('Keywords')
        everything.append(row)

        for row, el in zip(reader, topRel):
            row.append(el)
            everything.append(row)

        writer.writerows(everything)
