# NLP Project

In our last exploring NLP notebook we built an email spam detector using Natural Language Processing techniques and the Support Vector Machine (SVM) algorithm for classification.
In this project, we will again build a spam detector but this time using URLs instead of emails.

In [None]:
!pip install pandas
!pip install nltk
!pip install sklearn

In [8]:
import pandas as pd
import numpy as np
import nltk #text processing
import re
import unicodedata
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /home/gitpod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
df_raw = pd.read_csv(url)

In [10]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [11]:
df_raw.sample(10)

Unnamed: 0,url,is_spam
1775,https://www.theskimm.com/,True
2877,https://skeptics.stackexchange.com/questions/2...,False
2936,https://www.coronavirus.gov/,True
2987,https://thehustle.co/07012020-Corporate-swag/,False
67,https://www.theguardian.com/film/2020/jun/19/w...,False
333,https://www.morningbrew.com/retail/,True
1524,https://www.youtube.com/watch?v=Yz3mQhuMACs&fe...,False
243,https://www.politico.com/news/2020/06/21/takea...,False
2546,https://playbill.com/article/broadway-will-off...,False
1371,https://www.morningbrew.com/emerging-tech/,True


In [13]:
df_raw['is_spam'].value_counts()


False    2303
True      696
Name: is_spam, dtype: int64

In [15]:
df = df_raw.copy()

## PREPROCESS

In [37]:
domain= df['url'].apply(lambda x :re.findall('://w?w?w?\.?([\w\-\.]+)', x)[0])
proto = df['url'].apply(lambda x :re.findall('(\w+)://', x)[0])

In [38]:
domain.value_counts()

morningbrew.com               194
nytimes.com                   112
youtube.com                    98
reuters.com                    67
cnn.com                        66
                             ... 
hirshhorn.si.edu                1
bonappetit.com                  1
framestrategy.co                1
framestrategy.substack.com      1
smartcitiesworld.net            1
Name: url, Length: 764, dtype: int64

In [39]:
proto.value_counts()
#only two value so, use a binary column

https    2945
http       54
Name: url, dtype: int64

In [40]:
df['len_url'] = df['url'].apply(lambda x : len(x))
df['contains_subscribe'] = df['url'].apply(lambda x : 1 if "subscribe" in x else 0)
df['contains_hash'] = df['url'].apply(lambda x : 1 if "#" in x else 0)
df['num_digits'] = df['url'].apply(lambda x : len("".join(_ for _ in x if _.isdigit())) )
df['non_https'] = df['url'].apply(lambda x : 1 if "https" in x else 0) 
df['num_words'] = df['url'].apply(lambda x : len(x.split("/")))
df['domain'] = df['url'].apply(lambda x :re.findall('://w?w?w?\.?([\w\-\.]+)', x)[0]) 
df["is_spam"] = df["is_spam"].astype(int)
df.sample(10)

Unnamed: 0,url,is_spam,len_url,contains_subscribe,contains_hash,num_digits,non_https,num_words,domain,proto
590,https://www.hollywoodreporter.com/live-feed/ji...,0,117,0,0,7,1,5,hollywoodreporter.com,https
1448,https://www.eventbrite.com/e/big-friendship-bo...,1,76,0,0,12,1,5,eventbrite.com,https
2750,https://www.nytimes.com/2020/06/29/science/fly...,0,69,0,0,8,1,8,nytimes.com,https
790,https://ew.com/movies/scott-pilgrim-vs-the-wor...,0,62,0,0,0,1,6,ew.com,https
30,https://link.morningbrew.com/manage/5z8/oc,1,42,0,0,2,1,6,link.morningbrew.com,https
1317,https://www.caltech.edu/about/news/where-are-m...,0,96,0,0,0,1,6,caltech.edu,https
2244,https://www.washingtonpost.com/national-securi...,0,222,0,0,24,1,9,washingtonpost.com,https
2737,https://www.theskimm.com/general/disclaimers-2...,1,72,0,0,5,1,5,theskimm.com,https
1116,https://www.nytimes.com/2020/06/12/arts/music/...,0,81,0,0,8,1,9,nytimes.com,https
574,https://www.theguardian.com/environment/galler...,0,127,0,0,6,1,9,theguardian.com,https


In [28]:
df['domain'].value_counts()


morningbrew.com               194
nytimes.com                   112
youtube.com                    98
reuters.com                    67
cnn.com                        66
                             ... 
hirshhorn.si.edu                1
bonappetit.com                  1
framestrategy.co                1
framestrategy.substack.com      1
smartcitiesworld.net            1
Name: domain, Length: 764, dtype: int64

In [36]:
df['proto'].value_counts()

https    2945
http       54
Name: proto, dtype: int64