# NLP Project

In our last exploring NLP notebook we built an email spam detector using Natural Language Processing techniques and the Support Vector Machine (SVM) algorithm for classification.
In this project, we will again build a spam detector but this time using URLs instead of emails.

In [3]:
!pip install -r ../requirements.txt

Collecting pandas
  Downloading pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m137.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy
  Downloading numpy-1.23.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m118.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting matplotlib
  Downloading matplotlib-3.5.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m159.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m157.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seaborn
  Downloading se

# Import

In [96]:
import pandas as pd
import numpy as np
import re

from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Load Data

In [29]:
#load data
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
df_raw = pd.read_csv(url)

In [6]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [7]:
df_raw.sample(10)

Unnamed: 0,url,is_spam
2009,https://www.npr.org/sections/health-shots/2020...,False
2005,https://www.politico.com/news/2020/06/29/sun-b...,False
2174,https://apnews.com/fcf3ec359401c7575f5402b4da9...,False
2729,https://apnews.com/8e2b3b7ca3bb2475b9ae901dad4...,False
1687,https://www.hvper.com/,True
2075,https://www.msn.com/en-us/news/us/the-black-of...,False
1529,https://www.cbsnews.com/news/bubba-wallace-nas...,False
1636,https://www.theatlantic.com/politics/archive/2...,False
1926,https://en.wikipedia.org/wiki/The_Lady%27s_Not...,False
1936,https://www.theskimm.com/picks/beach-products-...,True


In [8]:
#check the target
df_raw['is_spam'].value_counts()


False    2303
True      696
Name: is_spam, dtype: int64

In [30]:
# Check duplicates
print('Number of duplicated rows:',df_raw.duplicated().sum())  
df_raw = df_raw.drop_duplicates().reset_index(drop = True)
df_raw['is_spam'].value_counts()
#doubt: the data is more unbalanced 
#clean 452 spam, and only 178 not spam


Number of duplicated rows: 630


False    2125
True      244
Name: is_spam, dtype: int64

## PREPROCESS

In [79]:
#make a copy
df_interim = df_raw.copy()

In [80]:
def clean_data(urlData):
  
    #remove punctuation, digit, simbols
    urlData = re.sub('[^a-zA-Z]', ' ', urlData)
    
    #duplicate space
    urlData = re.sub(r'\s+', ' ',  urlData)
    #urlData=" ".join(urlData.split())

    urlData = re.sub(r'\b[a-zA-Z]\b', ' ',urlData)  #\b word boundary

    urlData = urlData.strip()   #remove space on right and left include tab
    return urlData

#print(df_interim['url'][2])
#clean_data(df_interim['url'][2])


In [81]:
#Texto a minúscula
df_raw['url'] = df_raw['url'].str.lower()
 
#clean-data
df_interim['url'] = df_interim['url'].apply(clean_data)
df_interim['url'].head(10)

0     https briefingday us list manage com unsubscribe
1                                  https www hvper com
2                                https briefingday com
3                https briefingday com     commentform
4                            https briefingday com fan
5    https www brookings edu interactives reopening...
6    https www reuters com investigates special rep...
7    https www theatlantic com magazine archive sup...
8    https www vox com john bolton book excerpts tr...
9    https www theguardian com travel jun end of to...
Name: url, dtype: object

In [82]:
#fuction to reove stopwords
stopWord = ['is','you','your','and', 'the', 'to', 'from', 'or', 'I', 'for', 'do', 'get', 'not', 'here', 'in', 'im', 'have', 'on',
're', 'https', 'com', 'of']  
#include https,  have http not secure urls, and other 
#include com, have other like edu

def remove_stopwords(urlData):
  if urlData is not None:
    words = urlData.strip().split()
    words_filtered = []
    for word in words:
      if word not in stopWord:
        words_filtered.append(word)
    result = " ".join(words_filtered) #hace un join elemento por elemento separados por espacio
  else:
      result = None
  return result

In [83]:
df_interim['url'] = df_interim['url'].apply(remove_stopwords)
df_interim.sample(10)

Unnamed: 0,url,is_spam
644,www youtube watch koaovbyfja,False
2109,vuoriclothing pages,False
2178,www npr org supreme court montana cant exclude...,False
415,www usatoday story news world huge circle anci...,False
1884,www politico news house democrats russia bount...,False
1344,www complex style kanye west taps mowalola ogu...,False
1747,www wsj articles amazon acquire self driving s...,False
2050,www theverge apple iphone power adapter charge...,False
380,www nytimes business cheese cheddar prices html,False
1927,news berkeley edu native amazonians americans ...,False


In [88]:
df_interim_values = df_interim['url'].str.split(expand=True).stack().value_counts()
print('Diferent values: ',df_interim_values.size)
df_interim_values[:60]
#Chequeamos si funciona

Diferent values: 5997


www            1512
html            296
news            274
us              248
coronavirus     172
org             146
article         131
morningbrew     105
story           105
nytimes         101
daily            99
stories          94
utm              90
youtube          89
trump            88
numlock          87
watch            86
new              76
world            68
substack         68
reuters          65
covid            63
index            61
briefingday      61
en               59
vox              59
cnn              58
iduskbn          58
articles         58
co               56
politics         56
cnbc             54
sunday           51
business         49
court            48
apnews           47
email            46
facebook         46
health           45
supreme          41
bbc              41
be               41
are              40
blog             40
medium           39
black            39
police           38
npr              38
digg             37
with             37


# MODEL

In [89]:
df = df_interim.copy()

In [90]:
X = df['url']
y = df['is_spam'] 

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [93]:
#Vectorizador
vec = CountVectorizer()

#create matrix
X_train = vec.fit_transform(X_train).toarray()
X_test = vec.transform(X_test).toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [94]:
#create the model using SVC
svclassifier = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svclassifier.fit(X_train, y_train)

In [95]:
#Predicted using test data
y_pred = svclassifier.predict(X_test)

In [97]:
#evalute using confusion matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[516  16]
 [ 23  38]]
              precision    recall  f1-score   support

       False       0.96      0.97      0.96       532
        True       0.70      0.62      0.66        61

    accuracy                           0.93       593
   macro avg       0.83      0.80      0.81       593
weighted avg       0.93      0.93      0.93       593



# GRIDSEARCHCV 

In [98]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1,1, 10], 'gamma': [1,0.1,0.01,0.001],'kernel': ['linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.8s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.7s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.7s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.8s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.9s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.7s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.8s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.7s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.8s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.7s
[CV] END ...................C=0.1, gamma=0.01, kernel=linear; total time=   1.7s
[CV] END ...................C=0.1, gamma=0.01, k

In [99]:

print(grid.best_estimator_)

SVC(C=1, gamma=1, kernel='linear')


In [100]:
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))#Output

[[516  16]
 [ 23  38]]
              precision    recall  f1-score   support

       False       0.96      0.97      0.96       532
        True       0.70      0.62      0.66        61

    accuracy                           0.93       593
   macro avg       0.83      0.80      0.81       593
weighted avg       0.93      0.93      0.93       593

