# Explore here

It's recommended to use this notebook for exploration purposes.

For example: 

1. You could import the CSV generated by python into your notebook and explore it.
2. You could connect to your database using `pandas.read_sql` from this notebook and explore it.

In [1]:
!pip install -r "../requirements.txt"

Collecting sqlalchemy==1.4.37
  Downloading SQLAlchemy-1.4.37-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting pymysql==1.0.2
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.4.2
  Downloading pandas-1.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m144.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hCollecting python-dotenv==0.20.0
  Downloading python_dotenv-0.20.0-py3-none-any.whl (17 kB)
Collecting psycopg2-binary==2.9.3
  Downloading psycopg2_binary-

In [3]:
import re
import pandas as pd
import pickle
import numpy as np
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report


In [4]:
df_raw = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv')
df_raw

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
...,...,...
2994,https://www.smartcitiesworld.net/news/news/dee...,False
2995,https://www.youtube.com/watch,True
2996,https://techcrunch.com/2019/07/04/an-optimisti...,False
2997,https://www.technologyreview.com/2019/12/20/13...,False


In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [6]:
df_raw.describe()

Unnamed: 0,url,is_spam
count,2999,2999
unique,2369,2
top,https://www.bloomberg.com/tosv2.html,False
freq,26,2303


In [7]:
df_raw.sample(20)

Unnamed: 0,url,is_spam
1734,https://www.newyorker.com/news/letter-from-sil...,False
2604,https://www.morningbrew.com/daily/stories/,True
952,https://thehill.com/homenews/senate/504303-dem...,False
549,https://www.bloomberg.com/tosv2.html,True
1373,https://docs.google.com/forms/d/e/1FAIpQLScC99...,True
2336,https://apnews.com/269b3de1af34e17c1941a514f78...,False
2592,https://www.amsterdam.nl/kunst-cultuur/atelier...,False
1202,https://npe.fb.com/2020/06/23/forecast-a-commu...,False
2660,https://www.morningbrew.com/account/qUyyLJmF4f...,True
218,https://pudding.cool/2020/05/travel-local/,False


In [9]:
# invalanced
df_raw['is_spam'].value_counts()

False    2303
True      696
Name: is_spam, dtype: int64

In [10]:
df_interin = df_raw.copy()
print(f'Duplicated rows: {df_interin.duplicated().sum()}')

Duplicated rows: 630


In [11]:
df_interin = df_interin.drop_duplicates().reset_index(drop=True)

In [12]:
df_interin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2369 entries, 0 to 2368
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2369 non-null   object
 1   is_spam  2369 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 20.9+ KB


In [14]:
# continue invalanced?
df_interin['is_spam'].value_counts()

False    2125
True      244
Name: is_spam, dtype: int64

In [15]:
# functions to clean the text
def comas(text):
    """
    Elimina comas del texto
    """
    return re.sub(',', ' ', text)

def espacios(text):
    """
    Elimina enters dobles por un solo enter
    """
    return re.sub(r'(\n{2,})','\n', text)

def minuscula(text):
    """
    Cambia mayusculas a minusculas
    """
    return text.lower()

def numeros(text):
    """
    Sustituye los numeros
    """
    return re.sub('([\d]+)', ' ', text)

def caracteres_no_alfanumericos(text):
    """
    Sustituye caracteres raros, no digitos y letras
    Ej. hola 'pepito' como le va? -> hola pepito como le va
    """
    return re.sub("(\\W)+"," ",text)

def comillas(text):
    """
    Sustituye comillas por un espacio
    Ej. hola 'pepito' como le va? -> hola pepito como le va?
    """
    return re.sub("'"," ", text)

def palabras_repetidas(text):
    """
    Sustituye palabras repetidas

    Ej. hola hola, como les va? a a ustedes -> hola, como les va? a ustedes
    """
    return re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

def esp_multiple(text):
    """
    Sustituye los espacios dobles entre palabras
    """
    return re.sub(' +', ' ',text)
    
def url(text):
    """
    Remove https
    """
    return re.sub(r'(https://www|https://)', '', text)

In [16]:
df_interin['clean_url'] = df_interin['url'].apply(url).apply(caracteres_no_alfanumericos).apply(esp_multiple)

In [17]:
df_interin.head()

Unnamed: 0,url,is_spam,clean_url
0,https://briefingday.us8.list-manage.com/unsubs...,True,briefingday us8 list manage com unsubscribe
1,https://www.hvper.com/,True,hvper com
2,https://briefingday.com/m/v4n3i4f3,True,briefingday com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,False,briefingday com n 20200618 m commentform
4,https://briefingday.com/fan,True,briefingday com fan


In [18]:
df_interin['is_spam'] = df_interin['is_spam'].astype(int)

In [19]:
df_interin

Unnamed: 0,url,is_spam,clean_url
0,https://briefingday.us8.list-manage.com/unsubs...,1,briefingday us8 list manage com unsubscribe
1,https://www.hvper.com/,1,hvper com
2,https://briefingday.com/m/v4n3i4f3,1,briefingday com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,0,briefingday com n 20200618 m commentform
4,https://briefingday.com/fan,1,briefingday com fan
...,...,...,...
2364,https://www.theverge.com/2020/6/29/21306889/di...,0,theverge com 2020 6 29 21306889 disney deepfa...
2365,https://www.smartcitiesworld.net/news/news/dee...,0,smartcitiesworld net news news deepfake techn...
2366,https://techcrunch.com/2019/07/04/an-optimisti...,0,techcrunch com 2019 07 04 an optimistic view o...
2367,https://www.technologyreview.com/2019/12/20/13...,0,technologyreview com 2019 12 20 131462 this s...


In [20]:
df = df_interin.copy()

In [22]:
X = df['clean_url']
y = df['is_spam']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [23]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

In [24]:
X_train.shape

(1776, 5644)

In [25]:
X_test.shape

(593, 5644)

In [26]:
classifier = SVC(C = 1.0, kernel = 'linear', gamma = 'auto')

classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       537
           1       0.81      0.62      0.71        56

    accuracy                           0.95       593
   macro avg       0.89      0.81      0.84       593
weighted avg       0.95      0.95      0.95       593



In [27]:
param_grid = {
    'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}
grid = GridSearchCV(SVC(random_state=42), param_grid,verbose=2)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   5.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   5.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   6.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   6.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   4.9s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   4.8s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   4.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   4.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   4.7s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   1.4s
[CV] END .....................C=0.1, gamma=1, k

In [28]:
grid.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [29]:
grid.best_estimator_

In [30]:
pred_grid = grid.best_estimator_.predict(X_test)
print(classification_report(y_test, pred_grid))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       537
           1       0.80      0.70      0.74        56

    accuracy                           0.95       593
   macro avg       0.88      0.84      0.86       593
weighted avg       0.95      0.95      0.95       593



In [31]:
best_model = classifier
pickle.dump(best_model, open('../models/best_model.pickle', 'wb')) 