In [1]:
!pip3 install pydataset

Collecting pydataset
Collecting pandas (from pydataset)
  Using cached https://files.pythonhosted.org/packages/a2/21/e10d65222d19a2537e3eb0df306686a9eabd08b3c98dd120e43720bf802d/pandas-1.1.3-cp36-cp36m-manylinux1_x86_64.whl
Collecting numpy>=1.15.4 (from pandas->pydataset)
  Using cached https://files.pythonhosted.org/packages/b8/e5/a64ef44a85397ba3c377f6be9c02f3cb3e18023f8c89850dd319e7945521/numpy-1.19.2-cp36-cp36m-manylinux1_x86_64.whl
Collecting python-dateutil>=2.7.3 (from pandas->pydataset)
  Using cached https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl
Collecting pytz>=2017.2 (from pandas->pydataset)
  Using cached https://files.pythonhosted.org/packages/4f/a4/879454d49688e2fad93e59d7d4efda580b783c745fd2ec2a3adf87b0808d/pytz-2020.1-py2.py3-none-any.whl
Collecting six>=1.5 (from python-dateutil>=2.7.3->pandas->pydataset)
  Using cached https://files.pythonhosted.org/packages/ee/ff/

In [2]:
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data
import re
import nltk

%matplotlib inline

In [None]:
# Fetch the NLTK 'stopwords' corpus; nltk.corpus.stopwords.words('english') is
# read from it further down in this notebook.
nltk.download('stopwords')

In [4]:
# Tests to see if we are running in the Google Colaboratory environment
# If so, use an https URL to access the data.  Otherwise, load via the file path
try:
  import google.colab
  data_file_prefix = "https://raw.githubusercontent.com/slankas/DataScienceNotebooks/master//FeatureCreation/"
  import plotly.io as pio
  pio.renderers.default = 'colab'
except ImportError:
  # Only a failed import means "not on Colab".  A bare `except:` would also
  # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
  data_file_prefix = ""

In [5]:
# Build the location of spam.csv (URL prefix on Colab, relative path otherwise)
# and load it into the working DataFrame.
spam_csv_location = data_file_prefix + "data/spam.csv"
df = pd.read_csv(spam_csv_location)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Ci..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 t...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [6]:
# Summary statistics for df (count / unique / top / freq per column)
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# Shared tokenizer instance used by normalize_document
wpt = nltk.WordPunctTokenizer()
# English stop-word list from the NLTK corpus (needs the 'stopwords' download)
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one text document for bag-of-words style feature extraction.

    Steps: drop every character that is not an ASCII letter, a digit or
    whitespace; lower-case; strip leading/trailing whitespace; tokenize with
    the module-level ``wpt`` tokenizer; discard tokens that appear in the
    module-level ``stop_words`` list; re-join the remaining tokens with
    single spaces.

    Args:
        doc: The raw document text (a ``str``).

    Returns:
        The normalized document as one space-separated string.

    Note:
        Punctuation is removed before the stop-word filter runs, so a
        contraction such as "don't" has already become "dont" by the time it
        is compared against the stop words.
    """
    # remove special characters (everything except letters, digits, whitespace)
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)
    # lower case and trim surrounding whitespace
    doc = doc.lower().strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # Hash-based lookup instead of scanning the stop-word list once per token.
    # Built per call so later changes to `stop_words` are still honoured.
    stop_word_lookup = set(stop_words)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_word_lookup]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

In [8]:
# Run every raw message through normalize_document and keep the result
# alongside the original text in a new 'cleanText' column.
clean_messages = df['Message'].apply(normalize_document)
df['cleanText'] = clean_messages
df.head()

Unnamed: 0,Category,Message,cleanText
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Ci...",go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 t...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goes usf lives around though


In [9]:
import collections

# Tally how often each token occurs across all cleaned messages
results = collections.Counter()
for message_tokens in df['cleanText'].str.split():
    results.update(message_tokens)

# Report the ten most frequent tokens
for word, count in results.most_common(10):
    print(word, ": ", count)

u :  1130
call :  575
2 :  482
im :  473
ur :  390
get :  386
dont :  298
4 :  293
go :  281
ok :  278


# Bag of Words Model


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a unigram count vectorizer on the cleaned messages and keep the
# document-term counts as a dense array.
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(df['cleanText']).toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on releases that predate it.  list() keeps `vocab` a plain list.
if hasattr(cv, 'get_feature_names_out'):
    vocab = list(cv.get_feature_names_out())
else:
    vocab = cv.get_feature_names()
# One row per document, one column per vocabulary term
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Bag of N-Grams Model

In [12]:
# Count vectorizer restricted to bigrams (ngram_range=(2,2))
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(df['cleanText'])
# NOTE(review): the dense copy of a documents x bigrams matrix can be large;
# kept dense here so `bv_matrix` stays a NumPy array as before.
bv_matrix = bv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method on older releases.
if hasattr(bv, 'get_feature_names_out'):
    vocab = list(bv.get_feature_names_out())
else:
    vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,008704050406 sp,0089my last,0121 2025050,01223585236 xx,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,...,zed pobox,zeros savings,zhong se,zindgi wo,zoe 18,zoe hit,zogtorius ive,zoom cine,zouk nichols,zyada kisi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# TF - IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted document-term matrix over the cleaned messages
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(df['cleanText'])
tv_matrix = tv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method on older releases.
if hasattr(tv, 'get_feature_names_out'):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()
# Weights rounded to two decimals for display only; tv_matrix itself is unchanged
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# document similarity via cosine

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the TF-IDF matrix,
# wrapped in a DataFrame for display and later use.
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(data=similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5562,5563,5564,5565,5566,5567,5568,5569,5570,5571
0,1.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.036959,0.0,0.0,0.0,0.000000,0.0
1,0.0,1.0,0.000000,0.0,0.0,0.040733,0.000000,0.000000,0.000000,0.000000,...,0.047576,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.034093,...,0.000000,0.000000,0.000000,0.0,0.039992,0.0,0.0,0.0,0.020001,0.0
3,0.0,0.0,0.000000,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.137548,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.000000,0.0,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.060819,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.035963,0.117038,0.015250,...,0.000000,0.000000,0.000000,0.0,0.016760,1.0,0.0,0.0,0.000000,0.0
5568,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,1.0,0.0,0.000000,0.0
5569,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,1.0,0.000000,0.0
5570,0.0,0.0,0.020001,0.0,0.0,0.083140,0.073823,0.000000,0.000000,0.051458,...,0.000000,0.066952,0.000000,0.0,0.028277,0.0,0.0,0.0,1.000000,0.0


In [17]:
# cluster via document similarity

In [18]:
from sklearn.cluster import KMeans

# random_state pins the centroid initialisation so cluster labels are
# reproducible across runs; n_init is set explicitly so the number of restarts
# does not depend on the installed scikit-learn version's default.
km = KMeans(n_clusters=25, n_init=10, random_state=42)
# fit() is enough here: only the labels are used, so the distance matrix that
# fit_transform() would return is not needed.
km.fit(similarity_df)
# Reuse df's index so the concat below lines rows up even if df is not
# indexed 0..n-1.
cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'], index=df.index)
pd.concat([df, cluster_labels], axis=1)

Unnamed: 0,Category,Message,cleanText,ClusterLabel
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Ci...",go jurong point crazy available bugis n great world la e buffet cine got amore wat,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,8
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 t...,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry...,1
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,1
4,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goes usf lives around though,20
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 cla...,2nd time tried 2 contact u u 750 pound prize 2 claim easy call 087187272008 now1 10p p...,6
5568,ham,Will ü b going to esplanade fr home?,b going esplanade fr home,15
5569,ham,"Pity, * was in mood for that. So...any other suggestions?",pity mood soany suggestions,1
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else ...,guy bitching acted like id interested buying something else next week gave us free,1


In [19]:
# Bi-grams

In [20]:
# Bigram count vectorizer whose token pattern accepts any run of one or more
# word characters; the fitted counts are kept in sparse form as x2.
bigram_converter = CountVectorizer(
    ngram_range=(2,2),
    token_pattern='(?u)\\b\\w+\\b',
)
x2 = bigram_converter.fit_transform(df['cleanText'])

In [21]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method on older releases.
# list() keeps `bigrams` a plain list so the slice below displays as a list.
if hasattr(bigram_converter, 'get_feature_names_out'):
    bigrams = list(bigram_converter.get_feature_names_out())
else:
    bigrams = bigram_converter.get_feature_names()
# Number of distinct bigrams, then the last ten in vocabulary order
print(len(bigrams))
bigrams[-10:]

33372


['zed pobox',
 'zeros savings',
 'zhong se',
 'zindgi wo',
 'zoe 18',
 'zoe hit',
 'zogtorius ive',
 'zoom cine',
 'zouk nichols',
 'zyada kisi']