<a href="https://colab.research.google.com/github/sgajendra/NLP-Repository/blob/main/SPAM_HAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import os
import io
import pandas as pd
tf.__version__

import re
import keras

In [None]:
# Download the zip file
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",
origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
                  extract=True)
# Unzip the file into a folder
!unzip $path_to_zip -d data

Archive:  /root/.keras/datasets/smsspamcollection.zip
replace data/SMSSpamCollection? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace data/readme? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
# Read the raw corpus into a list of records, one per line. Each record is
# "<label>\t<message>". The context manager closes the file handle promptly,
# and the explicit encoding keeps the read independent of the platform default.
with io.open('data/SMSSpamCollection', encoding='utf-8') as corpus_file:
  lines = corpus_file.read().strip().split('\n')

In [None]:
# Peek at the first raw record to confirm the "<label>\t<message>" layout.
lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
# Split every raw record into a binary label and its message text.
# Labels are kept as strings here: '1' for spam, '0' for everything else.
label_data = []
sentence_data = []
for line in lines:
  # maxsplit=1: only the first tab separates label from message, so a tab
  # inside the message body cannot break the two-value unpacking.
  label, sent = line.split('\t', 1)
  label_data.append('1' if label.strip() == 'spam' else '0')
  sentence_data.append(sent.strip())

In [None]:
# Assemble the parallel label / message lists into a two-column frame
# (column order: label, text).
data = pd.DataFrame(dict(label=label_data, text=sentence_data))

In [None]:
# Preview the first rows of the label/text frame.
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Show the full text of the first message (head() truncates long strings).
data.text[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
def len_text(text):
  """Return the number of characters in `text`."""
  n_chars = len(text)
  return n_chars

def num_capital(text):
  """Count the ASCII uppercase letters (A-Z) that occur in `text`."""
  return len(re.findall('[A-Z]', text))

def num_pantuation(text):
  """Count the non-word characters in `text`.

  A character is counted when it matches the regex class `\\W`, i.e. it is
  not a letter, digit or underscore. Note that this includes whitespace as
  well as punctuation, so the value is "spaces + punctuation", not
  punctuation alone.

  Args:
    text: the message string to inspect.

  Returns:
    The number of `\\W` matches as an int.
  """
  # Raw string: '\W' in a normal string literal is an invalid escape sequence
  # and triggers a DeprecationWarning / SyntaxWarning on recent Pythons.
  _, count = re.subn(r'\W', '', text)
  return count

In [None]:
# Derive the three surface features from the message text, then turn the
# string labels ('0' / '1') into integers.
data['length'] = data['text'].map(len_text)
data['capitals'] = data['text'].map(num_capital)
data['pantuation'] = data['text'].map(num_pantuation)
data['label'] = data['label'].map(int)

In [None]:
# Preview the frame again, now with the length / capitals / pantuation columns.
data.head()

Unnamed: 0,label,text,length,capitals,pantuation
0,0,"Go until jurong point, crazy.. Available only ...",111,3,28
1,0,Ok lar... Joking wif u oni...,29,2,11
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,10,33
3,0,U dun say so early hor... U c already then say...,49,2,16
4,0,"Nah I don't think he goes to usf, he lives aro...",61,2,14


In [None]:
# Confirm the column types: label and the three features should be integers.
data.dtypes

label          int64
text          object
length         int64
capitals       int64
pantuation     int64
dtype: object

In [None]:
# Reproducible 80/20 split: sample 80% of the rows for training (fixed seed)
# and keep every row that was not sampled as the test set.
train = data.sample(frac=0.8, random_state=42)
test = data.drop(index=train.index)

# Inputs are the three numeric features; targets stay a one-column frame.
feature_cols = ['length', 'capitals', 'pantuation']
x_train, y_train = train[feature_cols], train[['label']]
x_test, y_test = test[feature_cols], test[['label']]

In [None]:
# Small feed-forward baseline: one hidden layer plus a sigmoid output unit.
def make_model(input_dims=3, num_units=12):
  """Build and compile a binary classifier over numeric features.

  Args:
    input_dims: number of input features per example.
    num_units: width of the hidden ReLU layer.

  Returns:
    A compiled `tf.keras.Sequential` model: Dense(num_units, relu) followed
    by Dense(1, sigmoid), using binary cross-entropy loss, the Adam
    optimizer, and accuracy as the reported metric.
  """
  hidden = tf.keras.layers.Dense(num_units,
                                 input_dim=input_dims,
                                 activation='relu')
  output = tf.keras.layers.Dense(1, activation='sigmoid')
  model = tf.keras.Sequential([hidden, output])
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model

In [None]:
# Seed TensorFlow's global random generator before building the model so that
# weight initialisation is repeatable across Restart & Run All.
# NOTE(review): bit-exact repeatability can still depend on hardware / TF build.
tf.random.set_seed(42)
model = make_model()
# Keep the History object in a variable (per-epoch loss/accuracy live in
# history.history) instead of echoing its memory-address repr as cell output.
history = model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f17f1e109d0>

In [None]:
# Score the held-out rows; the result is [loss, accuracy], matching the loss
# and metric the model was compiled with.
model.evaluate(x_test, y_test)



[0.2085690200328827, 0.9228699803352356]

In [None]:
import spacy
import en_core_web_sm

# Load spaCy's small English pipeline.
nlp = en_core_web_sm.load()

# Parse the whole corpus as one document. The messages are joined with spaces;
# the previous `''.join(str(list))` fed spaCy the Python repr of the list
# (brackets, quotes, escape sequences) because joining the characters of a
# string with '' just returns the same string.
doc = nlp(' '.join(data.text.to_list()))

In [None]:
#pip install stanfordnlp

In [None]:
# List the processing components of the loaded spaCy pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
# NOTE(review): the saved output of this cell already shows a `preprocess`
# column, but that column is only assigned in a later cell. The notebook
# appears to have been executed out of order; re-run it top to bottom before
# trusting the displayed outputs.
data.head()

Unnamed: 0,label,text,length,capitals,pantuation,preprocess
0,0,"Go until jurong point, crazy.. Available only ...",111,3,28,jurong point crazi avail bugi great world buff...
1,0,Ok lar... Joking wif u oni...,29,2,11,joke
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,10,33,free entri wkli comp final text receiv entri q...
3,0,U dun say so early hor... U c already then say...,49,2,16,earli alreadi
4,0,"Nah I don't think he goes to usf, he lives aro...",61,2,14,think live around though


In [None]:
import nltk

# Fetch the NLTK resources used by the preprocessing cells: the English
# stop-word list and the WordNet corpus needed by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Shared text-normalisation tools used by the preprocessing cell.
stop = stopwords.words('english')  # English stop-word list
st = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Seed the `preprocess` column with the raw message text; the cleaning steps
# are applied to this column, leaving `text` untouched.
data['preprocess'] = data['text']

In [None]:
# Build the cleaned-text column in a single chain that always starts from the
# raw `text` column, so re-running this cell gives the same result instead of
# re-processing already processed text.
stop_words = set(stop)  # set membership is O(1); `stop` itself is a list

data['preprocess'] = (
    data['text']
    # 1. lower-case every token and collapse runs of whitespace
    .apply(lambda x: ' '.join(i.lower() for i in x.split()))
    # 2. replace punctuation with a space. regex=True is required: pandas >= 2.0
    #    treats the pattern as a literal string by default, which would silently
    #    skip this step. Raw strings avoid invalid-escape warnings.
    .str.replace(r'[^\w\s]', ' ', regex=True)
    # 3. drop English stop words
    .apply(lambda x: ' '.join(i for i in x.split() if i not in stop_words))
    # 4. Porter-stem, then 5. WordNet-lemmatize each remaining token
    .apply(lambda x: ' '.join(st.stem(word) for word in x.split()))
    .apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))
    # 6. replace digits with a space
    .str.replace(r'\d', ' ', regex=True)
    # 7. keep only tokens longer than three characters
    .apply(lambda x: ' '.join(word for word in x.split() if len(word) > 3))
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Inspect the cleaned text of the first message.
# NOTE(review): the saved output of this cell is the raw, unprocessed message,
# so it appears to predate the preprocessing cell; re-run the notebook top to
# bottom to refresh it.
data['preprocess'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'