In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load essential libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D

In [None]:
# Load the SMS spam collection CSV into a pandas DataFrame (a two-dimensional,
# labelled table) and preview the first rows.
# The file is decoded as latin1 -- presumably because it is not valid UTF-8; confirm.
data = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding = 'latin1')
data.head()

# Drop features having NaN

In [None]:
# Remove the three unnamed columns; only the label and message columns are kept.
# NOTE: this rebinds `data`, so re-running the cell on the already-trimmed frame
# raises a KeyError.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = data.drop(columns=unused_columns)



In [None]:
# Rename the raw columns (v1 -> target, v2 -> text) and display the first five rows.

data = data.rename(columns ={"v1":"target", "v2":"text"})
data.head()

In [None]:
# Number of messages per class (ham vs. spam).

data['target'].value_counts()

In [None]:
# Bar chart of the number of messages per class.
# The former mid-cell `value_counts()` call was removed: its result was discarded
# (only the last expression of a cell is displayed) and the counts are already
# shown by the previous cell.

sns.countplot(x="target", data=data)
plt.title('Distribution of Spam and Ham');  # trailing ';' hides the Text(...) repr

In [None]:
# Distribution of message length (in characters) for each class.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# replacement recommended by seaborn and draws the same histogram + KDE overlay.

ham = data.loc[data['target'] == 'ham', 'text'].str.len()
spam = data.loc[data['target'] == 'spam', 'text'].str.len()

# Explicit colours keep the two classes distinguishable regardless of the
# active colour cycle.
sns.histplot(ham, kde=True, stat='density', color='C0', label='Ham')
sns.histplot(spam, kde=True, stat='density', color='C1', label='Spam')
plt.title('Distribution by Length')
plt.legend();

In [None]:
# Split the raw messages and their labels into train and test sets (80-20).
# random_state pins the shuffle so the split is reproducible.
# `train_test_split` is already imported in the import cell at the top of the
# notebook, so the duplicate mid-notebook import was dropped.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['target'], test_size=0.2, random_state=37
)
print("X_train: ", len(X_train))
print("X_test: ", len(X_test))
print("y_train: ", len(y_train))
print("y_test: ", len(y_test))

In [None]:
# Count the most frequent whitespace-separated tokens in the ham messages.
# Tokens are taken as-is: case-sensitive, with punctuation and stop words kept.

ham_tokens = " ".join(data.loc[data['target'] == 'ham', 'text']).split()
count1 = Counter(ham_tokens).most_common(10)

# most_common() returns (word, count) tuples, so name the two columns directly.
data1 = pd.DataFrame(count1, columns=["words of ham", "count"])

# Bar chart of the top 10 ham words; passing x/y lets pandas label the ticks
# with the words instead of relabelling positional ticks by hand.
data1.plot.bar(x="words of ham", y="count", legend=False, color='purple', figsize=(20, 15))
plt.title('Top 10 words of ham')
plt.xlabel('words')
plt.ylabel('number')
plt.show()

# Datasets of natural language are referred to as corpora, and a single set of data annotated with the same specification is called an annotated corpus. Annotated corpora can be used to train ML algorithms.

In [None]:
# Build a cleaned, stemmed version of every message.
# Stemming strips common morphological/inflectional endings so that related
# forms share one root (playing, played, plays -> play); the stem itself need
# not be a valid English word.
# Stop words are frequent English words that add little meaning and are dropped.
#
# NOTE(review): `corpus` is not consumed by any later cell visible here -- the
# CountVectorizer below is fitted on the raw X_train text. Confirm whether the
# models were meant to use this corpus.
# Requires the NLTK 'stopwords' resource to be available in the environment.

# Loop-invariant objects are built once: previously the stop-word list was
# re-read and converted to a set for every single word, and a new stemmer was
# created for every message.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 5572) so the
# cell keeps working if the number of rows or the index changes.
for message in data['text']:
    # Keep letters only, lower-case, and split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stems))


In [None]:
# CountVectorizer tokenizes a collection of text documents and builds a
# vocabulary of known words. Usage:
#   1. create an instance of the CountVectorizer class;
#   2. call fit() to learn the vocabulary from the documents;
#   3. call transform() to encode documents as count vectors.
# max_features = 1500 caps the vocabulary at the 1500 most frequent terms.
# The vocabulary is learned from the raw training messages only (X_train), so
# the test set does not influence it.
cv = CountVectorizer(max_features = 1500)
cv.fit(X_train)

In [None]:
# Encode the training messages as a document-term count matrix using the
# vocabulary learned above; the bare name displays its summary.
X_train_cv = cv.transform(X_train)
X_train_cv

In [None]:
# Encode the test messages with the same fitted vocabulary (no re-fitting),
# so train and test share identical feature columns.
X_test_cv = cv.transform(X_test)
X_test_cv


# Creating the Naive Bayes classifier model


In [None]:
# Multinomial Naive Bayes on the word-count features (smoothing alpha = 0.5),
# then predict labels for the held-out test messages.
mnb = MultinomialNB(alpha=0.5).fit(X_train_cv, y_train)
y_mnb = mnb.predict(X_test_cv)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but the
# confusion matrix is not: with the arguments swapped it was transposed
# (rows must be the true labels, columns the predictions).
print('Naive Bayes Accuracy: ', accuracy_score(y_test, y_mnb))
print('Naive Bayes confusion_matrix: ', confusion_matrix(y_test, y_mnb))

# SVM Classification

In [None]:
# Support-vector classifier with a sigmoid kernel (gamma = 1.0) trained on the
# word-count features, then used to label the test messages.
svc = SVC(kernel='sigmoid', gamma=1.0).fit(X_train_cv, y_train)
y_svc = svc.predict(X_test_cv)

In [None]:
# scikit-learn metrics take (y_true, y_pred); with the arguments swapped the
# confusion matrix was transposed (rows must be true labels, columns predictions).
print('SVM Accuracy: ', accuracy_score(y_test, y_svc))
print('SVM confusion_matrix: ', confusion_matrix(y_test, y_svc))

# Random Forest Classification

In [None]:
# Random forest of 37 trees; random_state makes the fitted forest reproducible.
rfc = RandomForestClassifier(n_estimators=37, random_state=252).fit(X_train_cv, y_train)
y_rfc = rfc.predict(X_test_cv)

In [None]:
# scikit-learn metrics take (y_true, y_pred). The accuracy call already used
# that order; the confusion matrix call had it swapped and was therefore
# transposed (rows must be true labels, columns predictions).
print('Random Forest Accuracy_score: ', accuracy_score(y_test, y_rfc))
print('Random Forest confusion_matrix: ', confusion_matrix(y_test, y_rfc))

# Naive Bayes gives the best accuracy of the three classifiers above

# DEEP LEARNING

In [None]:
# Prepare labels and a word index for the neural network.
# Labels and messages are taken from the full data frame (not the train/test
# split used by the classical models above).
tags = data["target"]
texts = data["text"]

# Vocabulary cap passed to the Keras Tokenizer (only the most frequent words
# are kept when texts are converted to sequences).
num_max = 1000

# LabelEncoder maps each distinct label to an integer between 0 and
# n_classes-1. With the two labels used in this notebook that gives
# ham -> 0 and spam -> 1 (classes are encoded in sorted order).
# No scaling is involved: fit_transform() learns the label-to-integer mapping
# and applies it in one step. The same fitted encoder (le) must be reused to
# encode or decode labels for any new data.
le = LabelEncoder()
tags = le.fit_transform(tags)

# The Tokenizer splits each message into word tokens and assigns every word an
# integer index ranked by frequency.
# NOTE(review): it is fitted on all messages, including those that
# validation_split later holds out -- confirm this is acceptable.
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)

# To inspect per-word document counts: print(tok.word_docs)

In [None]:
# Convert every message to a sequence of word indices and report the messages
# that are longer than max_len words (these will be truncated when padded).
max_len = 100
cnn_texts_seq = tok.texts_to_sequences(texts)
for idx, seq in enumerate(cnn_texts_seq):
    # Compare against max_len rather than a second literal 100 so the report
    # stays correct if the limit is changed.
    if len(seq) > max_len:
        print('Word Counts:', len(seq), 'Indeks:', idx)

In [None]:
# Bring every sequence to exactly max_len (100) entries.
# pad_sequences turns a list of num_samples integer sequences into a 2D NumPy
# array of shape (num_samples, num_timesteps), where num_timesteps is the
# maxlen argument if provided, otherwise the length of the longest sequence.
# Shorter sequences are filled with 0; longer ones are cut down to maxlen
# (with the Keras defaults both padding and truncation happen at the start).

cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq,maxlen=max_len)

# Worked example: print one message next to its padded index vector.
# Every word has an index; only the most frequent words (num_words of the
# tokenizer) appear in the vector, less frequent words are dropped.
# NOTE(review): row 2156 and the words 'hey' / 'cutie' are hard-coded --
# presumably that message contains both words; confirm against the dataset.
print('***************************************************')
print(texts[2156])
print()
print(cnn_texts_mat[2156])
print('***************************************************')
print('hey index:',tok.word_index['hey'])
print('cutie index:',tok.word_index['cutie'])


In [None]:
# Print a message, its padded index vector, and the index of one of its words.
# NOTE(review): this repeats the inspection of row 2156 from the previous cell.
# The original comments spoke of a 101-word message reduced to 100 words, so a
# different (over-long) row may have been intended here -- confirm.
print('***************************************************')
print(texts[2156])
print('***************************************************')
print(cnn_texts_mat[2156])
print('***************************************************')

print('hey index:',tok.word_index['hey'])

In [None]:
# 1D convolutional text classifier.
# A Sequential model is a plain stack of layers where each layer has exactly
# one input tensor and one output tensor.
model = Sequential()

# Embedding: looks up a trainable dense 20-dimensional vector for each integer
# word id (it does not one-hot encode). The vocabulary size must match the
# Tokenizer's num_words, so num_max is reused instead of repeating the literal
# 1000; every input row holds max_len ids.
model.add(Embedding(num_max, 20, input_length=max_len))

# Dropout: randomly ignores 20% of the units during training only, which
# reduces over-fitting.
model.add(Dropout(0.2))

# Conv1D: 64 filters sliding over windows of 5 consecutive word vectors.
# A 1D CNN is effective at deriving features from fixed-length segments when
# the position of the feature within the segment matters little.
# ReLU (rectified linear unit) outputs the input if it is positive and zero
# otherwise; it mitigates the vanishing-gradient problem.
model.add(Conv1D(64, 5, padding='valid', activation='relu', strides=1))

# Global max pooling: keeps the maximum of each filter over the time dimension.
model.add(GlobalMaxPooling1D())

# Dense: fully connected layer -- every output of the previous layer feeds
# every one of its 128 neurons.
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))

# Single sigmoid unit: squashes the output to a value between 0.0 and 1.0,
# used as the probability of the positive class.
model.add(Dense(1, activation='sigmoid'))

model.summary()  # prints a text summary of the network

# Binary cross-entropy is the loss for two-class problems; accuracy is tracked
# under the key 'acc'.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [None]:
# Train for 10 epochs in mini-batches of 32, holding out a 0.2 fraction of the
# samples for validation; the returned History object keeps per-epoch metrics.
history = model.fit(
    cnn_texts_mat,
    tags,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)


In [None]:
# Plot training vs. validation accuracy per epoch.
# matplotlib is already imported in the import cell at the top of the notebook,
# so the duplicate import was dropped.
acc = history.history['acc']
val_acc = history.history['val_acc']

# Derive the x-axis from the recorded history instead of a hard-coded
# range(1, 11), so the plot still works when the number of epochs changes.
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'b+', label='Acc')
plt.plot(epochs, val_acc, 'bo', label='Val Acc')  # validation accuracy
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend();

# Epoch 10 - val_loss: 0.0623 - val_acc: 0.9857