In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading and Understanding the Data 

In [None]:
# ISO-8859-1 (Latin-1) is a single-byte encoding; it is passed explicitly here,
# presumably because the CSV is not valid UTF-8 -- TODO confirm.
SPAM_CSV_PATH = '../input/sms-spam-collection-dataset/spam.csv'
df = pd.read_csv(SPAM_CSV_PATH, encoding='ISO-8859-1')
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Size of the loaded frame as a (rows, columns) tuple
df.shape

In [None]:
# Summarise the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Count the missing values in each column (isna is the same check as isnull)
df.isna().sum()

We can see here that several features contain many null values:
>**Unnamed: 2**, **Unnamed: 3** and **Unnamed: 4**

In [None]:
# Drop features with a large amount of missing values.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore' is
# passed, so the cell can be re-run safely: a second run no longer raises
# KeyError because the columns have already been removed.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Preview the first rows of the frame at this point in the notebook
df.head()

In [None]:
# Frequency of each value in the label column v1; dropna=False also counts missing labels
df.v1.value_counts(dropna=False)

In [None]:
# Encode the target variable: with drop_first=True, get_dummies drops the first
# category of v1 and keeps indicator column(s) for the remaining one(s).
y = pd.get_dummies(df['v1'], drop_first=True)

# Append the indicator column(s) and remove the original label column in one chain
df = pd.concat([df, y], axis=1).drop(columns='v1')

In [None]:
# Preview the frame now that v1 has been replaced by the dummy-encoded column(s)
df.head()

## Pre-Processing

We will use regular expressions to remove or replace some of the text data in `v2`,
**for example:** 
> Email addresses, web addresses, numbers, money symbols, punctuation, etc.

In [None]:
# Normalise the raw SMS text in `v2` with regular expressions.
#
# Every call passes regex=True explicitly: since pandas 2.0 the default of
# Series.str.replace is regex=False, which would treat the patterns below as
# literal text and silently leave the messages unchanged.
#
# The email / URL / phone patterns are intentionally NOT anchored with ^...$,
# so that addresses and numbers embedded inside a longer message are replaced
# too (an anchored pattern only matches when the whole message is the address).

# Replace email addresses with 'email'
processed = df.v2.str.replace(
    r'[\w.+-]+@[\w-]+(?:\.[\w-]+)*\.[a-zA-Z]{2,}', 'email', regex=True)

# Replace URLs (http://, https:// or a bare www. prefix) with 'webaddress'
processed = processed.str.replace(
    r'(?:https?://|www\.)[a-zA-Z0-9\-.]+\.[a-zA-Z]{2,}(?:/\S*)?', 'webaddress', regex=True)

# Replace money symbols with 'moneysymb' (£ can be typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb', regex=True)

# Replace 10 digit phone numbers (formats include parentheses, spaces, no spaces,
# dashes) with 'phonenumbr'. The lookarounds stop the pattern from matching a
# 10-digit slice of a longer run of digits.
processed = processed.str.replace(
    r'(?<!\d)\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}(?!\d)', 'phonenumbr', regex=True)

# Replace any remaining numbers (integers or decimals) with 'numbr'
processed = processed.str.replace(r'\d+(?:\.\d+)?', 'numbr', regex=True)

# Replace punctuation (anything that is not a word character or whitespace) with a space
processed = processed.str.replace(r'[^\w\d\s]', ' ', regex=True)

# Collapse every run of whitespace into a single space
processed = processed.str.replace(r'\s+', ' ', regex=True)

# Remove leading and trailing whitespace
processed = processed.str.strip()

# Change words to lower case - Free, FREE, free are all the same word
processed = processed.str.lower()

# Now let's see our data
print(processed)

Much better. Now let's remove other uninformative data such as stop words.
> Let's try removing stop words with **NLTK**

In [None]:
from nltk.corpus import stopwords

# Build the English stop-word lookup once; a set gives constant-time membership tests
stop_words = set(stopwords.words('english'))


def _without_stop_words(message):
    """Return `message` with every whitespace-separated stop word removed."""
    kept_terms = [term for term in message.split() if term not in stop_words]
    return ' '.join(kept_terms)


# Filter the stop words out of every text message
processed = processed.apply(_without_stop_words)

Now let's use stemming to reduce the words to their word stem. 
> Let's try stemming using the PorterStemmer from **NLTK**

In [None]:
from nltk.stem import PorterStemmer

# Reduce every word to its stem with a Porter stemmer
ps = PorterStemmer()


def _stem_message(message):
    """Return `message` with each whitespace-separated term replaced by its Porter stem."""
    stemmed_terms = [ps.stem(term) for term in message.split()]
    return ' '.join(stemmed_terms)


processed = processed.apply(_stem_message)

## Generating Features

Because our data is large and consists of text rather than numbers (most ML models only take numerical features), we can use the **bag of words** approach, which will help us extract features from text data for machine learning algorithms.
> We will try creating a bag of words using Scikit learn

In [None]:
# Creating a bag of words using Scikit learn
# We will extract top 1500 common words as features
from sklearn.feature_extraction.text import CountVectorizer

# Limit the vocabulary to 1500 terms (max_features) and count them per message
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(processed)

# Convert the vectorizer output into a regular array for the models below
X = term_counts.toarray()

As we can see, `X` now holds our features: **5572** rows and **1500** columns, one column for each of the 1500 most common words.

In [None]:
# Shape of the bag-of-words feature matrix produced by the vectorizer
X.shape

Now let's divide our data for our model 
> We will use **Scikit-learn**  

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state pins the shuffle so the
# same split is produced on every run
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Building Model

Let's try several classification algorithms. I will try:
* Logistic Regression 
* K Nearest Neighbor Classifier 
* Decision Trees and Random Forest 
* Multinomial Naive Bayes Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline. The target frame is flattened with
# .values.ravel() before being handed to fit().
train_labels = y_train.values.ravel()
logreg = LogisticRegression()
logreg.fit(X_train, train_labels)

# Predicted labels for the held-out test messages
predictions = logreg.predict(X_test)

In [None]:
# to show the precision of the model
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Accuracy as a percentage, then a separator line, then the confusion matrix and
# the classification report for the logistic regression predictions
logreg_accuracy = accuracy_score(y_test, predictions) * 100
print('Accuracy: {}% \n'.format(logreg_accuracy))
print('*'*100)
logreg_confusion = confusion_matrix(y_test, predictions)
print(logreg_confusion)
logreg_report = classification_report(y_test, predictions)
print(logreg_report)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier configured with a single neighbour
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train.values.ravel())

# Predict the held-out messages and report the same metrics as for the other models
pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, pred) * 100

print('Accuracy: {}% \n'.format(knn_accuracy))
print('*'*100)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline. random_state is fixed (matching the seed used for the
# train/test split) so that re-running the notebook reproduces the same tree
# and therefore the same scores.
dtree = DecisionTreeClassifier(random_state=42)

# The target frame is flattened with .values.ravel() before fitting
dtree.fit(X_train, y_train.values.ravel())

# Predicted labels for the held-out test messages
pred = dtree.predict(X_test)


# Accuracy as a percentage, then the confusion matrix and classification report
print('Accuracy: {}% \n'.format(accuracy_score(y_test, pred) * 100))
print('*'*100)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline. random_state is fixed (matching the seed used for the
# train/test split) so that re-running the notebook reproduces the same forest.
rfc = RandomForestClassifier(random_state=42)

# The target frame is flattened with .values.ravel() before fitting
rfc.fit(X_train, y_train.values.ravel())
rfc_pred = rfc.predict(X_test)

# Score the forest's own predictions. The accuracy line previously used `pred`,
# which still held the predictions of the model fitted in the preceding cell,
# so the printed accuracy did not belong to the random forest.
print('Accuracy: {}% \n'.format(accuracy_score(y_test, rfc_pred) * 100))
print('*'*100)
print(confusion_matrix(y_test, rfc_pred))
print(classification_report(y_test, rfc_pred))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes fitted on the bag-of-words counts
mnb = MultinomialNB()
mnb.fit(X_train, y_train.values.ravel())

# Predict the held-out messages and report the same metrics as for the other models
pred = mnb.predict(X_test)
mnb_accuracy = accuracy_score(y_test, pred) * 100

print('Accuracy: {}% \n'.format(mnb_accuracy))
print('*'*100)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))