In [1]:
#importing useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score ,precision_score, recall_score ,f1_score , confusion_matrix
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import ExtraTreesClassifier
import pickle
import re
import nltk
nltk.download('punkt')
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
from scipy.sparse import hstack
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Defining Useful Functions
def data_preprocessing(text):
    '''
    Clean a raw message and return it as a space-separated string of stemmed words.

    Steps, in order:
      1. lowercase
      2. expand English contractions ("won't" -> "will not", "'re" -> " are", ...)
      3. remove text enclosed in square brackets
      4. remove http(s):// and www. links
      5. tokenize with nltk.word_tokenize and drop punctuation tokens
      6. drop English stopwords
      7. drop digits, then every remaining character that is not a-z/A-Z or a space
      8. collapse runs of spaces into a single space
      9. Porter-stem every word

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are deliberately NOT stripped:
        when characters are deleted at either end of the text (for example a
        trailing "...") a single space is left there. Any length feature derived
        from this output depends on that, so do not add a .strip() here.

    Requires the NLTK 'punkt' and 'stopwords' resources to be downloaded.
    '''
    # lowercasing the text
    text = text.lower()

    # Expanding contractions. Order matters: the two specific forms must be
    # rewritten before the general "n't" rule sees them.
    contraction_rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # remove text in square brackets
    # (raw strings: "\[" and "\S" are invalid escapes in a normal string literal)
    text = re.sub(r'\[.*?\]', '', text)

    # remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # remove punctuation tokens.
    # `tok not in string.punctuation` is a substring test against the punctuation
    # string, so multi-character tokens such as "..." survive this step; they are
    # removed later by the letters-only filter.
    tokens = [tok for tok in nltk.word_tokenize(text) if tok not in string.punctuation]
    text = ' '.join(tokens)

    # remove stopwords: build the stopword set once per call instead of
    # re-evaluating stopwords.words('english') for every single token
    english_stopwords = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in english_stopwords)

    # remove digits
    text = ''.join(ch for ch in text if not ch.isdigit())

    # remove everything except alphabet
    text = re.sub(r'[^a-zA-Z ]+', '', text)

    # remove extra spaces from the text
    text = re.sub(' +', ' ', text)

    # Stemming. split(' ') (not split()) keeps the empty strings produced by a
    # leading/trailing space, so the join below restores that space.
    ps = PorterStemmer()
    text = ' '.join(ps.stem(word) for word in text.split(' '))

    return text


# Function to compute Digits Count Feature
def digits_count(string):
    '''
    Count how many characters of the given text are digits.

    Every character is tested with str.isdigit(); letters, spaces and all other
    symbols contribute nothing to the total.

    Parameters
    ----------
    string : str
        Text to scan.

    Returns
    -------
    int
        Number of digit characters (0 for an empty text).
    '''
    return sum(1 for character in string if character.isdigit())

In [3]:
def final_fun_1(input_message):
    '''
    Entire prediction pipeline for a single raw message: feature computation,
    text preprocessing, vectorization and classification.

    Parameters
    ----------
    input_message : str
        Raw message text.

    Returns
    -------
    str
        "This is SPAM message" when the model predicts class 1,
        otherwise "This is NOT a spam message".

    Notes
    -----
    Reads 'vectorizer.pkl' and 'model.pkl' from the current working directory
    on every call. pickle.load executes arbitrary code contained in the file,
    so only load artifacts you produced yourself.
    '''
    # Digit-count feature comes from the RAW message: preprocessing removes digits.
    digits = digits_count(input_message)

    # Text preprocessing
    transformed_sms = data_preprocessing(input_message)

    # Length feature: length of the PREPROCESSED text. final_fun_2 feeds the
    # 'preprocessed_length' column to the same model.pkl, so the raw message
    # length would not match that feature.
    # NOTE(review): confirm against the training notebook that the model was
    # fitted on the preprocessed length.
    length = len(transformed_sms)

    # Vectorizing the text data (context manager so the file handle is closed)
    with open('vectorizer.pkl', 'rb') as vectorizer_file:
        tfidf = pickle.load(vectorizer_file)
    text_features = tfidf.transform([transformed_sms])

    # Stacking features as [tf-idf | length | digits]; the scalar features are
    # passed as explicit 1x1 arrays, the same block layout final_fun_2 uses.
    features = hstack(
        [text_features, np.array([[length]]), np.array([[digits]])]
    ).toarray()

    # Prediction
    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    result = model.predict(features)[0]

    if result == 1:
        return "This is SPAM message"
    return "This is NOT a spam message"


In [4]:
# Example 1: prize/jackpot promotion with a short code and a URL (recorded result: spam)
final_fun_1("URGENT! You have won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18")

'This is SPAM message'

In [5]:
# Example 2: ordinary personal message (recorded result: not spam)
final_fun_1("Pls go ahead with watts. I just wanted to be sure. Do have a great weekend. Abiola")

'This is NOT a spam message'

In [6]:
# Example 3: paid ringtone subscription notice (recorded result: spam)
final_fun_1("Thanks for your subscription to Ringtone UK your mobile will be charged $5/month Please confirm by replying YES or NO. If you reply NO you will not be charged")

'This is SPAM message'

In [7]:
# Example 4: conversational message containing several contractions (recorded result: not spam)
final_fun_1("U don't know how stubborn I am. I didn't even want to go to the hospital. I kept telling Mark I'm not a weak sucker. Hospitals are for weak suckers.")

'This is NOT a spam message'

In [8]:
# Reading the main table from CSV: raw text, target label, engineered
# count features and the preprocessed text; then preview the first rows
data = pd.read_csv("final_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,target,text,length,word_count,sentence_count,digits_count,uppercase_words_count,unique_words_count,average_word_length,preprocessed_text
0,0,0,"Go until jurong point, crazy.. Available only ...",111,24,2,0,0,20,4.625,go jurong point crazi avail bugi n great world...
1,1,0,Ok lar... Joking wif u oni...,29,8,2,0,0,6,3.625,ok lar joke wif u oni
2,2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,25,2,24,4.189189,free entri wkli comp win fa cup final tkt st m...
3,3,0,U dun say so early hor... U c already then say...,49,13,1,0,2,10,3.769231,u dun say earli hor u c alreadi say
4,4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1,0,1,12,4.066667,nah think goe usf live around though


In [9]:
# Loading the companion table of features computed on the preprocessed text.
# Its `length` column appears to be the character length of `preprocessed_text`
# (including a trailing space where one exists) — TODO confirm.
data2 = pd.read_csv("preprocessed_data.csv")
data2.head()

Unnamed: 0,preprocessed_text,length,word_count,sentence_count,unique_words_count,average_word_length,target
0,go jurong point crazi avail bugi n great world...,77,16,1,16,4.8125,0
1,ok lar joke wif u oni,22,6,1,6,3.666667,0
2,free entri wkli comp win fa cup final tkt st m...,104,22,1,20,4.727273,1
3,u dun say earli hor u c alreadi say,36,9,1,7,4.0,0
4,nah think goe usf live around though,36,7,1,7,5.142857,0


In [10]:
# Adding the preprocessed-text length column to the main table.
# Assigning a Series aligns on the row index, so this presumably relies on
# both CSVs having the same rows in the same order — TODO confirm.
data['preprocessed_length'] = data2['length']

In [13]:
# Show every row that has at least one missing value
has_missing = data.isna().any(axis="columns")
null_data = data.loc[has_missing]
null_data

Unnamed: 0.1,Unnamed: 0,target,text,length,word_count,sentence_count,digits_count,uppercase_words_count,unique_words_count,average_word_length,preprocessed_text,preprocessed_length
248,252,0,What you doing?how are you?,27,8,1,0,0,5,3.375,,0
939,959,0,Where @,7,2,1,0,0,2,3.5,,0
1560,1611,0,645,3,1,1,3,0,1,3.0,,0
2675,2805,0,Can a not?,10,4,1,0,0,3,2.5,,0
3191,3374,0,:),3,2,1,0,0,1,1.5,,0
4276,4573,0,:( but your not here....,24,7,1,0,0,5,3.428571,,0
4500,4822,0,:-) :-),7,6,1,0,0,1,1.166667,,0


In [14]:
# Discard every row that contains a missing value in any column
data = data.dropna(axis=0, how="any")

In [15]:
# Re-check for rows with missing values (expected to be empty after the drop)
has_missing = data.isna().any(axis="columns")
null_data = data.loc[has_missing]
null_data

Unnamed: 0.1,Unnamed: 0,target,text,length,word_count,sentence_count,digits_count,uppercase_words_count,unique_words_count,average_word_length,preprocessed_text,preprocessed_length


In [17]:
# Separating the feature columns (X) from the class label.
# This is a positional slice: it keeps every column from position 2 onward,
# so it depends on the column order of the CSV.
X = data.iloc[:,2:]
X

Unnamed: 0,text,length,word_count,sentence_count,digits_count,uppercase_words_count,unique_words_count,average_word_length,preprocessed_text,preprocessed_length
0,"Go until jurong point, crazy.. Available only ...",111,24,2,0,0,20,4.625000,go jurong point crazi avail bugi n great world...,77
1,Ok lar... Joking wif u oni...,29,8,2,0,0,6,3.625000,ok lar joke wif u oni,22
2,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,25,2,24,4.189189,free entri wkli comp win fa cup final tkt st m...,104
3,U dun say so early hor... U c already then say...,49,13,1,0,2,10,3.769231,u dun say earli hor u c alreadi say,36
4,"Nah I don't think he goes to usf, he lives aro...",61,15,1,0,1,12,4.066667,nah think goe usf live around though,36
...,...,...,...,...,...,...,...,...,...,...
5164,This is the 2nd time we have tried 2 contact u...,161,35,4,21,2,26,4.600000,nd time tri contact u u pound prize claim easi...,79
5165,Will Ì_ b going to esplanade fr home?,37,9,1,0,1,8,4.111111,b go esplanad fr home,22
5166,"Pity, * was in mood for that. So...any other s...",57,15,2,0,0,10,3.800000,piti mood suggest,17
5167,The guy did some bitching but I acted like i'd...,125,27,1,0,1,26,4.629630,guy bitch act like would interest buy someth e...,71


In [18]:
# Class label as a NumPy array
y = data["target"].to_numpy()

In [19]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

In [57]:
# Shapes of the held-out features and labels, one per line
print(X_test.shape, y_test.shape, sep="\n")

(1033, 10)
(1033,)


In [91]:
def final_fun_2(x , y):
    '''
    Evaluation pipeline: vectorize already-preprocessed messages, stack the
    numeric features, predict with the saved model and score the predictions.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns 'preprocessed_text', 'preprocessed_length'
        and 'digits_count'. The text is NOT preprocessed here; it is expected
        to be cleaned already.
    y : array-like
        True class labels, one per row of `x`.

    Returns
    -------
    tuple
        (accuracy, precision, f1) as returned by sklearn's accuracy_score,
        precision_score and f1_score.

    Notes
    -----
    Reads 'vectorizer.pkl' and 'model.pkl' from the current working directory.
    pickle.load executes arbitrary code contained in the file, so only load
    artifacts you produced yourself.
    '''

    # Vectorizing the text data (context manager so the file handle is closed)
    with open('vectorizer.pkl', 'rb') as vectorizer_file:
        tfidf = pickle.load(vectorizer_file)
    text_features = tfidf.transform(x['preprocessed_text'])

    # Stacking features as [tf-idf | preprocessed_length | digits_count];
    # each numeric column is reshaped to a single-column 2-D block
    length_column = x['preprocessed_length'].values.reshape(-1, 1)
    digits_column = x['digits_count'].values.reshape(-1, 1)
    features = hstack([text_features, length_column, digits_column]).toarray()

    # Loading the saved model
    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    y_pred = model.predict(features)

    return accuracy_score(y,y_pred) ,precision_score(y,y_pred) , f1_score(y,y_pred)

In [92]:
# Score the saved model on the held-out test features and their true labels,
# then print one metric per line
accuracy, precision, F1_score = final_fun_2(X_test, y_test)
for metric_label, metric_value in zip(
    ("Accuracy : ", "Precision : ", "F1_score : "),
    (accuracy, precision, F1_score),
):
    print(metric_label, metric_value)

Accuracy :  0.9932236205227493
Precision :  1.0
F1_score :  0.9725490196078431
