In [1]:
# Necessary Libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
import nltk
import os
from tqdm import tqdm

In [2]:
try:
    tokenizer_path = nltk.data.find('tokenizers/punkt')
    print("Found punkt tokenizer at {}".format(tokenizer_path))
except LookupError:
    print("Downloading tokenizer")
    nltk.download('punkt')

Found punkt tokenizer at C:\Users\User1\AppData\Roaming\nltk_data\tokenizers\punkt\PY3


In [3]:
#Download Train data and test data
def downloadCSV(url,filename):
    if checkExists(filename=filename):
        exit()
    else:
        # Streaming, so we can iterate over the response.
        response = requests.get(url, stream=True)
        total_size_in_bytes= int(response.headers.get('content-length', 0))
        block_size = 1024 #1 Kibibyte
        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
        with open(filename, 'wb') as file:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)
        progress_bar.close()

def checkExists(filename):
    if filename in os.listdir("./"):
        print("{} already exists in path {}".format(filename,os.getcwd()))
        return True
    else:
        print("{} not found in current directory {}".format(filename,os.getcwd()))
        print("Downloading {}".format(filename))
        return False

In [5]:
train_csv_url = "https://raw.githubusercontent.com/deepraj1729/Spam-classification-Text-dataset/newbranch/data/train.csv"
test_csv_url = "https://raw.githubusercontent.com/deepraj1729/Spam-classification-Text-dataset/newbranch/data/test.csv"

downloadCSV(train_csv_url,'train.csv')
downloadCSV(test_csv_url,'test.csv')

train.csv not found in current directory g:\NEWS-CLASSIFIER\training
Downloading train.csv
98.6MiB [06:15, 263kiB/s]
test.csv not found in current directory g:\NEWS-CLASSIFIER\training
Downloading test.csv
25.1MiB [01:09, 362kiB/s]


In [6]:
# Importing the dataset
RAW_PATH = r"train.csv"
DATA_PATH = os.path.join("./",RAW_PATH)

#Read from CSV
df = pd.read_csv(DATA_PATH)

In [7]:
df.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [8]:
# Let's see the shape of our dataset
df.shape

(20800, 5)

In [9]:
# Info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [10]:
# checking for NULL values for each column
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
# Input column - "text"
# Output Column - "label"
# "text" column has 39 missing data rows
# but you see total 20800 text rows, so, dropping these 39 rows will do no effect

df.dropna(subset = ["text"], inplace=True)

In [12]:
df.shape

(20761, 5)

In [13]:
#checking Class distribution
from collections import Counter

print(Counter(df['label']))

Counter({0: 10387, 1: 10374})


In [14]:
#Fairly Balanced ;)

In [15]:
# Input- X Output- Y

X = df['text']
y = df['label']

In [13]:
def preprocess(data_X):
    # Replace email addresses with 'emailaddr'
    X = data_X.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

    # Replace URLs with 'webaddr'
    X = X.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')

    # Replace Currency symbols with 'currsymb' 
    X = X.str.replace(r'£|\$|₹', 'currsymb')

    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenbr'
    X = X.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenbr')

    # Replace numbers with 'numbr'
    X = X.str.replace(r'\d+(\.\d+)?', 'numbr')

    # Remove punctuation
    X = X.str.replace(r'[^\w\d\s]', ' ')

    # Replace whitespace between terms with a single space
    X = X.str.replace(r'\s+', ' ')

    # Remove leading and trailing whitespace
    X = X.str.replace(r'^\s+|\s+?$', '')

    # To lowercase
    X = X.str.lower()

    

    # Stemming words (removing ing, ed ...)
    # ps = nltk.PorterStemmer()
    # X = X.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

    return X



# Call the Preprocess function
X_preprocessed = preprocess(X)

In [14]:
# let's split the data
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.25)

In [15]:
# now let's create a pipeline for removing stopwords and create bag of words
# applying multinomialNB as it gives us better results

model = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('nbmodel', MultinomialNB())])

In [16]:
# fit the model
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nbmodel',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [17]:
prediction = model.predict(X_test)

In [18]:
print(classification_report(y_test, prediction))

precision    recall  f1-score   support

           0       0.78      0.99      0.87      2579
           1       0.98      0.72      0.83      2612

    accuracy                           0.85      5191
   macro avg       0.88      0.85      0.85      5191
weighted avg       0.88      0.85      0.85      5191



In [19]:
print(confusion_matrix(y_test, prediction))

[[2550   29]
 [ 731 1881]]


In [21]:
# We see that the Fasle Positive (36) and False Negative (664) are a bit high
# Some other approaches may help

In [20]:
# make the pickle file
with open('../saved_model/model1.pickle', 'wb') as target:
    pickle.dump(model, target, protocol=pickle.HIGHEST_PROTOCOL)