In [1]:
# Necessary Libraries
import pandas as pd
import numpy as np
import pickle
import nltk
import os

In [2]:
# Make sure the NLTK 'punkt' tokenizer data can be found locally;
# if the lookup fails, download it.
try:
    tokenizer_path = nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading tokenizer")
    nltk.download('punkt')
else:
    print("Found punkt tokenizer at {}".format(tokenizer_path))

Found punkt tokenizer at C:\Users\User1\AppData\Roaming\nltk_data\tokenizers\punkt\PY3


In [3]:
# Location of the test CSV, joined onto the current working directory ("./")
TEST_RAW_PATH = r"test.csv"
DATA_PATH = os.path.join("./", TEST_RAW_PATH)

# Parse the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [4]:
# Preview the first ten rows of the loaded frame
df.head(n=10)

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
5,20805,Trump is USA's antique hero. Clinton will be n...,,Trump is USA's antique hero. Clinton will be n...
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,"Sunday on NBC’s “Meet the Press,” House Minori..."
7,20807,Weekly Featured Profile – Randy Shannon,Trevor Loudon,You are here: Home / *Articles of the Bound* /...
8,20808,Urban Population Booms Will Make Climate Chang...,,Urban Population Booms Will Make Climate Chang...
9,20809,,cognitive dissident,don't we have the receipt?


In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(5200, 4)

In [6]:
# Per-column dtype and non-null count summary, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
id        5200 non-null int64
title     5078 non-null object
author    4697 non-null object
text      5193 non-null object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [7]:
# Count the missing (NaN) entries in every column
df.isna().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [8]:
# Drop the rows whose 'text' is missing; missing 'title'/'author' values are kept.
# Rebinding the name (instead of inplace=True) makes this cell safe to re-run and
# avoids mutating the frame that earlier cells already displayed.
df = df.dropna(subset=["text"])

In [9]:
# Model input: the 'text' column only
X = df.loc[:, 'text']

In [11]:
def preprocess(data_X):
    """Normalise a Series of raw text into lowercase, punctuation-free tokens.

    The substitutions below are applied in order, then the result is lowercased.

    Parameters
    ----------
    data_X : pandas.Series
        Series of strings to clean (accessed through the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the cleaned strings.

    Notes
    -----
    The email, URL and phone-number patterns are anchored with ``^`` and ``$``,
    so they only fire when the whole entry matches, not when such a token is
    embedded in a longer text. They are kept exactly as written.
    NOTE(review): presumably the saved model was trained on features produced
    by these same patterns -- confirm before changing any of them.
    """
    # Ordered (pattern, replacement) pairs. Order matters: e.g. numbers are
    # replaced before punctuation is stripped so that decimals such as "5.50"
    # collapse into a single 'numbr' token.
    substitutions = [
        # Replace email addresses with 'emailaddr'
        (r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr'),
        # Replace URLs with 'webaddr'
        (r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddr'),
        # Replace currency symbols with 'currsymb'
        (r'£|\$|₹', 'currsymb'),
        # Replace 10 digit phone numbers (formats include parentheses, spaces,
        # no spaces, dashes) with 'phonenbr'
        (r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenbr'),
        # Replace numbers with 'numbr'
        (r'\d+(\.\d+)?', 'numbr'),
        # Remove punctuation
        (r'[^\w\d\s]', ' '),
        # Replace whitespace between terms with a single space
        (r'\s+', ' '),
        # Remove leading and trailing whitespace
        (r'^\s+|\s+?$', ''),
    ]

    X = data_X
    for pattern, replacement in substitutions:
        # regex=True must be explicit: pandas 2.0 changed the default of
        # Series.str.replace to regex=False, which would treat every pattern
        # above as a literal string and silently skip the cleaning.
        X = X.str.replace(pattern, replacement, regex=True)

    # To lowercase
    X = X.str.lower()

    # Stemming (nltk.PorterStemmer) was left disabled in the original notebook.

    return X



# Clean the text column with the preprocessing routine
X_preprocessed = preprocess(data_X=X)

In [12]:
# Loading our model
MODEL_FILE_PATH = r"../saved_model/model1.pickle"
# SECURITY: unpickling can execute arbitrary code, so only load model files
# from a trusted source.
# The context manager closes the file handle once the model has been read.
with open(MODEL_FILE_PATH, 'rb') as model_file:
    model = pickle.load(model_file)

In [13]:
# Run the loaded model on the cleaned text
# NOTE(review): presumably the pickled object accepts raw strings (e.g. a vectorizer + classifier pipeline) -- confirm
predictions = model.predict(X_preprocessed)

In [14]:
# Peek at the first ten predictions
predictions[:10]

array([0, 1, 1, 0, 1, 1, 0, 0, 0, 0], dtype=int64)