In [12]:
import pickle
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk import word_tokenize
import string
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import pandas as pd

In [24]:
# Restore the pickled object stored in the local file 'regModel' and bind it
# to `model`; a later cell calls `model.predict(...)` on it.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load artifacts that come from a trusted source.
with open('regModel','rb') as file:
    model = pickle.load(file)

In [3]:
def remove_emoji(text):
    """Delete every run of characters that falls inside the listed code-point ranges.

    Args:
        text: input string.

    Returns:
        ``text`` with all matching characters removed (nothing is inserted
        in their place).

    Note:
        The final range runs from U+24C2 all the way to U+1F251, so it also
        removes many non-emoji characters in between (for example CJK
        ideographs), not only pictographs.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    # Assemble one character class "[...]+" out of the ranges above.
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub('', text)

In [4]:
def remove_url(text):
    """Strip web links from ``text``.

    A link is anything starting with ``http://`` or ``https://``, or with
    ``www.``, and running up to the next whitespace character.

    Args:
        text: input string.

    Returns:
        ``text`` with every such link deleted; surrounding whitespace is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [5]:
def remove_html(w):
    """Return the text content of ``w`` with HTML markup stripped.

    The earlier version parsed ``w`` but then returned the untouched input,
    so no markup was ever removed; the parsed text is now what gets returned.

    Args:
        w: string that may contain HTML tags or entities.

    Returns:
        The text BeautifulSoup extracts from ``w``.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed, and so bs4 does not warn
    # about an unspecified parser.
    return BeautifulSoup(w, "html.parser").get_text()

In [6]:
def cleanData(data):
    """Turn the ``text`` column of ``data`` into lists of cleaned tokens.

    Steps, in order: strip URLs, strip emoji, tokenize, lowercase, pass each
    token through ``remove_html``, delete punctuation characters, keep only
    purely alphabetic tokens, drop English stopwords, and drop tokens of
    two characters or fewer. One progress line is printed after each step.

    Args:
        data: pandas DataFrame with a string column named ``'text'``.
            The column is overwritten in place.

    Returns:
        The same DataFrame object that was passed in; each ``'text'`` entry
        is now a list of tokens.
    """
    # remove urls
    data['text'] = data['text'].apply(remove_url)
    print('urls removed')

    # remove emojis
    data['text'] = data['text'].apply(remove_emoji)
    print('emojis removed')

    # tokenizing words
    data['text'] = data['text'].apply(word_tokenize)
    print('tokenization done')

    # convert all text to lowercase
    data['text'] = data['text'].apply(lambda tokens: [w.lower() for w in tokens])
    print('lowercase done')

    # remove html tags
    # NOTE(review): this runs on individual tokens after tokenization, so a
    # tag the tokenizer has already split apart is presumably not seen as
    # markup here -- consider stripping HTML before tokenizing.
    data['text'] = data['text'].apply(lambda tokens: [remove_html(w) for w in tokens])
    print('html tags removed')

    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))

    # removing punctuation
    data['text'] = data['text'].apply(lambda tokens: [re_punc.sub('', w) for w in tokens])
    print('punctuations removed')

    # removing non alphabetic words
    data['text'] = data['text'].apply(lambda tokens: [w for w in tokens if w.isalpha()])
    print('numeric removed')

    # removing stopwords
    # Build the stopword collection once, as a set: the previous code called
    # stopwords.words('english') again for every single token and searched
    # the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    data['text'] = data['text'].apply(
        lambda tokens: [w for w in tokens if w not in english_stopwords]
    )
    print('stopwords removed')

    # removing short words
    data['text'] = data['text'].apply(lambda tokens: [w for w in tokens if len(w) > 2])
    print('shortwords removed')

    return data

In [27]:
# One-row frame holding a single sample sentence in the 'text' column,
# pushed through the same cleaning pipeline defined above.
dataTrial = pd.DataFrame({'text': ['I am feeling lucky today']})
dataTrial = cleanData(dataTrial)

urls removed
emojis removed
tokenization done
lowercase done
html tags removed
punctuations removed
numeric removed
stopwords removed
shortwords removed


In [28]:
# Lemmatizer instance; the next cell calls lem.lemmatize on every token.
lem = WordNetLemmatizer()

In [29]:
# Lemmatize each token and glue the row's tokens back into one
# space-separated string, in a single pass over the column.
# Caution: this overwrites the column, so re-running the cell would iterate
# over the characters of the already-joined strings instead of tokens.
dataTrial['text'] = dataTrial['text'].apply(
    lambda tokens: ' '.join(lem.lemmatize(token) for token in tokens)
)

In [30]:
# Display the frame after cleaning, lemmatizing and re-joining the tokens.
dataTrial

Unnamed: 0,text
0,feeling lucky today


In [31]:
# Restore the pickled object stored in the local file 'tfidfFit' and bind it
# to `tfidf`; the next cell calls `tfidf.transform(...)` on it.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load artifacts that come from a trusted source.
with open('tfidfFit','rb') as file:
    tfidf = pickle.load(file)

In [32]:
# Vectorize the cleaned text column with the unpickled `tfidf` object.
Xtest = tfidf.transform(dataTrial['text'])

In [33]:
# Replace Xtest with the result of its .toarray() method
# (presumably sparse -> dense; confirm the type returned by tfidf.transform).
Xtest = Xtest.toarray()

In [34]:
# Sanity check: show the (rows, columns) shape of the feature matrix.
Xtest.shape

(1, 14403)

In [35]:
# Run the unpickled model on the feature matrix built above.
y_pred = model.predict(Xtest)

In [36]:
# Display the prediction returned by the model for the sample row.
y_pred

array([0], dtype=int64)