In [1]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Label encoder instance; nothing in the cells shown here calls it.
le = LabelEncoder()
# Labelled text snippets; the relative path is resolved against the working directory.
df = pd.read_csv('df.csv')

In [3]:
df

Unnamed: 0,value,status
0,"grants you a royalty-free, non-exclusive, non-...",No
1,"BitTorrent, Inc. disclaims any responsibility ...",No
2,Mobile Spy Apps or Spyware Apps are smartphone...,No
3,We collect offline or on any other Company app...,No
4,Kik and our affiliated companies including Kin...,No
...,...,...
114,e. Reseller Administrator Access and Customer ...,Yes
115,"To the extent permitted by applicable law, the...",Yes
116,It is your responsibility to protect your onli...,Yes
117,Firefox Monitor is an informational service to...,Yes


In [4]:
# Convert the whole frame to a 2-D NumPy array (one row per record, one column per DataFrame column).
data = df.to_numpy()

In [6]:
# Select features and labels by column name instead of by position: the frame
# shown above carries a leading 'Unnamed: 0' column, so positional slicing of
# `data` would pick up the row ids rather than the text on a fresh run.
X = df['value'].to_numpy()   # raw text of each snippet
y = df['status'].to_numpy()  # 'Yes' / 'No' label of each snippet

In [12]:
X.shape , y.shape

((119,), (119,))

In [9]:
# Raw string literal: '\w' inside a plain string is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning on recent Python versions).
tokenizer = RegexpTokenizer(r'\w+')
# English stop words as a set so the membership test in getStem is O(1).
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [None]:
import re

In [17]:
X[0]

'grants you a royalty-free, non-exclusive, non-transferable license to use the Software, solely for non-commercial purposes. BitTorrent'

In [13]:
# Inclusive code-point ranges of the Unicode blocks whose tokens are discarded.
_ARABIC_RANGES = (
    ('\u0600', '\u06FF'),          # Arabic
    ('\u0750', '\u077F'),          # Arabic Supplement
    ('\u08A0', '\u08FF'),          # Arabic Extended-A
    ('\uFB50', '\uFDFF'),          # Arabic Presentation Forms-A
    ('\uFE70', '\uFEFF'),          # Arabic Presentation Forms-B
    ('\U00010E60', '\U00010E7F'),  # Rumi Numeral Symbols
)


def _has_arabic(token):
    """Return True if any single character of ``token`` lies in ``_ARABIC_RANGES``."""
    return any(lo <= ch <= hi for ch in token for lo, hi in _ARABIC_RANGES)


def getStem(review):
    """Normalise one document into a space-joined string of stemmed tokens.

    Steps: lowercase, split with the module-level ``tokenizer``, drop tokens
    found in the stop-word set ``sw``, drop tokens containing Arabic-script
    characters, stem the rest with ``ps``.

    Parameters
    ----------
    review : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ('' if none survive).
    """
    tokens = tokenizer.tokenize(review.lower())
    # The script test is done per character. Comparing the whole token against
    # one-character bounds is a lexicographic comparison, which lets mixed-script
    # tokens (and tokens starting with a range's upper bound) slip through.
    kept = [tok for tok in tokens if tok not in sw and not _has_arabic(tok)]
    return ' '.join(ps.stem(tok) for tok in kept)


In [14]:
# Clean a whole collection of documents
def getDoc(document):
    """Return a list holding the ``getStem`` result for each item of ``document``, in order."""
    return [getStem(text) for text in document]

In [15]:
# Clean every raw text. X must still hold the raw strings here: a later cell
# rebinds X to the count matrix, so re-running this cell after that one fails.
stemmed_doc=getDoc(X)

In [28]:
stemmed_doc[118]

'order use servic need creat firefox account registr ask set password respons keep password confidenti activ happen firefox account mozilla respons loss aris unauthor use firefox account mozilla mozilla mozilla'

In [18]:
X[0]

'grants you a royalty-free, non-exclusive, non-transferable license to use the Software, solely for non-commercial purposes. BitTorrent'

'laugher'

In [26]:
# Token-count vectoriser with default settings; it is fitted on the cleaned
# corpus below and reused by prepare() for unseen text.
cv = CountVectorizer()

In [29]:
# Learn the vocabulary from all cleaned documents and build their count matrix.
# NOTE(review): this runs before train_test_split, so the vocabulary also
# covers the rows that later end up in the test split.
vc = cv.fit_transform(stemmed_doc)

In [41]:
# toarray() yields a plain ndarray. todense() returns an np.matrix, which
# current scikit-learn estimators refuse with a TypeError.
X = vc.toarray()

In [64]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible. (The pasted doctest '...' continuation prompt was removed: it
# only worked because IPython strips it and is a syntax error as plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.1, random_state=42)

In [65]:
from sklearn.naive_bayes import MultinomialNB

In [66]:
# Train a multinomial Naive Bayes classifier on the training split (fit
# returns the estimator itself), then show its score on the held-out split.
model = MultinomialNB().fit(X_train, y_train)
model.score(X_test, y_test)

0.9166666666666666

In [67]:
y_test[:5]

array(['No', 'Yes', 'No', 'No', 'No'], dtype=object)

In [68]:
y[:5]


array(['No', 'No', 'No', 'No', 'No'], dtype=object)

In [74]:
messages = [
    """
    Kik and our affiliated companies including Kin Ecosystem (“Company” or “We” or “Kik”) respect your privacy and are committed to protecting it through our compliance with this policy. This policy describes:

The types of information we may collect or that you may provide when you download, install, register with, access, use the Kik Messenger (the “App”), access our website at www.kik.com (the “Website”), or use our services, including the integration of the Kin Marketplace (i.e., the marketplace made available by the Kin Ecosystem for discovering opportunities to earn and spend Kin currency).
Our practices for collecting, using, maintaining, protecting, and disclosing that information.
This policy applies only to information we collect in this App and in email, text, and other electronic communications sent through or in connection with this App or the Website.

This policy DOES NOT apply to information that:

We collect offline or on any other Company apps or websites, including websites you may access through this App except the Website at www.kik.com.
You provide to or is collected by any third party (see Third-Party Information Collection).
Our other websites and apps, and these other third parties may have their own privacy policies, which we encourage you to read before providing information on or through them.

Please read this policy carefully to understand our policies and practices regarding your information and how we will treat it. If you do not agree with our policies and practices, do not download, register with, or use this App. By downloading, registering with, or using this App, or using or accessing the Website, you agree to this privacy policy. This policy may change from time to time (see Changes to Our Privacy Policy). Your continued use of this App or Website after we revise this policy means you accept those changes, so please check the policy

    """

]

In [75]:
def prepare(messages):
    """Clean and vectorise unseen text with the already-fitted ``cv``.

    Parameters
    ----------
    messages : iterable of str, or str
        Raw documents. A single string is treated as one document.

    Returns
    -------
    The result of ``cv.transform`` on the cleaned documents (one row per document).
    """
    if isinstance(messages, str):
        # A bare string would be iterated character by character and silently
        # produce one row per character; wrap it as a single document instead.
        messages = [messages]
    cleaned = getDoc(messages)
    # Use transform, not fit_transform: refitting would build a new vocabulary
    # that no longer matches the columns the model was trained on.
    return cv.transform(cleaned)

# `messages` is rebound from the raw list of strings to the vectorised matrix.
# Only vectorise while it is still the raw list, so re-running this cell does
# not feed the matrix back into getDoc (which would fail on `.lower()`).
if isinstance(messages, list):
    messages = prepare(messages)

In [76]:
# Predict a label for each row; `messages` holds the output of prepare() at this point.
y_pred = model.predict(messages)

In [77]:
y_pred

array(['No'], dtype='<U3')