In [632]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [633]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
le = LabelEncoder()

In [634]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [635]:
data = df.to_numpy()

In [636]:
data

array([['ham',
        'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
        nan, nan, nan],
       ['ham', 'Ok lar... Joking wif u oni...', nan, nan, nan],
       ['spam',
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        nan, nan, nan],
       ...,
       ['ham',
        'Pity, * was in mood for that. So...any other suggestions?', nan,
        nan, nan],
       ['ham',
        "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
        nan, nan, nan],
       ['ham', 'Rofl. Its true to its name', nan, nan, nan]], dtype=object)

In [637]:
X = data[:, 1]
y = data[:, 0]

In [638]:
X.shape, y.shape

((5572,), (5572,))

In [639]:
X , y

(array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
        'Ok lar... Joking wif u oni...',
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        ..., 'Pity, * was in mood for that. So...any other suggestions?',
        "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
        'Rofl. Its true to its name'], dtype=object),
 array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object))

In [640]:
tokenizer = RegexpTokenizer('\w+')
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [641]:
# get a clean document
def getDoc(document):
    d = []
    for doc in document:
        d.append(getStem(doc))
    return d

In [642]:
stemmed_doc = getDoc(X)

In [643]:
stemmed_doc[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send å 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea å 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

In [644]:
cv = CountVectorizer()

In [645]:
# create my vocab
vc = cv.fit_transform(stemmed_doc)

In [646]:
X = vc.todense()

In [647]:
X

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [648]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [649]:
# NB from sklearn

In [650]:
from sklearn.naive_bayes import MultinomialNB

In [651]:
model = MultinomialNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.977705274605764

In [652]:
messages = [
    """
    Hi Kunal,
We invite you to participate in MishMash - India’s largest online diversity hackathon. 
The hackathon is a Skillenza initiative and sponsored by Microsoft, Unity, Unilever, Gojek, Rocketium and Jharkhand Government. 
We have a special theme for you - Deep Tech/Machine Learning - sponsored by Unilever, which will be perfect for you.
    """,
    """Join us today at 12:00 PM ET / 16:00 UTC for a Red Hat DevNation tech talk on AWS Lambda and serverless Java with Bill Burke.
Have you ever tried Java on AWS Lambda but found that the cold-start latency and memory usage were far too high? 
In this session, we will show how we optimized Java for serverless applications by leveraging GraalVM with Quarkus to 
provide both supersonic startup speed and a subatomic memory footprint.""",

    """We really appreciate your interest and wanted to let you know that we have received your application.
There is strong competition for jobs at Intel, and we receive many applications. As a result, it may take some time to get back to you.
Whether or not this position ends up being a fit, we will keep your information per data retention policies, 
so we can contact you for other positions that align to your experience and skill set.
"""
]

In [596]:
def prepare(messages):
    d = getDoc(messages)
    # dont do fit_transform!! it will create new vocab.
    return cv.transform(d)

messages = prepare(messages)

In [597]:
y_pred = model.predict(messages)
y_pred

array(['ham', 'spam', 'ham'], dtype='<U4')

In [598]:
ps.stem('laughing')

'laugh'

In [599]:
import pickle


In [600]:
with open('model_picke','wb') as f:
    pickle.dump(model,f)

In [601]:
with open('model_picke','rb') as f:
    model1 = pickle.load(f)

In [602]:
pq=model1.predict(messages)

In [603]:
pq


array(['ham', 'spam', 'ham'], dtype='<U4')

In [604]:
# def getStem(review):
#     review = review.lower()
#     tokens = tokenizer.tokenize(review) # breaking into small words
#     removed_stopwords = [w for w in tokens if w not in sw]
#     stemmed_words = [ps.stem(token) for token in removed_stopwords]
#     clean_review = ' '.join(stemmed_words)
#     return clean_review


In [605]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [606]:
df = pd.read_csv('test.csv')
le = LabelEncoder()

In [607]:
df

Unnamed: 0,id,"""tweet""",class
0,717464912415600640,"""ياثارات الحسين""\n.\nالى متى تكف المجزرة الظال...",1
1,794412494077001729,mujahids dont die but in numbers we multiply i...,1
2,563458330019258369,"RT @AbuhAlam: @InviteToIslam well, it was ""whe...",1
3,665946450124103680,#دولة_الخلافة \n#ولاية_الفرات\n #عاجل\nإعطاب ه...,1
4,734350218314063872,RT @K_Dergi3: Suhne Fatihi\nŞehid Şeyh Ebu Mal...,1
...,...,...,...
95,711146574399852544,@CDIMI landen of mensen moeten geen kopie word...,0
96,565480087584518145,RT @DailyNewsEgypt: Canadian PM calls for Moha...,0
97,561897149487538176,@Anaminona thanks.,0
98,663842492094259200,When I told them they are denigrating themselv...,0


In [608]:
data = df.to_numpy()

In [609]:
Y = data[:, 2]
X = data[:, 1] 

In [610]:
y = []
for i in Y:
    if i == 1:
        y.append("Yes")
    else:
        y.append("No")

In [611]:
import re

In [612]:
def getStem(review):
    review = review.lower()
    tokens = tokenizer.tokenize(review) # breaking into small words
    removed_stopwords = [w for w in tokens if w not in sw]   
    
    correct = []
    for ch in removed_stopwords:
        if ('\u0600' <= ch <= '\u06FF' or
    '\u0750' <= ch <= '\u077F' or
    '\u08A0' <= ch <= '\u08FF' or
  '\uFB50' <= ch <= '\uFDFF' or
    '\uFE70' <= ch <= '\uFEFF' or
    '\U00010E60' <= ch <= '\U00010E7F'): 
            pass
        else:
            correct.append(ch)
    stemmed_words = [ps.stem(token) for token in correct]
    clean_review = ' '.join(stemmed_words)
    return clean_review


In [613]:
# get a clean document
def getDoc(document):
    d = []
    for doc in document:
        d.append(getStem(doc))
    return d

In [614]:
stemmed_doc = getDoc(X)

In [615]:
stemmed_doc

['http co teteqc258u',
 'mujahid dont die number multipli insha allah oh allah enrich us ikhla taqwa ridaka rahmatika',
 'rt abuhalam invitetoislam well coconut tri blend kuffar',
 'http co vy740vv27r http co b5zgem1exa',
 'rt k_dergi3 suhn fatihi şehid şeyh ebu malik et temimi allah kabul etsin suhn ameliyesi 6 http co t0p8ewyggo',
 'rt azamhikma http co f8vxjpl83j',
 'iloveafrica_ waleedkona http co xh3qlowpb2',
 'shall come upon peopl time wherein shall believ go ash shaam 1',
 'spanishkash1 iqy007 http co laaysykm5',
 'rt bsmtlolo1 http co wsfirbmwi',
 'aqap yemen lenient rule neglect fall short limit set allah w',
 'rt mecomuhammad made logo invitetoislam sure follow account great islam remind http co opqda24wxz',
 'alhamdulilaah 3ala kulli 7aal sujudshukr allah ya ikhwa bless day rememb brother dua qiyam layl',
 'rt qatada_92',
 'asd1410 dzdzdz32111 hino38 kasimf',
 'rt freefalasteen firebomb pig head thrown mosqu anti muslim attack increas pari shoot http co 9txzz3kwji',
 'democ

In [616]:
cv = CountVectorizer()

In [617]:
vc = cv.fit_transform(stemmed_doc)

In [618]:
X = vc.toarray()

In [619]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [620]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [621]:
from sklearn.naive_bayes import MultinomialNB

In [622]:
model = MultinomialNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7272727272727273

In [623]:
from sklearn.linear_model import LogisticRegression

In [624]:
model1 = LogisticRegression()

In [625]:
model1.fit(X_train, y_train)
model1.score(X_test, y_test)

0.6666666666666666

In [629]:
messages = [
    """
    Hi Kunal,
We invite you to participate in MishMash - India’s largest online diversity hackathon. 
The hackathon is a Skillenza initiative and sponsored by Microsoft, Unity, Unilever, Gojek, Rocketium and Jharkhand Government. 
We have a special theme for you - Deep Tech/Machine Learning - sponsored by Unilever, which will be perfect for you.
    """
]

In [630]:
def prepare(messages):
    d = getDoc(messages)
    # dont do fit_transform!! it will create new vocab.
    return cv.transform(d)

messages = prepare(messages)

['hi kunal invit particip mishmash india largest onlin divers hackathon hackathon skillenza initi sponsor microsoft uniti unilev gojek rocketium jharkhand govern special theme deep tech machin learn sponsor unilev perfect']


In [631]:
messages

<1x724 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>