In [3]:
import pandas as pd
pd.set_option('max_colwidth',200)

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier , LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
df = pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['label','text']) 
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [5]:
df.shape

(5572, 2)

In [6]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
df['label']=df['label'].map({'ham':1,'spam':0})

In [8]:
df.head(8)

Unnamed: 0,label,text
0,1,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives around here though"
5,0,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
6,1,Even my brother is not like to speak with me. They treat me like aids patent.
7,1,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune


In [9]:
#Text Preprocessing
#lowering text
df['text']=df['text'].str.lower()
#removing email address
df['text']=df['text'].replace('[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)
#remove IP address
df['text'] = df['text'].replace('((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex = True)

#remove punctaitions and special chracters
df['text'] = df['text'].str.replace('[^\w\s]','' , regex = True)

#remove numbers
df['text'] = df['text'].replace('\d', '', regex=True)

In [10]:
df.head(7)

Unnamed: 0,label,text
0,1,go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
1,1,ok lar joking wif u oni
2,0,free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry questionstd txt ratetcs apply overs
3,1,u dun say so early hor u c already then say
4,1,nah i dont think he goes to usf he lives around here though
5,0,freemsg hey there darling its been weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send to rcv
6,1,even my brother is not like to speak with me they treat me like aids patent


In [11]:
#remove stop words
for index, row in df.iterrows():
    word_tokens = word_tokenize(row['text'])
    filtered_sentence = [w for w in word_tokens if not w in stopwords.words('english')]
    df.at[index , 'text'] = " ".join(filtered_sentence[0:])

In [12]:
df.head(5)

Unnamed: 0,label,text
0,1,go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,1,ok lar joking wif u oni
2,0,free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply overs
3,1,u dun say early hor u c already say
4,1,nah dont think goes usf lives around though


In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['text'] , df['label'] , test_size = 0.10, random_state=0)

In [14]:
len(X_train)

5014

In [15]:
vectorizer = TfidfVectorizer(analyzer = 'word' , ngram_range = (1,2),stop_words = {'english'}) 

X_train_tfidf = vectorizer.fit_transform(X_train)

X_test_tfidf = vectorizer.transform(X_test)

In [22]:
X_train_tfidf.shape , y_train.shape

((5014, 36245), (5014,))

In [16]:
classifier = SGDClassifier(alpha=1e-05 , max_iter=50 , penalty = 'elasticnet')
logre = LogisticRegression(solver='lbfgs')
nb = MultinomialNB()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()

In [17]:
classifier.fit(X_train_tfidf, y_train)

logre.fit(X_train_tfidf, y_train)

nb.fit(X_train_tfidf, y_train)

rf.fit(X_train_tfidf, y_train)

knn.fit(X_train_tfidf, y_train)

KNeighborsClassifier()

In [18]:
predictions = classifier.predict(X_test_tfidf)

log_pred = logre.predict(X_test_tfidf)

nb_pred = nb.predict(X_test_tfidf)

rf_pred = rf.predict(X_test_tfidf)

knn_pred = knn.predict(X_test_tfidf)

In [19]:

print( classifier, '\n' , classification_report(y_test, predictions) , '\n\n')

print( logre, '\n' , classification_report(y_test, log_pred) , '\n\n')

print( nb, '\n' , classification_report(y_test, nb_pred) , '\n\n')

print( rf, '\n' , classification_report(y_test, rf_pred) , '\n\n')

print( knn, '\n' , classification_report(y_test, knn_pred) , '\n\n')

SGDClassifier(alpha=1e-05, max_iter=50, penalty='elasticnet') 
               precision    recall  f1-score   support

           0       0.96      0.91      0.93        77
           1       0.99      0.99      0.99       481

    accuracy                           0.98       558
   macro avg       0.97      0.95      0.96       558
weighted avg       0.98      0.98      0.98       558
 


LogisticRegression() 
               precision    recall  f1-score   support

           0       0.98      0.65      0.78        77
           1       0.95      1.00      0.97       481

    accuracy                           0.95       558
   macro avg       0.96      0.82      0.88       558
weighted avg       0.95      0.95      0.95       558
 


MultinomialNB() 
               precision    recall  f1-score   support

           0       1.00      0.71      0.83        77
           1       0.96      1.00      0.98       481

    accuracy                           0.96       558
   macro avg     

In [45]:
nb.predict(X)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 36245 is different from 17)

In [47]:
X.shape,X_test_tfidf.shape

((1, 17), (558, 36245))

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X=cv.fit_transform(text)

In [36]:
cv = CountVectorizer()

X=cv.fit(text)
X.vocabulary_
X.get_feature_names()

['9600',
 'arrested',
 'bail',
 'grandson',
 'immediately',
 'in',
 'last',
 'mexico',
 'money',
 'need',
 'night',
 'union',
 'urgent',
 'was',
 'western',
 'wire',
 'your']

In [38]:
X = cv.fit_transform(text).toarray()
X

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [1]:
import pickle

In [23]:
file=open('MB_Model.pkl','wb')
pickle.dump(rf,file)
file.close()

In [25]:
file=open('MB_Model.pkl','rb')
rf_1=pickle.load(file)
rf_1

RandomForestClassifier()

In [42]:
text = ["URGENT your grandson was arrested last night in Mexico.Need Bail Money immediately Western Union Wire $9600"]

In [43]:
test = vectorizer.fit_transform(text)

In [39]:
rf_pred = rf_1.predict(X)
rf_pred

ValueError: X has 17 features, but DecisionTreeClassifier is expecting 36245 features as input.