## **BOW,TFIDF,Machine Learning Algorithms**
* Preprocessing And Cleaning
* Train Test Split
* BOW And TF-IDF (Sentences--->vectors) {Preventing Data Leakage}
* Trained Our Models

In [None]:
import pandas as pd

# Path to the downloaded file
file_path = '/content/drive/MyDrive/Data Science - GT/Natural Language Processing/SMSSpamCollection.txt'

# Load the file with options to handle inconsistent data
df = pd.read_csv(
    file_path,
    sep='\t',
    names= ["label", "messages"])

# Show the first few rows of the DataFrame
print(df.head())

  label                                           messages
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
df

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
## Data Cleaning And Preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()


In [None]:
corpus=[]
for i in range(0,len(df)):
    review=re.sub('[^a-zA-z]',' ',df['messages'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [None]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

## **Create Bag Of Words**

In [None]:
## Output Features
y=pd.get_dummies(df['label'])
y=y.iloc[:,0].values

y

array([ True,  True, False, ...,  True,  True,  True])

In [None]:
X = corpus
X

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

In [None]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [None]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
## for Binary BOW enable binary=True
cv=CountVectorizer(max_features=2500,ngram_range=(1,2))

In [None]:
len(X_train) , len(y_train)

(4457, 4457)

In [None]:
# Assuming 'corpus' is your list of pre-processed text data
X = corpus  # Use the processed corpus as features

# Output Features
y = pd.get_dummies(df['label'])
y = y.iloc[:, 0].values

# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Create the Bag Of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer for the Bag of Words model
cv = CountVectorizer(max_features=2500, ngram_range=(1, 2), binary=True)  # Enable binary=True for binary BoW

# Fit and transform the training data into the Bag of Words representation
X_train_bow = cv.fit_transform(X_train).toarray()  # Use fit_transform here

# Transform the test data (without fitting again)
X_test_bow = cv.transform(X_test).toarray()  # Transform test data

# Now, X_train_bow and X_test_bow contain the binary Bag of Words representation


In [None]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000,
    formatter=dict(float=lambda x: "%.3g" % x))
X_train

['hello handsom find job lazi work toward get back net mummi boytoy miss',
 'wanna gym harri',
 'hope feel great pl fill abiola',
 'appl day doctor tulsi leaf day cancer lemon day fat cup milk day bone problm litr watr day diseas snd th u care',
 'well done costa del sol holiday await collect call toclaim sae tc pobox stockport sk xh cost pm max min',
 'u secret admir reveal think u r special call opt repli reveal stop per msg recd cust care',
 'onlin transact',
 'hello thanx take call got job start monday',
 'winner special select receiv cash award speak live oper claim call pm cost p',
 'u still plumber tape wrench could borrow',
 'expect whenev text hope goe well tomo',
 'free entri weekli comp send word win c www txttowin co uk',
 'update_now mth half price orang line rental min call mobileupd call optout j q',
 'tddnewslett emc co uk game thedailydraw dear helen dozen free game great prizeswith',
 'u wake alreadi thanx e tau sar piah quit nice',
 'ok',
 'well obvious peopl cool co

In [None]:
cv.vocabulary_

{'hello': 931,
 'find': 684,
 'job': 1065,
 'lazi': 1137,
 'work': 2432,
 'toward': 2180,
 'get': 773,
 'back': 129,
 'net': 1407,
 'boytoy': 195,
 'miss': 1323,
 'get back': 774,
 'wanna': 2333,
 'gym': 898,
 'hope': 965,
 'feel': 669,
 'great': 858,
 'pl': 1571,
 'fill': 680,
 'abiola': 1,
 'day': 485,
 'doctor': 542,
 'cancer': 272,
 'fat': 664,
 'cup': 451,
 'th': 2100,
 'care': 279,
 'well': 2381,
 'done': 547,
 'costa': 418,
 'del': 506,
 'sol': 1921,
 'holiday': 955,
 'await': 117,
 'collect': 361,
 'call': 222,
 'toclaim': 2151,
 'sae': 1774,
 'tc': 2070,
 'pobox': 1599,
 'stockport': 1983,
 'sk': 1894,
 'xh': 2464,
 'cost': 415,
 'pm': 1594,
 'max': 1278,
 'min': 1307,
 'well done': 2382,
 'costa del': 419,
 'del sol': 507,
 'sol holiday': 1922,
 'holiday await': 956,
 'await collect': 118,
 'collect call': 362,
 'call toclaim': 253,
 'toclaim sae': 2152,
 'sae tc': 1776,
 'tc pobox': 2071,
 'pobox stockport': 1600,
 'stockport sk': 1984,
 'sk xh': 1896,
 'xh cost': 2465,
 'co

In [None]:
from sklearn.naive_bayes import MultinomialNB ## works well with Sparse data

spam_detect_model=MultinomialNB().fit(X_train_bow,y_train)
y_pred=spam_detect_model.predict(X_test_bow)
from sklearn.metrics import accuracy_score,classification_report
accuracy_score(y_test,y_pred)

0.9865470852017937

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.97      0.94      0.95       166
        True       0.99      0.99      0.99       949

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



## **Creating The TF-IDF Model**

In [None]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,y,test_size=0.20)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X_train_tfidf = tv.fit_transform(X_train).toarray()
X_test_tfidf = tv.transform(X_test).toarray()

In [None]:
X_train_tfidf

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0.53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.435, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 

In [None]:
tv.vocabulary_

{'bed': 155,
 'like': 1149,
 'lt': 1213,
 'gt': 857,
 'hour': 967,
 'like lt': 1154,
 'lt gt': 1215,
 'case': 298,
 'wake': 2331,
 'wonder': 2430,
 'forgot': 711,
 'take': 2058,
 'care': 294,
 'someth': 1930,
 'today': 2153,
 'done': 552,
 'take care': 2059,
 'yeah': 2479,
 'abl': 3,
 'text': 2093,
 'readi': 1692,
 'meet': 1276,
 'give': 787,
 'plu': 1590,
 'said': 1783,
 'greet': 852,
 'whenev': 2396,
 'speak': 1950,
 'need': 1406,
 'lar': 1098,
 'go': 795,
 'co': 361,
 'si': 1877,
 'somewher': 1933,
 'moon': 1352,
 'light': 1146,
 'someon': 1928,
 'think': 2116,
 'dream': 566,
 'come': 383,
 'true': 2197,
 'amp': 59,
 'sweet': 2051,
 'light someon': 1147,
 'amp sweet': 62,
 'sweet dream': 2052,
 'lover': 1208,
 'night': 1434,
 'end': 598,
 'anoth': 67,
 'day': 487,
 'morn': 1354,
 'special': 1953,
 'way': 2360,
 'may': 1267,
 'smile': 1914,
 'sunni': 2032,
 'ray': 1683,
 'leav': 1124,
 'worri': 2439,
 'blue': 185,
 'bay': 145,
 'gud': 870,
 'mrng': 1368,
 'night end': 1435,
 'end ano

In [None]:
from sklearn.naive_bayes import MultinomialNB
spam_tfidf_model = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
#prediction
y_pred=spam_tfidf_model.predict(X_test_tfidf)

In [None]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9829596412556054


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.87      1.00      0.93       125
        True       1.00      0.98      0.99       990

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.99      0.98      0.98      1115

