In [9]:
import pandas as pd

In [10]:
# Load the labelled messages; later cells read the 'label' and 'message' columns.
df = pd.read_csv("message.csv")

In [11]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [12]:
# Preview the first rows to see what the labels and raw text look like.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Preprocess

In [17]:
# Pull one raw message (row label 2) to see what the text looks like before cleaning.
a = df.loc[2, 'message']
a

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [16]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

ps = PorterStemmer()

# Build the stop-word set once. Rebuilding it inside the comprehension (as
# before) re-read the NLTK word list for every token of every message.
stop_words = set(stopwords.words('english'))

# Clean each message: keep letters only, lowercase, drop English stop words,
# stem the remaining tokens, and re-join them into one space-separated string.
# NOTE(review): the later cells visible in this notebook vectorize
# df['message'] directly and never read `corpus` - confirm whether the models
# were meant to be trained on this cleaned text instead.
corpus = []
for raw_message in df['message']:
    # Anything that is not an ASCII letter (digits, punctuation) becomes a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_message)
    tokens = letters_only.lower().split()
    # Stop words are filtered before stemming, so the check runs on whole words.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stems))

In [5]:
# Count missing values per column before building features.
df.isnull().sum()

label      0
message    0
dtype: int64

In [6]:
# Features are the raw message text; the target is the ham/spam label.
x, y = df['message'], df['label']

In [7]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# partition repeatable across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=0,
)

In [8]:
# Confirm the sizes of the train/test partitions for features and labels.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((3900,), (1672,), (3900,), (1672,))

## Feature Extraction

### Bag Of Words

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words: learn the vocabulary from the training messages only...
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
# ...then reuse that fitted vocabulary to count terms in the held-out messages.
X_test_counts = count_vect.transform(X_test)

In [21]:
# Both matrices must have the same number of columns (one per vocabulary term).
X_train_counts.shape, X_test_counts.shape

((3900, 7314), (1672, 7314))

### TF-IDF

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts with TF-IDF; the transformer is fitted on the
# training counts and then applied unchanged to the test counts.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_counts)
X_test_tfidf = tfidf.transform(X_test_counts)

In [23]:
# Shapes of the TF-IDF matrices for the train and test partitions.
X_train_tfidf.shape,X_test_tfidf.shape

((3900, 7314), (1672, 7314))

## Tf-Idf vectorizer
- It combines the bag-of-words (CountVectorizer) and TfidfTransformer steps into a single estimator.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer performs tokenizing, counting and TF-IDF weighting in one
# estimator, fitted here on the training messages only.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
# Transform the held-out messages with the same fitted vectorizer so this
# representation has a matching test matrix, as the CountVectorizer and
# TfidfTransformer cells do. Without it X_train_vect cannot be evaluated.
X_test_vect = vectorizer.transform(X_test)

In [14]:
# Shape of the training matrix produced by the one-step TfidfVectorizer.
X_train_vect.shape

(3900, 7314)

### Here I can use either X_train_tfidf or X_train_vect

### Model Build

## Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
# Logistic regression fitted on the TF-IDF training features; fit() is the
# cell's last expression, so the fitted estimator is displayed as the output.
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train_tfidf,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out TF-IDF features with the logistic-regression
# model and report the fraction classified correctly.
y_pred = clf.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9706937799043063

In [40]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1451
        spam       0.99      0.79      0.88       221

    accuracy                           0.97      1672
   macro avg       0.98      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672



## SVM

In [43]:
from sklearn.svm import SVC
# Support-vector classifier with library-default settings, fitted on the TF-IDF
# training features; the fitted estimator is displayed as the cell output.
svc = SVC()
svc.fit(X_train_tfidf,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [44]:
# Score the SVC on the held-out TF-IDF features.
y_pred = svc.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9832535885167464

In [45]:
# Per-class precision / recall / F1 for the SVC predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1451
        spam       1.00      0.87      0.93       221

    accuracy                           0.98      1672
   macro avg       0.99      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



## KNN

In [46]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (k = 5) fitted on the TF-IDF training features.
# fit() returns the estimator itself, so re-binding keeps `knn` the fitted model.
knn = KNeighborsClassifier(n_neighbors=5)
knn = knn.fit(X_train_tfidf, y_train)

In [47]:
# Score the KNN model on the held-out TF-IDF features.
y_pred = knn.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9641148325358851

In [48]:
# Per-class precision / recall / F1 for the KNN predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1451
        spam       1.00      0.73      0.84       221

    accuracy                           0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.97      0.96      0.96      1672



## MultinomialNB

In [49]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
# fit() is the cell's last expression, so the fitted estimator is displayed.

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_tfidf,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [50]:
# Score the Naive Bayes model on the held-out TF-IDF features.
y_pred = mnb.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9611244019138756

In [51]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1451
        spam       1.00      0.71      0.83       221

    accuracy                           0.96      1672
   macro avg       0.98      0.85      0.90      1672
weighted avg       0.96      0.96      0.96      1672



In [59]:
# Classify a single message typed by the user with the fitted SVC.
# The text goes through the same fitted steps as the training data:
# term counts from count_vect, then TF-IDF weighting from tfidf.
strr = input("Enter a Message: ")
examples = [strr]
example_counts = tfidf.transform(count_vect.transform(examples))
prediction = svc.predict(example_counts)
prediction[0]

Enter a Message: you won 2000 cash hurray


'spam'

In [60]:
from lightgbm import LGBMClassifier

# Gradient-boosted trees with library-default settings on the TF-IDF training
# features. fit() returns the estimator, so `lgb` stays bound to the fitted model.
lgb = LGBMClassifier()
lgb = lgb.fit(X_train_tfidf, y_train)

In [61]:
# Score the LightGBM model on the held-out TF-IDF features.
y_pred = lgb.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.979066985645933

In [62]:
# Per-class precision / recall / F1 for the LightGBM predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1451
        spam       0.97      0.86      0.92       221

    accuracy                           0.98      1672
   macro avg       0.98      0.93      0.95      1672
weighted avg       0.98      0.98      0.98      1672

