In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the SMS spam data set: a tab-separated file read with explicitly
# supplied column names (`label`, `text`).
# NOTE(review): pandas' default quote handling is active here; if a message
# contains an unbalanced `"` rows could be merged -- confirm the row count
# against the data set's own description.
df = pd.read_csv(
    '../input/sms-spam-collection-data-set/SMSSpamCollection',
    sep='\t',
    names=['label', 'text'],
)

In [None]:
# Display the loaded frame to eyeball the `label` / `text` columns.
df

# Data Pre-processing

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
import nltk #!pip install nltk

In [None]:
# Fetch every NLTK resource this notebook relies on, not just the stop-word
# list: word_tokenize() needs the Punkt sentence-tokenizer models as well and
# raises LookupError on a machine where they are not already installed.
# Newer NLTK releases look the models up as 'punkt_tab', older ones as 'punkt',
# so both are requested. nltk.download() skips packages that are up to date.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Short sample sentence used to try out the tokenising / stop-word steps.
sent = 'How are you friends?'

In [None]:
from nltk.tokenize import word_tokenize
# Split the sample sentence into tokens and display the result.
word_tokenize(sent)

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop words; used for the `in` membership tests in the
# cleaning steps of this notebook.
swords = stopwords.words('english')

In [None]:
# Keep only the tokens of the sample sentence that are not in the stop-word
# list. The comparison is an exact string match, so it is case-sensitive.
clean = list(filter(lambda token: token not in swords, word_tokenize(sent)))

In [None]:
# Show the tokens that survived stop-word removal.
clean

In [None]:
# Stop-word removal followed by Porter stemming, applied to the sample sentence.
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Filter first (exact-match against the stop-word list), then stem what is left.
clean = list(
    ps.stem(token)
    for token in word_tokenize(sent)
    if token not in swords
)
clean

In [None]:
# Longer sample sentence (with punctuation) for exercising clean_text.
sent = 'Hello friends! How are you? We will learning python today'

In [None]:
def clean_text(sent):
    """Turn one raw message into a list of stemmed, stop-word-free tokens.

    Steps:
      1. Tokenise ``sent`` with ``word_tokenize``.
      2. Keep only tokens that are entirely alphabetic or entirely digits
         (this drops punctuation and mixed tokens).
      3. Drop stop words, comparing case-insensitively.
      4. Stem the remaining tokens with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    sent : str
        The text to clean.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    tokens = word_tokenize(sent)
    words = [token for token in tokens if token.isdigit() or token.isalpha()]
    # The stop-word list is lower-case, so compare the lower-cased token.
    # Otherwise capitalised stop words ("How", "We", ...) slip through the
    # filter and end up as features after stemming.
    return [ps.stem(token) for token in words if token.lower() not in swords]

In [None]:
# Try the cleaning function on the sample sentence and display its tokens.
clean_text(sent)

In [None]:
# Pre-processing 
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer that delegates the text-to-tokens step to clean_text:
# passing a callable as `analyzer` makes that function responsible for
# producing the feature tokens of each raw message.
tfidf = TfidfVectorizer(analyzer=clean_text)

In [None]:
# Model input: the message column.
x = df['text']
# Target: the label column.
y = df['label']

In [None]:
# Fit the vectorizer on every message and convert each one to its TF-IDF row.
# NOTE(review): the fit happens on all of `x`, before the train/test split
# performed later in the notebook, so held-out messages influence the learned
# vocabulary and weights.
x_new = tfidf.fit_transform(x)

In [None]:
# Shape of the raw message column, for comparison with the vectorized matrix.
x.shape

In [None]:
# Shape of the TF-IDF output produced by fit_transform.
x_new.shape

In [None]:
# tfidf.get_feature_names_out()  # uncomment to inspect the learned vocabulary

In [None]:
import seaborn as sns
# Bar chart of how many messages carry each label (class-balance check).
sns.countplot(x=y)

In [None]:
# Hold-out split (one train/test partition, not cross-validation): 25% of the
# rows are reserved for testing, with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_new,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Report the shapes of the four objects produced by the train/test split.
# The third line is labelled x_test; it previously printed x_test.shape under
# the label "y_test", giving two "y_test" rows with different shapes.
print("Size of splitted data")
print(f"x_train {x_train.shape}")
print(f"y_train {y_train.shape}")
print(f"x_test {x_test.shape}")
print(f"y_test {y_test.shape}")

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes baseline. Both feature matrices are converted with
# .toarray() before being handed to the model.
nb = GaussianNB().fit(x_train.toarray(), y_train)

# Predicted labels for the held-out rows.
y_pred_nb = nb.predict(x_test.toarray())

In [None]:
# Number of held-out messages per label.
y_test.value_counts()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
# Evaluate the naive Bayes predictions on the held-out set: confusion-matrix
# plot, overall accuracy, then the per-class classification report.
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_nb)
# pyplot call: titles the current axes, i.e. the matrix just drawn.
plt.title('Naive bayes')
plt.show()
print(f" Accuracy is {accuracy_score(y_test,y_pred_nb)}")
print(classification_report(y_test,y_pred_nb))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed; all other settings are left at their defaults.
model_rf = RandomForestClassifier(random_state=1)
# Trained directly on the TF-IDF training matrix (no .toarray() conversion here).
model_rf.fit(x_train,y_train)


In [None]:
# Predicted class labels for the held-out rows. The model was trained on
# values from df['label'], so these are labels rather than floats.
y_pred_rf = model_rf.predict(x_test)

In [None]:
# Evaluate the random-forest predictions on the held-out set: confusion-matrix
# plot, overall accuracy, then the per-class classification report.
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_rf)
# pyplot call: titles the current axes, i.e. the matrix just drawn.
plt.title('Random Forest')
plt.show()
print(f" Accuracy is {accuracy_score(y_test,y_pred_rf)}")
print(classification_report(y_test,y_pred_rf))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with a fixed seed, trained on the TF-IDF
# training matrix. fit() returns the estimator, so it can be chained.
model_lr = LogisticRegression(random_state=1).fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred_lr = model_lr.predict(x_test)

In [None]:
# Evaluate the logistic-regression predictions on the held-out set:
# confusion-matrix plot, overall accuracy, then the per-class report.
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_lr)
# pyplot call: titles the current axes, i.e. the matrix just drawn.
plt.title('Logistic regression')
plt.show()
print(f" Accuracy is {accuracy_score(y_test,y_pred_lr)}")
print(classification_report(y_test,y_pred_lr))

# Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the random-forest grid search: 3 split criteria x 2
# class-weighting schemes = 6 parameter combinations.
para = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'class_weight': ['balanced', 'balanced_subsample'],
    # Candidate axes currently left out of the search:
    #   'max_features': ['sqrt', 'log2'],
    #   'random_state': [0, 1, 2, 3, 4],
}

In [None]:
# Grid search over `para` for the random forest, scored by accuracy with
# 5-fold cross-validation.
# NOTE(review): the notebook searches over class_weight, which suggests the
# labels are imbalanced; confirm that plain accuracy is the intended metric.
grid = GridSearchCV(model_rf, param_grid=para, cv=5, scoring='accuracy')

In [None]:
# Run the search using the training split only; the held-out rows stay unseen.
grid.fit(x_train,y_train)

In [None]:
# The estimator the search selected as best.
rf = grid.best_estimator_

In [None]:
# Predict held-out labels with the tuned forest.
y_pred_grid = rf.predict(x_test)

In [None]:
# Evaluate the tuned random forest on the held-out set: confusion-matrix plot,
# overall accuracy, then the per-class classification report.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_grid)
# pyplot call: titles the current axes, i.e. the matrix just drawn.
# The title previously read 'Gride Search' (typo in the rendered figure).
plt.title('Grid Search')
plt.show()
print(f" Accuracy is {accuracy_score(y_test,y_pred_grid)}")
print(classification_report(y_test, y_pred_grid))