# Importing Libraries and reading data 

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import nltk  
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in files)
    for path in full_paths:
        print(path)


In [None]:
# Load the mail dataset into a DataFrame and show it as the cell output.
csv_path = '/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv'
data = pd.read_csv(csv_path)
data

# Data Insights

In [None]:
# Pull out the raw mail bodies so individual messages can be inspected below.
text = data.loc[:, 'text']
text

In [None]:
# Show the mail stored under index label 0.
text.loc[0]

In [None]:
# Show the mail stored under index label 3.
text.loc[3]

In [None]:
# Print basic facts about the DataFrame, each one preceded by a dashed rule
# and with a closing rule after the last entry.
divider = "\n" + '-' * 25
overview = [
    ("shape of data:", data.shape),
    ("\nno dimensions of data:", data.ndim),
    ("\nsize of data:", data.size),
    ("\nSum fo all null values:\n", data.isnull().sum()),
]
for label, value in overview:
    print(divider)
    print(label, value)
print(divider)

In [None]:
# Heading with a dashed underline, followed by the column index on its own line.
heading = "Column Names : \n" + '-' * 25
print(heading, data.columns, sep="\n")

In [None]:
# Heading first; the per-column distinct-value counts are the cell's output.
heading = "Unique values in every column \n" + '-' * 25
print(heading)
data.nunique()

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be passed to print(): doing so emits the report *before* the
# heading and then prints a stray "None". Print the heading, then call info().
print("summary of a DataFrame:")
data.info()

In [None]:
# Preview both ends of the DataFrame; each caption is rendered before its table.
display("Top 5 rows :")
display(data.head())
display("Last 5 Rows :")
display(data.tail())

# Some Visualizations

In [None]:
# Bar chart of the number of mails per class.
# value_counts() orders its result by frequency, not by label, so the bars are
# sorted by label here: position 0 is always label 0 and position 1 is always
# label 1, which keeps the hard-coded tick labels below attached to the right
# bars regardless of which class is more common.
label_counts = data['label_num'].value_counts().sort_index()
label_counts.plot(kind="bar")
plt.xticks(np.arange(2), ('Non spam', 'spam'), rotation=0)
plt.show()

In [None]:
import re
# Build the cleaned corpus: one normalised string per mail.
#
# For each mail: replace every character that is not a letter or digit with a
# space, lowercase, split on whitespace, drop English stop words and stem the
# remaining tokens with the Porter stemmer.
#
# The stemmer and the stop-word set are loop invariants, so they are created
# once here instead of once per mail (and, for the set, once per word).
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

corpus = []
# Iterate over the values directly rather than indexing by position, so the
# loop does not depend on the DataFrame's index labels.
for raw_mail in data["text"]:
    tokens = re.sub("[^a-zA-Z0-9]", " ", raw_mail).lower().split()
    kept = [stemmer.stem(token) for token in tokens if token not in stop_words]
    corpus.append(" ".join(kept))
print(corpus[0])

In [None]:
# Merge every cleaned mail into one string for the word cloud.
# The mails are joined with a space: plain concatenation would glue the last
# word of one mail to the first word of the next, producing words that never
# occurred in any mail. str.join also avoids the repeated string copying of
# building the result with `+` in a loop.
all_mails = " ".join(corpus)

In [None]:
from wordcloud import WordCloud, STOPWORDS


# Render a word cloud of the combined mail text on a square, axis-free figure.
cloud_options = {
    "width": 800,
    "height": 800,
    "background_color": 'white',
    "min_font_size": 10,
}
wordcloud = WordCloud(**cloud_options).generate(all_mails)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


# Modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: fit the vectorizer on the cleaned corpus (vocabulary
# capped at 35000 terms) and expand the resulting counts into a dense array.
cv = CountVectorizer(max_features=35000)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [None]:
# Target vector: the numeric class label of each mail.
y = data.loc[:, 'label_num']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible. The shapes of the four parts are the cell's output.
split_parts = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts
tuple(part.shape for part in split_parts)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the training split and
# predict labels for the held-out split; the predictions are the cell's output.
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Evaluate on the held-out split: confusion matrix plus accuracy as a percentage.
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
accuracy_percent = score * 100
print(cm, accuracy_percent)

In [None]:
# Disabled export step: the code inside the string literal below would pickle
# the fitted vectorizer (`cv`) and classifier (`model`) to cv.pkl / model.pkl.
# It is wrapped in a string so that none of it executes.
# NOTE(review): if this is re-enabled, open the files in `with` blocks so the
# handles are closed after writing.
'''
import pickle 
pickle.dump(cv, open('cv.pkl', 'wb'))
pickle.dump(model, open('model.pkl', 'wb'))
'''

# Check the result with custom input

In [None]:
def new_review(new_review):
    """Classify one piece of raw text with the trained spam model.

    The text is normalised with the same steps used to build the training
    corpus -- non-alphanumeric characters replaced by spaces, lowercased,
    English stop words removed, Porter stemming -- so the tokens handed to
    the fitted vectorizer match the vocabulary it learned. (Stripping digits
    or keeping extra stop words here would only discard or add tokens the
    vectorizer treats differently from training.)

    Args:
        new_review: The raw text to classify.

    Returns:
        The result of ``model.predict`` for the single input row; element 0
        is the predicted label (1 is reported as spam by the caller below).
    """
    tokens = re.sub("[^a-zA-Z0-9]", " ", new_review).lower().split()
    stemmer = PorterStemmer()
    # Build the stop-word set once per call instead of once per token.
    stop_words = set(stopwords.words("english"))
    cleaned = " ".join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )
    features = cv.transform([cleaned]).toarray()
    return model.predict(features)


# To classify interactive input instead, pass input("Enter new review...")
# to new_review().
# The result is stored under its own name so the function is not overwritten
# and stays callable after this cell has run.
prediction = new_review("Thanks for your time.")
if prediction[0] == 1:
    print("SPAM")
else:
    print("NOT SPAM")

# Thanks for your time. 

### If You Liked the Kernel drop an Upvote :D

## Any suggestions regarding corrections or improvements will be highly appreciated :)