In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re

import matplotlib.pyplot as plt
from wordcloud import WordCloud
%matplotlib inline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

# Fetch the NLTK data this notebook relies on: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
# 'punkt_tab' is the tokenizer resource loaded by NLTK >= 3.9 (older releases
# use 'punkt'), and 'omw-1.4' is required alongside 'wordnet' by some NLTK
# 3.6/3.7 releases. Requesting all of them keeps the notebook runnable across
# NLTK versions; nltk.download() reports (rather than raises on) an id it
# cannot fetch.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
# Load the SMS spam CSV from the Kaggle input directory, decoding it as latin-1.
spam_csv_path = '/kaggle/input/sms-spam-collection-dataset/spam.csv'
df = pd.read_csv(spam_csv_path, encoding='latin-1')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Keep only the 'v1' and 'v2' columns; every other column is discarded.
df = df.filter(items=['v1', 'v2'])

In [None]:
df.head()

In [None]:
# Give the two remaining columns descriptive names (positional assignment).
df.columns = ['label', 'text']

In [None]:
df.head()

In [None]:
df.label.unique()

In [None]:
df.groupby('label').describe()

In [None]:
# Bar chart of how many messages carry each label.
df['label'].value_counts().plot.bar()

In [None]:
# Normalise the message text: lower-case it, then delete every run of
# characters that are not ASCII letters, digits or spaces (punctuation is
# removed outright, not replaced by a space).
# The vectorised .str accessor is used instead of a per-row lambda; unlike
# `x.lower()`, it passes missing values through rather than raising
# AttributeError on a NaN.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^A-Za-z0-9 ]+', '', regex=True)
)

In [None]:
df.head()

In [None]:
def clean_text(text, lemmatizer=WordNetLemmatizer(),
               stop_words=set(stopwords.words('english'))):
    """Tokenize ``text`` and return its lemmatized content words.

    Args:
        text: String to process.
        lemmatizer: Object whose ``lemmatize(word)`` is applied to every kept
            token. The default instance is created once, when the function
            is defined.
        stop_words: Collection of tokens to discard. The default English set
            is likewise built once, at definition time.

    Returns:
        list: Lemmatized tokens in their original order. A token is kept only
        when it is not in ``stop_words`` and is purely alphabetic
        (``str.isalpha``), so numbers and punctuation are dropped.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words and token.isalpha()
    ]

# Spam EDA

In [None]:
# Rows whose label is exactly 'spam'.
is_spam = df['label'].eq('spam')
df_spam = df[is_spam]

In [None]:
df_spam.head()

In [None]:
# One list of cleaned tokens per spam message (clean_text with its defaults).
spam_nested_list = df_spam['text'].apply(clean_text)

In [None]:
spam_nested_list[:10]

In [None]:
# Number of tokens that survived cleaning, per spam message.
word_length_spam = list(map(len, spam_nested_list))

In [None]:
# Histogram of the per-message token counts for spam, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(word_length_spam, bins=25)
ax.set_title('WordLength for spam text')
ax.set_ylabel('count')
ax.set_xlabel('Length of text')
ax.grid()
plt.show()

In [None]:
df_spam.head()

In [None]:
# Pool every spam message into one document and clean it in a single pass.
# The messages are joined with spaces rather than passed through str(list):
# the list's repr would feed brackets, quotes and commas to the tokenizer,
# and a token with a quote attached (e.g. "'free") fails the isalpha()
# filter in clean_text and silently disappears from the word statistics.
words_spam = clean_text(' '.join(df_spam['text']))

In [None]:
words_spam[:15]

In [None]:
(pd.Series(nltk.ngrams(words_spam, 2)).value_counts())[:12]

In [None]:
(pd.Series(nltk.ngrams(words_spam, 3)).value_counts())[:12]

In [None]:
# Count every bigram / trigram in the pooled spam tokens and keep the 12 most
# frequent of each. Only the bigrams are plotted in this cell.
bigrams_series_spam = pd.Series(nltk.ngrams(words_spam, 2)).value_counts()[:12]
trigrams_series_spam = pd.Series(nltk.ngrams(words_spam, 3)).value_counts()[:12]

# Ascending sort so the most frequent bigram is drawn at the top of the chart.
bigrams_series_spam.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
# The title is derived from the series so it always matches the number of bars
# shown, and it names the class so it cannot be confused with the ham chart.
plt.title(f'{len(bigrams_series_spam)} Most Frequently Occurring Spam Bigrams')
plt.ylabel('Bigram')
plt.xlabel('# of Occurrences')
plt.show()

In [None]:
# Word cloud built from the cleaned spam tokens (joined with commas), shown without axes.
spam_cloud_text = ','.join(words_spam)
word_cloud = WordCloud(background_color='white', colormap='winter')
word_cloud.generate(spam_cloud_text)

plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Ham EDA

In [None]:
# Rows whose label is exactly 'ham'.
is_ham = df['label'].eq('ham')
df_ham = df[is_ham]

In [None]:
df_ham.sample(15)

In [None]:
# One list of cleaned tokens per ham message (clean_text with its defaults).
ham_nested_list = df_ham['text'].apply(clean_text)

In [None]:
ham_nested_list[:10]

In [None]:
# Number of tokens that survived cleaning, per ham message.
word_length_ham = [len(w) for w in ham_nested_list]

# Histogram of those counts. The title names the ham class so the figure is
# not mistaken for the spam histogram drawn earlier.
plt.hist(word_length_ham, bins=25)
plt.title('WordLength for ham text')
plt.ylabel('count')
plt.xlabel('Length of text')
plt.grid()
plt.show()

In [None]:
# Pool every ham message into one document and clean it in a single pass.
# The messages are joined with spaces rather than passed through str(list):
# the list's repr would feed brackets, quotes and commas to the tokenizer,
# and a token with a quote attached fails the isalpha() filter in clean_text
# and silently disappears from the word statistics.
words_ham = clean_text(' '.join(df_ham['text']))

In [None]:
(pd.Series(nltk.ngrams(words_ham, 2)).value_counts())[:12]

In [None]:
(pd.Series(nltk.ngrams(words_ham, 3)).value_counts())[:12]

In [None]:
# Count every bigram / trigram in the pooled ham tokens and keep the 12 most
# frequent of each. Only the bigrams are plotted in this cell.
bigrams_series_ham = pd.Series(nltk.ngrams(words_ham, 2)).value_counts()[:12]
trigrams_series_ham = pd.Series(nltk.ngrams(words_ham, 3)).value_counts()[:12]

# Ascending sort so the most frequent bigram is drawn at the top of the chart.
bigrams_series_ham.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
# The title is derived from the series so it always matches the number of bars
# shown, and it names the class so it cannot be confused with the spam chart.
plt.title(f'{len(bigrams_series_ham)} Most Frequently Occurring Ham Bigrams')
plt.ylabel('Bigram')
plt.xlabel('# of Occurrences')
plt.show()

In [None]:
# Word cloud built from the cleaned ham tokens (joined with commas), shown without axes.
ham_cloud_text = ','.join(words_ham)
word_cloud = WordCloud(background_color='white', colormap='winter')
word_cloud.generate(ham_cloud_text)

plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# PREDICTING

In [None]:
# Bag-of-words encoder that turns each message into token counts;
# stop_words='english' selects scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')

In [None]:
# Model input is the message text column; the target is the label column.
X = df['text']
y = df['label']

In [None]:
# Hold out 30% of the messages for evaluation. A fixed random_state makes the
# split (and therefore the reported accuracy) repeatable across runs, and
# stratify=y keeps the label proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Learn the vocabulary from the training messages only and build their count matrix.
X_vect = vectorizer.fit_transform(X_train)

In [None]:
# Train a multinomial Naive Bayes classifier on the training counts, then
# predict labels for the held-out messages, which are encoded with the
# already-fitted vectorizer (transform only, no re-fitting).
nb = MultinomialNB()
nb.fit(X_vect, y_train)

X_test_vect = vectorizer.transform(X_test)
y_pred = nb.predict(X_test_vect)

In [None]:
# Fraction of held-out messages whose predicted label equals the true label.
print(accuracy_score(y_test,y_pred))