In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Let text columns display up to 100 characters before pandas truncates them
pd.options.display.max_colwidth = 100

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reading the data
# pandas decodes CSV files as UTF-8 by default; this file contains bytes that are
# not valid UTF-8, so the encoding is passed explicitly to avoid a UnicodeDecodeError.
df = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding="latin-1")

# NaN value checking: one row per column with its number of missing values
df_nan_count = (
    df.isnull()
    .sum()
    .rename_axis("colname")
    .reset_index(name="count of null value")
)
display(df_nan_count)

In [None]:
# taking only relevant columns
# .copy() makes an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()

In [None]:
# Preview the first five rows of the two retained columns
df.head(n=5)

In [None]:
# % of label
# normalize=True returns each label's share of the non-null rows; scale it to a percentage.
label_pct = df["v1"].value_counts(normalize=True).mul(100)
label_pct.to_frame(name="percent")
# 86 % ham and 13 % spam

In [None]:
# What is the shape of the dataset?

n_rows, n_cols = df.shape
print("Input data has {} rows and {} columns".format(n_rows, n_cols))

In [None]:
# Rename by explicit mapping instead of by position, so the result does not depend
# on column order and re-running the cell is harmless.
df = df.rename(columns={'v1': 'label', 'v2': 'body_text'})

In [None]:
# How many spam/ham are there?

label_counts = df['label'].value_counts()
n_spam = label_counts.get('spam', 0)
n_ham = label_counts.get('ham', 0)
print("Out of {} rows, {} are spam, {} are ham".format(len(df), n_spam, n_ham))

In [None]:
# How much missing data is there?

null_counts = df[['label', 'body_text']].isna().sum()
print("Number of null in label: {}".format(null_counts['label']))
print("Number of null in text: {}".format(null_counts['body_text']))

In [None]:
import nltk
nltk.download('stopwords')
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string

# English stop words, held in a set so the per-token membership test in clean_text
# is a hash lookup rather than a scan of the whole list.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Porter stemmer used to reduce each token to its stem
ps = nltk.PorterStemmer()



def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Punctuation means any character in ``string.punctuation``. The ratio is
    rounded to three decimals before being scaled to a percentage.

    Args:
        text: The message to inspect.

    Returns:
        The punctuation percentage, or 0.0 when ``text`` has no non-space
        characters (empty or spaces only), where the ratio is undefined.
    """
    non_space_len = len(text) - text.count(" ")
    # Guard against dividing by zero for empty or all-space messages.
    if non_space_len == 0:
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    return round(punct_count / non_space_len, 3) * 100


def clean_text(text):
    """Tokenise a message into stemmed, lower-cased, stop-word-free tokens.

    Steps: drop every character found in ``string.punctuation``, lower-case the
    rest, split on runs of non-word characters, discard empty tokens and stop
    words, then stem each remaining token with the module-level stemmer ``ps``.

    Args:
        text: The raw message.

    Returns:
        A list of stemmed tokens (possibly empty).
    """
    no_punct = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' for empty input and for leading/trailing separators;
    # skip those so an empty-string token never becomes a feature.
    return [ps.stem(token) for token in tokens if token and token not in stopwords]



In [None]:
# Derived columns: stemmed tokens, message length without spaces, punctuation percentage
messages = df['body_text']
df['body_clean_text'] = messages.apply(clean_text)
df['body_len'] = messages.apply(lambda message: len(message) - message.count(" "))
df['punct%'] = messages.apply(count_punct)


In [None]:
from matplotlib import pyplot
import numpy as np
%matplotlib inline
# Shared bin edges so both classes are drawn on the same grid
bins = np.linspace(0, 200, 40)

# Overlay the density-normalised length distribution of each class
for label in ('spam', 'ham'):
    pyplot.hist(df.loc[df['label'] == label, 'body_len'], bins, alpha=0.5, density=True, label=label)
# Title and axis labels let the figure be read on its own
pyplot.title('Message length by class')
pyplot.xlabel('Message length (characters, spaces excluded)')
pyplot.ylabel('Density')
pyplot.legend(loc='upper left')
pyplot.show()
#### spam are more lengthy than ham

#### Spam messages tend to be longer than ham messages

In [None]:
# Shared bin edges so both classes are drawn on the same grid
bins = np.linspace(0, 50, 40)

# Overlay the density-normalised punctuation-percentage distribution of each class
for label in ('spam', 'ham'):
    pyplot.hist(df.loc[df['label'] == label, 'punct%'], bins, alpha=0.5, density=True, label=label)
# Title and axis labels let the figure be read on its own
pyplot.title('Punctuation percentage by class')
pyplot.xlabel('Punctuation (% of non-space characters)')
pyplot.ylabel('Density')
pyplot.legend(loc='upper right')
pyplot.show()

# checking if punctuation can be a feature of creating this model or not

In [None]:
#### 

#### Creating both a TF-IDF matrix and a document-term (count) matrix so that models can be compared on each

In [None]:


# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['body_text'])
# Reuse df's index so concat aligns row-for-row even if df's index is not 0..n-1.
X_tfidf_feat = pd.concat(
    [df['body_len'], df['punct%'], pd.DataFrame(X_tfidf.toarray(), index=df.index)],
    axis=1,
)
# The vectorizer columns are integers while the two engineered columns are strings;
# recent scikit-learn versions reject mixed column-name types, so make them all strings.
X_tfidf_feat.columns = X_tfidf_feat.columns.astype(str)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(df['body_text'])
X_count_feat = pd.concat(
    [df['body_len'], df['punct%'], pd.DataFrame(X_count.toarray(), index=df.index)],
    axis=1,
)
# Same normalisation of column names as for the TF-IDF frame
X_count_feat.columns = X_count_feat.columns.astype(str)

X_count_feat.head()

In [None]:
# Preview the first five rows, now including the derived feature columns
df.head(n=5)

#### Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
# List every attribute and method of the class, then show an instance built with defaults
rf_attributes = dir(RandomForestClassifier)
print(rf_attributes)
default_rf = RandomForestClassifier()
print(default_rf) # exploring hyperparameters

In [None]:
# train test split
# random_state makes the split reproducible across runs; stratify keeps the
# ham/spam ratio the same in the train and test parts of this imbalanced data set.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_feat,
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 50 trees of depth <= 20 using all cores; random_state fixes the forest's
# randomness so the reported metrics are reproducible.
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1, random_state=42)
rf_model = rf.fit(X_train, y_train)

In [None]:
# Pair each importance with its column name and show the ten largest
importance_by_feature = zip(rf_model.feature_importances_, X_train.columns)
ranked_features = sorted(importance_by_feature, reverse=True)
ranked_features[:10]

In [None]:
# Predict on the held-out rows and score with 'spam' as the positive class
y_pred = rf_model.predict(X_test)
prf_scores = score(y_test, y_pred, pos_label='spam', average='binary')
precision, recall, fscore, support = prf_scores

In [None]:
# Accuracy is the share of test predictions that equal the true label
n_correct = (y_pred == y_test).sum()
accuracy = n_correct / len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

#### Tuning the Random Forest with Grid Search

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# random_state keeps the forests, and therefore the CV scores, reproducible
rf = RandomForestClassifier(random_state=42)
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
# Fit on the training split only. Fitting on the full data set would refit the best
# model on rows that are also in X_test, so the metrics below would be inflated.
gs_fit = gs.fit(X_train, y_train)
display(pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5])

# Score the refitted best estimator on the held-out rows
y_pred = gs_fit.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

In [None]:
#### 

#### Using Gradient Boosting (scikit-learn `GradientBoostingClassifier`)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# List every attribute and method of the class, then show an instance built with defaults
gb_attributes = dir(GradientBoostingClassifier)
print(gb_attributes)
default_gb = GradientBoostingClassifier()
print(default_gb)

In [None]:
# random_state keeps the boosted models, and therefore the CV scores, reproducible
gb = GradientBoostingClassifier(random_state=42)
param = {
    'n_estimators': [100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
# Fit on the training split only. Fitting on the full data set would train the
# refitted best model on the X_test rows, making any later test-set score optimistic.
cv_fit = clf.fit(X_train, y_train)
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

In [None]:
# Predict on the held-out rows and score with 'spam' as the positive class
y_pred = cv_fit.predict(X_test)
prf_scores = score(y_test, y_pred, pos_label='spam', average='binary')
precision, recall, fscore, support = prf_scores

# Accuracy is the share of test predictions that equal the true label
n_correct = (y_pred == y_test).sum()
accuracy = n_correct / len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))