In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Emit one line per file: the directory being walked joined with the file name.
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the SMS spam collection, decoding the file as latin-1
# (presumably it is not valid UTF-8 -- confirm against the dataset).
data = pd.read_csv(
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)

In [None]:
# Remove the three unnamed columns. Rebinding `data` (instead of mutating in place) and
# passing errors='ignore' keeps this cell re-runnable: the original in-place drop raised
# KeyError on a second execution because the columns were already gone.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Give the two remaining columns descriptive names: v1 -> label, v2 -> message.
data.rename(columns=dict(v1='label', v2='message'), inplace = True)

In [None]:
# Preview the first rows of the frame after the column clean-up.
data.head()

In [None]:
def label_coding(row):
    """Encode a single label value as an integer.

    Returns 0 when the value equals the exact string "ham" and 1 for
    every other value (the comparison is case-sensitive).
    """
    return 0 if row == "ham" else 1

In [None]:
# Encode the string labels as integers ("ham" -> 0, anything else -> 1).
# Guarded so that re-running the cell is a no-op: label_coding maps every value other
# than the string "ham" to 1, so applying it a second time to the already encoded
# 0/1 column would silently turn every label into 1.
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = data['label'].apply(label_coding)

In [None]:
# Bar chart of how many rows carry each label value.
data['label'].value_counts().plot.bar()

In [None]:
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Shared resources read by pre_processing.
punc = string.punctuation                     # characters removed from each message
stop_words = set(stopwords.words('english'))  # a set, so membership checks are cheap
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
def pre_processing(row):
    """Turn one raw message string into a list of normalised tokens.

    The text is lower-cased, every character found in ``punc`` is removed, and the
    result is split on whitespace. Tokens present in ``stop_words`` are discarded;
    each remaining token is passed through ``stemmer`` and then ``lemmatizer``.

    Returns the list of processed tokens (empty when nothing survives the filtering).
    """
    lowered = row.lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in punc)

    tokens = []
    for word in without_punctuation.split():
        if word in stop_words:
            continue
        # Stem first, then lemmatize the stem.
        tokens.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return tokens

In [None]:
# Tokenise every message; each cell of the new 'text' column holds a list of tokens.
data['text'] = data['message'].map(pre_processing)

In [None]:

# Features are the token lists, the target is the integer label.
X, y = data['text'], data['label']

In [None]:
import time

import gensim
from gensim.models import Word2Vec

# Skip-gram model (sg = 1)
size = 100       # embedding dimension
window = 3       # context window
min_count = 1    # keep every token, so all tokens of every message are in the vocabulary
workers = 3
sg = 1

OUTPUT_FOLDER=""

word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(size) + '.model'
start_time = time.time()
stemmed_tokens = data['text'].values

# gensim 4.0 renamed the embedding-dimension keyword from `size` to `vector_size`;
# passing `size` there raises TypeError. Choose the spelling the installed release accepts.
size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Train the Word2Vec Model
w2v_model = Word2Vec(
    stemmed_tokens,
    min_count=min_count,
    workers=workers,
    window=window,
    sg=sg,
    **{size_kwarg: size},
)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

In [None]:
import numpy as np

# Load the model from the model file
sg_w2v_model = Word2Vec.load(word2vec_model_file)

# gensim 4.0 replaced `wv.vocab` (word -> Vocab object with an `.index`) by
# `wv.key_to_index` (word -> int). Use whichever the installed release provides so the
# cell runs on both release lines.
keyed_vectors = sg_w2v_model.wv
if hasattr(keyed_vectors, 'key_to_index'):
    word_to_index = keyed_vectors.key_to_index
    hi_index = word_to_index["hi"]
    vocab_size = len(word_to_index)
else:
    hi_index = keyed_vectors.vocab["hi"].index
    vocab_size = len(keyed_vectors.vocab)

# Unique ID of the word
print("Index of the word 'hi':")
print(hi_index)

# Total number of the words 
print(vocab_size)

In [None]:
# Store the vectors for train data in following file
word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec.csv'

# Index the KeyedVectors (`.wv`) rather than the model object: `model[token]` was
# deprecated and later removed by gensim, while `.wv[token]` works on old and new releases.
word_vectors = sg_w2v_model.wv
# Take the dimension from the trained model instead of repeating the literal 100.
vector_dim = word_vectors.vector_size
zero_line = ",".join("0" for _ in range(vector_dim))

with open(word2vec_filename, 'w') as word2vec_file:
    # Header: column names 0 .. vector_dim-1. Written before the loop so the file has a
    # header even when there are no rows.
    word2vec_file.write(",".join(str(col) for col in range(vector_dim)) + "\n")
    for row in X.tolist():
        if row:
            # A message is represented by the mean of its token vectors.
            model_vector = np.mean([word_vectors[token] for token in row], axis=0).tolist()
            line1 = ",".join(str(vector_element) for vector_element in model_vector)
        else:
            # No tokens survived preprocessing: write an all-zero vector. Checked explicitly
            # rather than relying on np.mean([]) returning NaN (with a RuntimeWarning).
            line1 = zero_line
        word2vec_file.write(line1 + "\n")

In [None]:
# Read back the file written by the previous cell. Reusing `word2vec_filename` keeps the
# read in sync with OUTPUT_FOLDER; the hard-coded name only worked while it was "".
X_vectors = pd.read_csv(word2vec_filename)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; random_state is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
from xgboost import XGBClassifier

# `random_state` alone fixes the RNG. The deprecated `seed` alias was passed with the same
# value and has been dropped to avoid the duplicate/deprecated parameter.
model = XGBClassifier(max_depth=10, learning_rate=0.05, random_state=1)
model.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out rows.
y_pred=model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,accuracy_score

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped, precision and
# recall trade places and "support" counts predictions instead of true labels.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
# Ground truth first, so rows are true classes and columns are predicted classes
# (the swapped call produced the transposed matrix).
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric, so the
# value is the same as before.
accuracy_score(y_test, y_pred)

In [None]:
# predict probabilities, keeping only the second column (index 1) of predict_proba,
# presumably the probability of label 1 -- confirm against model.classes_
probs = model.predict_proba(X_test)[:, 1]

In [None]:
# calculate scores
from sklearn.metrics import roc_auc_score, roc_curve

# Area under the ROC curve from the true labels and the column-1 probabilities.
auc = roc_auc_score(y_true=y_test, y_score=probs)

In [None]:
# summarize scores: AUC rounded to three decimals

print(': ROC AUC={:.3f}'.format(auc))

In [None]:
# calculate roc curves
# roc_curve returns three arrays; the third (the thresholds) is not needed and is bound to `_`.
fpr, tpr, _ = roc_curve(y_test, probs)

In [None]:
from matplotlib import pyplot

# Draw the ROC curve on an explicit figure/axes pair.
fig, ax = pyplot.subplots()
ax.plot(fpr, tpr, marker='.', label='ROC')
# axis labels
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# show the legend
ax.legend()
# show the plot
pyplot.show()