In [None]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')
nltk.download('wordnet')
# from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# NOTE(review): this Kaggle CSV is commonly reported to contain bytes that are
# not valid UTF-8, which makes the default codec raise UnicodeDecodeError.
# latin-1 maps every byte to a character, so the read cannot fail on decoding.
df = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding="latin-1")
df.tail()

In [None]:
df.shape

### Remove unused columns in which more than 90% of the values are null

In [None]:
# Work on a copy without the three 'Unnamed' columns; `df` itself is left untouched.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df1 = df.drop(columns=unused_columns)
df1.shape

# EDA Steps
### Because this is a text problem, we will apply text-cleanup techniques in the following manner:
- Lowercasing
- Punctuation removal
- Stop words removal
- Text standardization
- Tokenization
- Stemming
- Lemmatization
- Exploratory data analysis



# 1. Lowering SMS text

In [None]:
 df1['v2'] = df1['v2'].str.lower()


# 2. Remove Punctuation

In [None]:
def removePunctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)


    

In [None]:
# Strip punctuation from every message; the function is passed directly, no lambda wrapper needed.
df1['Body_punct_clean'] = df1['v2'].apply(removePunctuation)

In [None]:
df1['Body_punct_clean'].tail(10)

# 3. Tokenization

In [None]:
def Tokenization(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        The non-empty tokens in order of appearance. ``re.split`` yields an
        empty string when the text starts or ends with a separator (or is
        empty); those empty tokens are dropped.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return [token for token in re.split(r'\W+', text) if token]
    

In [None]:
# Tokenize the punctuation-free message text into a list of words per row.
df1['v1_Tokenized'] = df1['Body_punct_clean'].apply(Tokenization)

In [None]:
df1['v1_Tokenized'].head(5)

# 4. Remove Stop Words

In [None]:
def RemoveStopWords(text):
    """Return the tokens of ``text`` that are not in the notebook-level ``stopword`` list.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single message.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a linear scan over the whole stop-word list for every token.
    stop_set = frozenset(stopword)
    return [txt for txt in text if txt not in stop_set]
    

In [None]:
# Filter the stop words out of every token list.
df1['removed_stopword'] = df1['v1_Tokenized'].apply(RemoveStopWords)

In [None]:
df1['removed_stopword'].head(10)

# 5. Lemmatization

In [None]:
wml = WordNetLemmatizer()

def lemmatise(lowercase_word):
    """Lemmatize every token of ``lowercase_word`` with the notebook-level ``wml``.

    Takes an iterable of tokens and returns a new list holding
    ``wml.lemmatize(token)`` for each token, in the same order.
    """
    return [wml.lemmatize(token) for token in lowercase_word]

In [None]:
# Lemmatize the stop-word-free token list of every message.
df1['lemitized_sms'] = df1['removed_stopword'].apply(lemmatise)

In [None]:
df1['lemitized_sms'].head(10)

In [None]:
def apply_styling(df: pd.DataFrame, caption: str = ""):
    '''
    Return @pd.DataFrame
    Input  @df:pd.DataFrame
           @caption: Stirng  
    It help to apply style to a particular dataframe which is passed into this
    '''
    #TODO: Styling dataframe after reading the file
    
    st = df.style.format({'percent on rent': '{:.0%}'}).hide_index()    
    st.set_table_styles([
           dict(selector="th", props=[('color', 'darkblue'), 
                                      ('vertical-align', 'top')]),
           dict(selector="th:first-child", props=[('max-width', '70px'), ('text-align', 'left')]),
           dict(selector="th:last-child", props=[('max-width', '50px')]),
           dict(selector="td:first-child", props=[('text-align', 'left')])
            ]) 
    st.caption = caption
    return st

In [None]:
apply_styling(df1.head(20),'spam-sms')

# New Feature Exploration
#### Considering Body Length as a feature

In [None]:
df1['Body_len'] = df1['v2'].apply(lambda x: len(x) - x.count(" "))

apply_styling(df1.head(), 'Feature-Enginering')

# Visualize the spam and ham length 

In [None]:
def showPlot(column="Body_len", bins=None):
    """Overlay normalised histograms of ``df1[column]`` for spam and ham messages.

    Parameters
    ----------
    column : str, default "Body_len"
        Name of the numeric column of the notebook-level ``df1`` to plot.
    bins : int or sequence, optional
        Forwarded to ``plt.hist``. When omitted, 40 evenly spaced edges
        between 0 and 200 are used.

    Calling ``showPlot()`` with no arguments plots ``Body_len`` on the
    0-200 range.
    """
    if bins is None:
        bins = np.linspace(0, 200, 40)
    # One semi-transparent, density-normalised histogram per class so the two
    # distributions can be compared even though the classes differ in size.
    for label in ("spam", "ham"):
        values = df1[df1['v1'] == label][column]
        plt.hist(values, bins, alpha=0.5, density=True, label=label)
    plt.xlabel(column)
    plt.legend(loc="upper left")
    plt.show()


    


In [None]:
showPlot()

#### Observation: Spam messages tend to be longer than ham messages

In [None]:
def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Parameters
    ----------
    text : str
        The message to inspect.

    Returns
    -------
    float
        ``round(punctuation_count / non_space_count, 3) * 100``, or ``0.0``
        when the text has no non-space characters (empty or all spaces),
        which would otherwise divide by zero.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Nothing to measure; avoid ZeroDivisionError on empty/blank messages.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space, 3) * 100

# Percentage of punctuation characters per message, stored as a new feature column.
df1['punct%'] = df1['v2'].apply(count_punct)

In [None]:
# Plot the punctuation-percentage feature computed in the previous cell.
# A bare showPlot() call here would redraw the Body_len histogram instead.
punct_bins = np.linspace(0, 50, 40)
plt.hist(df1[df1['v1'] == "spam"]['punct%'], punct_bins, alpha=0.5, density=True, label="spam")
plt.hist(df1[df1['v1'] == "ham"]['punct%'], punct_bins, alpha=0.5, density=True, label="ham")
plt.xlabel("punct%")
plt.legend(loc="upper left")
plt.show()

# Now do vectorization and convert text to numbers

In [None]:

# X_train, X_test, y_train, y_test = train_test_split(df1, test_size=0.2)


# A fixed random_state makes the split (and every metric derived from it)
# reproducible across kernel restarts; stratifying on the label keeps the
# spam/ham proportion the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df1['lemitized_sms'],
    df1['v1'],
    test_size=0.2,
    random_state=42,
    stratify=df1['v1'],
)

In [None]:
y_train

In [None]:
# Each row holds a list of tokens; join it back into one space-separated
# string once per split instead of re-joining the training split for both
# fit() and transform().
train_text = X_train.apply(' '.join)
test_text = X_test.apply(' '.join)

vectorizer = TfidfVectorizer()
# Fit on the training split only, then apply the fitted vectorizer to both
# splits so no test-set vocabulary leaks into training.
vectors = vectorizer.fit(train_text)
train_tf_idf = vectors.transform(train_text)
test_tf_idf = vectors.transform(test_text)




In [None]:
train_tf_idf

In [None]:
test_tf_idf

# Random Forest Model

In [None]:
# Fixed seed so the fitted forest, and therefore the reported metrics, are
# the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_tf_idf, y_train)
train_pred = rf.predict(train_tf_idf)
test_pred = rf.predict(test_tf_idf)

In [None]:
def evaluation_report(gt, pred):
    """Print the confusion matrix and then the classification report.

    ``gt`` holds the ground-truth labels and ``pred`` the predicted labels;
    both are passed unchanged to the two scikit-learn metric functions.
    """
    for metric in (confusion_matrix, classification_report):
        print(metric(gt, pred))

In [None]:
print("TRAINING")
evaluation_report(y_train,train_pred)
print("\n\n************************\nTESTING")
evaluation_report(y_test,test_pred)

In [None]:
from sklearn import metrics

print ('Accuracy:',metrics.accuracy_score(y_test,test_pred))

# Fully Connected Neural Network