In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import string
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split, GridSearchCV
import time

%matplotlib inline
# Show up to 100 characters per column so message bodies are readable in previews.
pd.set_option('display.max_colwidth', 100)

# Download the NLTK stopword corpus, then load the English stopword list and a
# Porter stemmer; both are shared by the text-cleaning helpers in this notebook.
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

## Pre-Process raw text data

Cleaning up text data is necessary to highlight the attributes that we are going to use for NLP analysis. Cleaning (pre-processing) the data follows four steps:
 - Remove Punctuation
 - Tokenizing
 - Remove Stopwords
 - Stemming / Lemmatizing
 
Let us apply above steps to perform data cleaning and processing.

In [None]:
def apply_styling(df: pd.DataFrame, caption: str = ""):
    '''
    Return @pd.DataFrame
    Input  @df:pd.DataFrame
           @caption: Stirng  
    It help to apply style to a particular dataframe which is passed into this
    '''
    #TODO: Styling dataframe after reading the file
    
    st = df.style.format({'percent on rent': '{:.0%}'}).hide_index()    
    st.set_table_styles([
           dict(selector="th", props=[('color', 'darkblue'), 
                                      ('vertical-align', 'top')]),
           dict(selector="th:first-child", props=[('max-width', '70px'), ('text-align', 'left')]),
           dict(selector="th:last-child", props=[('max-width', '50px')]),
           dict(selector="td:first-child", props=[('text-align', 'left')])
            ]) 
    st.caption = caption
    return st

def read_data(file_url: str, csv: bool = False, excel: bool = False, tab_delimeted: bool = False,
              **reader_kwargs):
    '''
    Return : @pd.DataFrame, or None when no file-type flag is set
    Input : @file_url: String - path or URL of the file to read
            @csv: bool - read as comma-separated values
            @excel: bool - read as an Excel workbook
            @tab_delimeted: bool - read as tab-separated values
            @reader_kwargs: extra keyword arguments forwarded to the pandas
                            reader (for example ``encoding='latin-1'``)

    Utility function that reads a file with the pandas reader selected by the
    flags. Exactly one reader is invoked; if several flags are set the
    precedence is tab_delimeted, then excel, then csv.
    '''
    # Validate before touching the file so a missing flag never costs a read.
    if not (csv or excel or tab_delimeted):
        print("please specify file type")
        return None

    # The precedence mirrors the result of the former sequential reads, where a
    # later flag overwrote the frame produced by an earlier one.
    if tab_delimeted:
        return pd.read_csv(file_url, delimiter='\t', **reader_kwargs)
    if excel:
        return pd.read_excel(file_url, **reader_kwargs)
    return pd.read_csv(file_url, **reader_kwargs)

In [None]:
file_url = "../input/sms-spam-collection-dataset/spam.csv"
caption = 'Spam-Ham Dataframe'

# NOTE(review): if this read raises UnicodeDecodeError the file is probably not
# UTF-8 encoded - confirm the encoding of spam.csv.
spam_ham_raw_df = read_data(file_url ,csv=True)

# Keep only the label and message columns. The explicit copy makes the frame
# independent of the raw one, so the feature columns added to it later do not
# trigger pandas' SettingWithCopyWarning.
spam_ham_df = spam_ham_raw_df[['v1','v2']].copy()
spam_ham_df.columns = ['Type', 'Body']

In [None]:
# Preview the first rows of the labelled messages as a styled table.
apply_styling(spam_ham_df.head(),caption)

### <u>Removing punctuation from the dataframe of Spam/Ham</u> 

In [None]:
def removePunct(text: str) -> str:
    '''
    @Return str
    @Input @text: str

    Drops every character listed in ``string.punctuation`` from the text and
    returns the remaining characters joined in their original order.
    '''
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

In [None]:
# Strip punctuation from every message body and keep the result in a new column.
spam_ham_df['Body_punct_clean'] = spam_ham_df['Body'].apply(removePunct)

In [None]:
# Preview the frame, which now includes the punctuation-free 'Body_punct_clean' column.
apply_styling(spam_ham_df.head(), caption)

### <u>Tokenizing each text in spam_ham_df</u>

In [None]:
def tokenize_text(text: str) -> list :
    '''
    @Return: List<str>
    @Input: text: String

    Splits the text into words on every run of non-word characters.
    Empty strings, which ``re.split`` produces when the text starts or ends
    with a separator (or is empty), are dropped so that only real tokens
    are returned.
    '''
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    return [token for token in re.split(r'\W+', text) if token]

In [None]:
# Tokenize each punctuation-free message with tokenize_text and store the token lists.
spam_ham_df['Body_tokenize'] = spam_ham_df['Body_punct_clean'].apply(tokenize_text)

In [None]:
# Preview the frame, which now includes the tokenized 'Body_tokenize' column.
apply_styling(spam_ham_df.head(),caption)

### <u>Stopword removal from the spam_ham_df tokenized sentences.</u>

In [None]:
def remove_stopword(text: [str]) -> [str] :
    '''
    Return @list[str]
           @Input: @text : list[str]

    Removes the stopwords from a tokenized sentence and returns the remaining
    tokens in their original order and original casing.

    The comparison is case-insensitive: the stopword list is lowercase, so
    capitalised tokens such as "The" or "I" are matched via their lowercase
    form, in line with how clean_text lowercases text before filtering.
    '''
    # A set gives constant-time membership checks instead of scanning the list per token.
    stopword_set = set(stopword)
    return [txt for txt in text if txt.lower() not in stopword_set]

In [None]:
# Filter the stopwords out of every tokenized message with remove_stopword.
spam_ham_df['Body_without_stopword'] = spam_ham_df['Body_tokenize'].apply(remove_stopword)

In [None]:
# Preview the frame, which now includes the 'Body_without_stopword' column.
apply_styling(spam_ham_df.head(), caption)

### <u>Performing the Stemming on tokenized words of sentences.</u>

In [None]:
def stemming(text: [str]) -> [str]:
    """
    Return: [str]
    Input: @text: [str]

    Runs the shared NLTK Porter stemmer over every token and returns the
    stems in the same order as the input tokens.
    """
    stemmed_tokens = []
    for token in text:
        stemmed_tokens.append(ps.stem(token))
    return stemmed_tokens

In [None]:
# Stem every remaining token with the stemming helper. Stemming collapses
# related word forms, reducing the number of distinct tokens the model has to learn.
spam_ham_df['Body_stemmed'] = spam_ham_df['Body_without_stopword'].apply(stemming)

In [None]:
# Preview the frame, which now includes the stemmed 'Body_stemmed' column.
apply_styling(spam_ham_df.head(), caption)

### New Feature Exploration

#### <u>Considering Body Length as a feature</u>

In [None]:
# Feature engineering: body length, measured as the number of characters in the
# raw message that are not spaces.
spam_ham_df['Body_len'] = spam_ham_df['Body'].apply(
    lambda body: sum(1 for ch in body if ch != " ")
)

In [None]:
# Preview the frame, which now includes the 'Body_len' feature.
apply_styling(spam_ham_df.head(), caption)

### <u>Creating feature for % punctuation in body text.</u>

In [None]:
def count_punct(text):
    """
    Return: float - punctuation characters as a percentage of the non-space
            characters in the text
    Input: @text: str

    Returns 0.0 for empty or whitespace-only text, which has no non-space
    characters to divide by.
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Avoid ZeroDivisionError on empty / all-space messages.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space_len, 3) * 100

# Percentage of punctuation characters in each raw message.
spam_ham_df['punct%'] = spam_ham_df['Body'].apply(count_punct)

In [None]:
# Preview the frame, which now includes the 'punct%' feature.
apply_styling(spam_ham_df.head(), caption)

### <u>Evaluating above feature</u>

In [None]:
# Overlay the normalised body-length distributions of both classes on shared bins.
bins = np.linspace(0, 200, 40)

for label in ("spam", "ham"):
    plt.hist(spam_ham_df.loc[spam_ham_df['Type'] == label, 'Body_len'],
             bins, alpha=0.5, density=True, label=label)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.title("Body length by message type")
plt.xlabel("Body length (non-space characters)")
plt.ylabel("Density")
plt.legend(loc="upper left")
plt.show()

### <u>Observations : </u>
 - We can clearly see that spam and ham are largely separable from each other on the basis of body length.
 - We do not need to apply a Box-Cox transformation to this feature, as its distribution is bimodal.
 - Some short messages are present in both categories, but a significant number of
   messages are clearly separable using this feature.

In [None]:
# Overlay the normalised punctuation-percentage distributions of both classes on shared bins.
bins = np.linspace(0, 50, 40)

for label in ("spam", "ham"):
    plt.hist(spam_ham_df.loc[spam_ham_df['Type'] == label, 'punct%'],
             bins, alpha=0.5, density=True, label=label)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.title("Punctuation percentage by message type")
plt.xlabel("Punctuation (% of non-space characters)")
plt.ylabel("Density")
plt.legend(loc="upper left")
plt.show()

### <u>Observations : </u>
 - This feature is not very significant, as the two categories are not separable from each other.
 - However, we can apply a transformation to the data, as the distribution is very skewed.

<strong><i>For more information about transformations, [click here](https://machinelearningmastery.com/how-to-transform-data-to-fit-the-normal-distribution/).</i></strong>

### <u>Performing Transformation punct%</u>

In [None]:
# Try root transformations (exponent 1/i) of punct%. Only positive exponents
# are considered, as negative ones will not help much in this scenario.
# A fixed bin count is used instead of np.linspace because the value range
# shrinks as i grows, so fixed bin edges would not fit every plot.
for i in range(1, 6):
    transformed = spam_ham_df['punct%'] ** (1 / i)
    plt.hist(transformed, bins=40)
    plt.title(f"Transformation of 1/{i}")
    plt.show()

### <u>Observations : </u>
 - As we increase the value of i, the distribution gets closer and closer to a normal distribution; at <strong>i = 4</strong> and <strong>i = 5</strong> it looks approximately normal.
 - The tall bar that remains at <strong>0</strong> is there because <strong>0 raised to any positive power is still 0</strong>, so messages without any punctuation stay at 0 under every transformation.

### <u>Vectorizing data using TFIDFVectorizer</u>

In [None]:
# Split into train and test sets before vectorizing, so the TF-IDF vocabulary
# is learned from the training messages only.
# random_state makes the split (and every score below) reproducible on re-run;
# stratify keeps the spam/ham ratio the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    spam_ham_df[['Body', 'Body_len', 'punct%']],
    spam_ham_df['Type'],
    test_size=0.2,
    random_state=42,
    stratify=spam_ham_df['Type'],
)

In [None]:
# Built once so the analyzer does not scan the stopword list for every token.
_STOPWORD_SET = frozenset(stopword)


def clean_text(text):
    """
    Return: [str] - stemmed tokens of the message
    Input: @text: str - raw message body

    Analyzer for the TF-IDF vectorizer: lowercases the text, removes
    punctuation, splits on runs of non-word characters, drops stopwords and
    stems what is left. Empty strings produced by the split (text starting or
    ending with a separator) are discarded so they do not become a feature.
    """
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    return [ps.stem(word) for word in tokens if word and word not in _STOPWORD_SET]

In [None]:
# Fit the TF-IDF vocabulary on the training messages only, then transform both splits.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['Body'])

tfidf_train = tfidf_vect_fit.transform(X_train['Body'])
tfidf_test = tfidf_vect_fit.transform(X_test['Body'])


def _with_tfidf(features_df, tfidf_matrix):
    """Return the engineered features followed by the dense TF-IDF columns."""
    combined = pd.concat([features_df[['Body_len', 'punct%']].reset_index(drop=True),
                          pd.DataFrame(tfidf_matrix.toarray())], axis=1)
    # The TF-IDF columns are integer-named while the engineered ones are strings;
    # scikit-learn >= 1.2 rejects frames whose column names mix types, so make
    # every name a string.
    combined.columns = combined.columns.astype(str)
    return combined


X_train_vect = _with_tfidf(X_train, tfidf_train)
X_test_vect = _with_tfidf(X_test, tfidf_test)

In [None]:
# Inspect the first rows of the combined training feature matrix.
X_train_vect.head()

### <u>Create Models and evaluate them on the basis of Accuracy, Recall and Precision.</u>

##### Fitting RandomForestClassifier with different Hyper Parameter settings and using GridSearchCV

In [None]:
# Grid-search RandomForestClassifier over several hyperparameter settings.
# The estimator is seeded so the ranking below is reproducible on re-run.
rf_base = RandomForestClassifier(random_state=42)

param = { # different parameter settings
    'n_estimators': [10, 150, 300],
    'max_depth': [30, 60, 90, None]
}

clf = GridSearchCV(rf_base, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_train_vect, y_train)
# Show the five best configurations by mean cross-validated score.
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

#### Observations:
 - We can see that max_depth does not affect the test score much, whereas changing n_estimators does affect the test score of our model.
 
#### <u>Conclusion:</u>
 - We can say that <strong>max_depth</strong> can be <strong>None</strong> and <strong>n_estimators</strong> will be <strong>150</strong> as we have seen from above dataframe.

In [None]:
# Create a RandomForestClassifier with the following configuration:
# @n_estimators = 150
# @max_depth = None (trees may grow to any depth)
# @n_jobs = -1 (build the trees in parallel)
# @random_state = 42 (makes the reported metrics reproducible on re-run)

#=======================================================================
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, random_state=42)
#=======================================================================
# time.perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()
rf_model = rf.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = (end - start)

#=======================================================================
start = time.perf_counter()
y_pred = rf_model.predict(X_test_vect)
end = time.perf_counter()
pred_time = (end - start)
#=======================================================================

# Metrics are computed on the test split with 'spam' as the positive class.
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
accuracy = (y_pred==y_test).sum()/len(y_pred)

print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round(accuracy, 3)))

#### <u>Fitting GradientBoostingClassifier with different Hyper Parameter settings and using GridSearchCV</u>

In [None]:
# Grid-search GradientBoostingClassifier over several hyperparameter settings.
# The estimator is seeded so the ranking below is reproducible on re-run.
gb = GradientBoostingClassifier(random_state=42)
param = {
    'n_estimators': [100, 150],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_train_vect, y_train)
# Show the five best configurations by mean cross-validated score.
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

#### Observations:
 - We can see that the best-scoring configuration has max_depth = 7 and n_estimators = 150.
 
#### <u>Conclusion:</u>
 - We can say that <strong>max_depth</strong> can be <strong>7</strong> and <strong>n_estimators</strong> will be <strong>150</strong> as we have seen from above dataframe.

In [None]:
# Create a GradientBoostingClassifier with the following configuration:
# @n_estimators = 150
# @max_depth = 7
# @random_state = 42 (makes the reported metrics reproducible on re-run)

#=======================================================================
gb = GradientBoostingClassifier(n_estimators=150, max_depth=7, random_state=42)
#=======================================================================

#=======================================================================
# time.perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()
gb_model = gb.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = (end - start)
#=======================================================================

#=======================================================================
start = time.perf_counter()
y_pred = gb_model.predict(X_test_vect)
end = time.perf_counter()
pred_time = (end - start)
#=======================================================================

# Metrics are computed on the test split with 'spam' as the positive class.
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
accuracy = (y_pred==y_test).sum()/len(y_pred)

print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
# Classify a single hand-written message with the gradient boosting model.
text = "IMPORTANT - You could be entitled up to Â£3,160 in compensation from mis-sold PPI on a credit card or loan. Please reply PPI for info or STOP to opt out."
#=======================================================================
def length(x):
    """Return the number of non-space characters in x (same definition as 'Body_len')."""
    return len(x) - x.count(" ")

punct = count_punct(text)

predict_df = pd.DataFrame(data=[(length(text),punct)],columns =['Body_len', 'punct%'])
#=======================================================================
sample_features = pd.concat([predict_df, pd.DataFrame(tfidf_vect_fit.transform([text]).toarray())], axis=1)
# scikit-learn >= 1.2 rejects frames whose column names mix str and int, so
# make every column name a string before predicting.
sample_features.columns = sample_features.columns.astype(str)
gb_model.predict(sample_features)

In [None]:
# Classify the same message with the random forest model.
rf_sample_features = pd.concat([predict_df, pd.DataFrame(tfidf_vect_fit.transform([text]).toarray())], axis=1)
# scikit-learn >= 1.2 rejects frames whose column names mix str and int, so
# make every column name a string before predicting.
rf_sample_features.columns = rf_sample_features.columns.astype(str)
rf.predict(rf_sample_features)

#### <u>Conclusion</u>
- As we can see, both models perform very well with their tuned hyperparameter configurations and achieve good accuracy.