### Importing the required modules/packages

In [83]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import re
import nltk
from nltk.corpus import stopwords
import string
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
from mlxtend.feature_selection import ColumnSelector
from sklearn.compose import ColumnTransformer
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading file and looking into the dimensions of data

In [84]:
df = pd.read_csv("spamraw_train.csv")
pd.set_option('display.max_colwidth',100)

print(f"Shape of Data --> {df.shape}\n")


Shape of Data --> (5000, 3)



In [85]:
df.head(2)

Unnamed: 0,id,sms_text,spam
0,1,Hope you are having a good week. Just checking in,0
1,2,K..give back my thanks.,0


## **Data Processing**

### Functions to Create new features and cleaning the data

In [86]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [87]:
specialChar = '#$%&()*+-/:;<=>?@[\]^_`{|}~'
moneysigns = '$£'

In [88]:
spamwords = ["free","recieve", "freemsg","winner", "urgent!",
             "charged","SMS","claim", "guaranteed", "end"]

In [89]:
## Creat punc
import re

def pc_SpecialChar(text):
    punct_count = sum([1 for char in text if char in specialChar])
    return (punct_count/(len(text) - text.count(' ')))*100

def clean_data(text):
    punct = "".join([word.lower() for word in text if word not in string.punctuation])
    splt = re.split('\W+',punct)    
    txt = [nltk.PorterStemmer().stem(word) for word in splt if word not in nltk.corpus.stopwords.words('english')]
    return "  ".join(txt)

def count_caps(text):
    splt = re.split('\W+',text)
    return len([word for word in splt if word.isupper()])

def count_exlc(text):
    i = 0
    for char in text:
      if char == '!':
        i += 1
    return i 

def count_fullstop(text):
    i = 0
    for char in text:
      if char == '.':
        i += 1
    return i

def count_at(text):
    i = 0
    for char in text:
      if char == '@':
        i += 1
    return i
# Has a number with more than 3 digits
def count_num(text):
    pattern = re.compile(r'\d\d\d.*')
    if len(pattern.findall(text)) > 0:
      return 1
    else: 
      return 0

# Has a url
def has_URL(text):
    pattern1 = re.compile(r'www.*')
    pattern2 = re.compile(r'https:*')
    if len(pattern1.findall(text))  > 0:
      return 1
    elif len(pattern1.findall(text)) > 0:
      return 1
    else: 
      return 0

def has_moneysymbol(text):
    i = 0
    for char in text:
      if char in '$£':
        i += 1
    return i 

def count_spamwords(text):
    i = 0
    for tokens in text.split():
      if tokens in spamwords:
        i += 1
    return i



def has_money(text):
    pattern1 = re.compile(r'[$£]\d[.\d][.\d]\d*')
    if len(pattern1.findall(text))  > 0:
      return 1
    else: 
      return 0


In [90]:
######################## Create new features Train and Test Data ########################
#- New features are created - 
#- 1) text_length (the total length of the text)
#- 2) Punct_pc (the percentage of punctuations in the text)
# -3) Total number of Caps words
# -4) Total number of words with exclamations in text

## New features Lexical
def new_features(df):
  df["SpecialChar_pc"] = df["sms_text"].apply(lambda x: pc_SpecialChar(x))
  df["text_length"] = df["sms_text"].apply(lambda x: len(x)-x.count(' '))
  df["caps_count"] = df["sms_text"].apply(lambda x: count_caps(x))
  df["exclamation_count"] = df["sms_text"].apply(lambda x: count_exlc(x))
  df["fullstop_count"] = df["sms_text"].apply(lambda x: count_fullstop(x))
  df["count_at"] = df["sms_text"].apply(lambda x: count_at(x))
  df["has_num"] = df["sms_text"].apply(lambda x: count_num(x))
  df["has_url"] = df["sms_text"].apply(lambda x: has_URL(x))
  df["has_moneysymbol"] = df["sms_text"].apply(lambda x: has_moneysymbol(x))
  df["has_money"] = df["sms_text"].apply(lambda x: has_money(x))

  ### Clean the Text
  df["Clean_text"] = df["sms_text"].apply(lambda x: clean_data(x))
  df["count_spamwords"] = df["Clean_text"].apply(lambda x: count_spamwords(x))
  return df

df = new_features(df)

In [91]:
y_train = df["spam"]
X_train = df.drop("spam",axis=1)

In [92]:
X_train.loc[X_train.sms_text == 'spam', :]

Unnamed: 0,id,sms_text,SpecialChar_pc,text_length,caps_count,exclamation_count,fullstop_count,count_at,has_num,has_url,has_moneysymbol,has_money,Clean_text,count_spamwords


Custom Transformer to select the Columns

In [93]:
## Customer transformer to Select features 
class ColumnExtractor(TransformerMixin):
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        Xcols = pd.Series(X[self.cols])
        return (Xcols)

## Model Training

In [94]:
## Model Training
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier

**Random Forest**

In [95]:
lex_features = list(X_train.select_dtypes(exclude='O').columns)
print(lex_features)

['id', 'SpecialChar_pc', 'text_length', 'caps_count', 'exclamation_count', 'fullstop_count', 'count_at', 'has_num', 'has_url', 'has_moneysymbol', 'has_money', 'count_spamwords']


In [96]:
num_cols =['SpecialChar_pc', 'text_length', 'caps_count', 'exclamation_count', 'fullstop_count', 'count_at', 'has_num', \
'has_url', 'has_moneysymbol', 'has_money', 'count_spamwords']

In [97]:
text_pipe = Pipeline([('select_text', ColumnExtractor(cols="Clean_text")),('tfidf', TfidfVectorizer())])
num_pipe = Pipeline([('scaler', ColumnTransformer([('scaler', MinMaxScaler(),num_cols)],
                                                             ))])
full_pipe = Pipeline([
    ('Fu', FeatureUnion([("text", text_pipe), ("num",num_pipe)])),
    ('RF', RandomForestClassifier(random_state=123))
])

In [98]:
param_grid = {
                 "RF__max_depth": [25, None],
                 "RF__n_estimators": [15,20,25],
                 "RF__max_features" : [50,60,'auto'],
                 "RF__class_weight":['balanced'],
                 'Fu__text__tfidf__max_df': [0.3],
                #'Fu__text__tfidf__min_df': [],
              'Fu__text__tfidf__ngram_range': [(1,4),(1,1)]
              
                } 

In [99]:
rf_grid = GridSearchCV(full_pipe,param_grid,cv=2,n_jobs=-1, verbose=3, scoring='f1_macro')
rf_grid_fit = rf_grid.fit(X_train, y_train)

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   20.4s finished


In [100]:
est = rf_grid_fit.best_estimator_
print(rf_grid_fit.best_score_)

0.9617019102204918


In [101]:
#print(est[-2])
rf_grid_fit.best_params_

{'Fu__text__tfidf__max_df': 0.3,
 'Fu__text__tfidf__ngram_range': (1, 1),
 'RF__class_weight': 'balanced',
 'RF__max_depth': 25,
 'RF__max_features': 50,
 'RF__n_estimators': 25}

**XGboost**

In [102]:
from xgboost import XGBClassifier
full_pipe_xgb = Pipeline([
    ('Fu_xgb', FeatureUnion([("text", text_pipe), ("num",num_pipe)])),
    ('XGB', XGBClassifier(random_state=123))
])
param_grid = {
                 "XGB__max_depth": [10,12, None],
                 "XGB__n_estimators": [120,125],
                 "XGB__max_features" : [35,38,39,'auto'],
                 'Fu_xgb__text__tfidf__max_df': [0.3,0.4],
                'Fu_xgb__text__tfidf__min_df': [0.1],
              'Fu_xgb__text__tfidf__ngram_range': [(1,4)]
              
                } 

In [103]:
xgb_grid = GridSearchCV(full_pipe_xgb,param_grid,cv=3,n_jobs=-1, verbose=3, scoring='f1_macro')
xgb_grid_fit = xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  1.2min finished


In [104]:
print(xgb_grid_fit.best_score_)

0.9550396102823099


In [105]:
#print(est[-2])
xgb_grid_fit.best_params_

{'Fu_xgb__text__tfidf__max_df': 0.3,
 'Fu_xgb__text__tfidf__min_df': 0.1,
 'Fu_xgb__text__tfidf__ngram_range': (1, 4),
 'XGB__max_depth': 10,
 'XGB__max_features': 35,
 'XGB__n_estimators': 125}

## Prediction

In [106]:
X_test = pd.read_csv("spamraw_test.csv")

## New features Lexical
X_test = new_features(X_test)

In [107]:
list(X_test.columns)

['id',
 'sms_text',
 'SpecialChar_pc',
 'text_length',
 'caps_count',
 'exclamation_count',
 'fullstop_count',
 'count_at',
 'has_num',
 'has_url',
 'has_moneysymbol',
 'has_money',
 'Clean_text',
 'count_spamwords']

In [108]:
## Prediction XGBoost
y_pred_xg = xgb_grid_fit.predict(X_test)

In [109]:
## Prediction Random Forest
y_pred_rand = rf_grid_fit.predict(X_test)

In [110]:
## Final Model 
## Load submission file and update predictions using all models
predvt1 = pd.DataFrame(y_pred_rand, columns=["Predicted"])
predvt1.to_csv("rf.csv", index=False)

In [111]:
## Final Model 
## Load submission file and update predictions using all models
predvt2 = pd.DataFrame(y_pred_xg, columns=["Predicted"])
predvt2.to_csv("xg.csv", index=False)