# Train script

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from urllib.parse import urlparse
import pickle 

# Import ML library and its modules
import sklearn
 
# Library for Dealing with imbalanced datasets
import imblearn
from imblearn.over_sampling import SMOTE

## Import train_test_split, cross_val_score, KFold - Validation and GridSearchCV  
from sklearn.model_selection import  train_test_split, cross_val_score, GridSearchCV, KFold 

# Import Classifiers - Modelling
from sklearn.ensemble import RandomForestClassifier 
 
# Data Engineering
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import Metrics - Performance Evaluation
from sklearn import metrics

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
pd.set_option('display.max_columns', None)

%matplotlib inline

In [4]:
# Parameters
feature_importance_threshold = 0.005
test_size = 0.2
random_state = 100
path="./"
file_name="sampled_webpages_classification_train_data.csv"

In [5]:
def load_data(path, file_name):
    df = pd.read_csv(path + file_name)
    return df

In [6]:
def clean_data(df):
    df['label']=np.where(df['label']=='bad', 1, 0) # bad=1 and good=0

##  Useful if there is any data conversion that needs to be done by numerical or categorical features
#   num_cols = (train.select_dtypes(exclude="object").columns.values).tolist()
#   cat_cols = train.select_dtypes(include="object").columns.values.tolist()
    return df

In [7]:
# def visualize_data(df):
#     # Numerical 
#     ## Univariate
#     ## Bivariate
#     ## Multivariate
    
#     # Categorical 
#     ## Univariate
#     ## Bivariate
#     ## Multivariate

In [8]:
def data_extractor(df):
    
    # Extract URL features 
    df["asperand_symbol"] = np.where(df['url'].str.contains("@"), 1, 0) # Existence of Asperand - @ symbol
    df["redirection_symbol"] = np.where(df['url'].str.removeprefix("http://").str.removeprefix("https://").str.contains("//"),
                                        1, 0) # Redirection Symbol // symbol
    
    
    df["hyphen_symbol"] = np.where(df["url"].apply(lambda x: urlparse(x).netloc).str.contains("-"),
                                   1, 0) # Hyphen(-) Symbol
    
    df["multilevel_subdomains"] = np.where(df["url"].apply(lambda x: urlparse(x).netloc).str.count("\.") > 3, 1, 0)
    
    df['content_len'] = df["content"].str.len()
    
    # Extract content features 
    df["iframe"] = np.where(df["content"].str.findall("<iframe>"), 1, 0) # Presence of iframe
    df["no_of_iframes"]= df["content"].str.findall("<iframe>").apply(lambda x: len(x)) # No. of iframes 
    df["no_of_find_fn"] = df["content"].str.findall("find\(\)").apply(lambda x: len(x)) # No. of find() fns used
    df["no_of_eval_fn"] = df["content"].str.findall("eval\(\)").apply(lambda x: len(x))  # No. of eval() fns used
    
    df["content"] = df["content"].str.lower()
    df["content"] = df["content"].str.replace('\d+'," ", regex=True) # Replacing numbers with spaces   
    df['content_len'] = df['content'].str.len() 
    
    return df


In [9]:
# def content_tf_idf_vectorizer(df, max_features=150, stop_words='english'): 
        
#     vectorizer = TfidfVectorizer(max_features=max_features, stop_words="english") # Defining vectorizer
#     vectorized_content = vectorizer.fit_transform(train["content"] )
    
#     vectorized_content_df = pd.DataFrame(data=vectorized_content.toarray(), columns=vectorizer.get_feature_names_out())
#     df = df.join(vectorized_content_df, how="left")
#     return df

In [10]:
def remove_url_ip_content_features(df): # As they are unique and do not help in our modeling
    df.drop(['url', 'ip_add', 'content'], axis=1, inplace =True)
    return df

In [11]:
def df_dictvectorizer(df):    
    
    dv = DictVectorizer(sparse=False)
    
    df_dict = df.to_dict(orient='records')
    df_dv = dv.fit_transform(df_dict)
    
    df_dv_cols = list(dv.get_feature_names_out())
    
    df = pd.DataFrame(data=df_dv, columns=df_dv_cols)
    
    return dv, df

In [12]:
def train_validation_split(df):
    X = df.drop('label', axis=1)
    y = df['label']
    
    x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    
    return  x_train, x_val, y_train, y_val

In [13]:
# Balancing the dataset samples to get equal samples for both classes to train
def train_balance_resampling(x_train, y_train):
    from imblearn.over_sampling import SMOTE

    SMOTE = SMOTE()
    x_train_SMOTE, y_train_SMOTE = SMOTE.fit_resample(x_train, y_train) # fit and resample to get equal samples for both classes
    
    return  x_train_SMOTE, y_train_SMOTE

In [14]:
# Model based Feature selection using RandomForestClassifier to select feature to consider for training
def feature_selection(x_train, y_train):
    
    rndf = RandomForestClassifier(n_estimators=150)
    rndf.fit(x_train, y_train)
    
    importance = pd.DataFrame.from_dict({'cols':x_train.columns, 'importance': rndf.feature_importances_})
    importance = importance.sort_values(by='importance', ascending=False)
    
    imp_cols = list(importance[importance.importance >= feature_importance_threshold].cols.values)
    
    return x_train[imp_cols], y_train, imp_cols

In [15]:
def training(path, file_name):
    
    df = load_data(path, file_name)
    df = clean_data(df)
    df = data_extractor(df)
    df = remove_url_ip_content_features(df)
    dv, df = df_dictvectorizer(df)
    
    x_train, x_val, y_train, y_val = train_validation_split(df)
    
    x_train_SMOTE, y_train_SMOTE = train_balance_resampling(x_train, y_train)
    
    x_train_SMOTE, y_train_SMOTE, imp_cols = feature_selection(x_train_SMOTE, y_train_SMOTE)
   
    # Hyperparameter tuning with GridSearchCV i.e. includes K-Fold crossvalidation
    param_grid = {'criterion':['gini','entropy','cross_entropy'],
                  'max_depth':[2, 3, 4, 5, 6, 7, 8, 9],
                  'random_state':[100],
                  'n_estimators':[200,400,600],
                  'n_jobs':[-1], 
                  'random_state':[100],
                  'verbose': [0]
                 }

    RF_grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv = 5, scoring='balanced_accuracy', verbose=1)

    RF_grid.fit(x_train_SMOTE, y_train_SMOTE)
    
    model = RF_grid.best_estimator_
     
    return imp_cols, dv, model

In [16]:
# Assigning the selected_features, dictvectorizer and the trained model objects to a variable
selected_features, dv, model = training(path, file_name) 

Fitting 5 folds for each of 72 candidates, totalling 360 fits


# Save the model

In [17]:
model_file = "model.bin"

with open(model_file,'wb') as f_out:
    pickle.dump((selected_features, dv, model), f_out) # Saving the selected_features, dictvectorizer and the trained model objects to a file


In [18]:
print(f"The model is saved to {model_file}")

The model is saved to model.bin
