##TRAINING THE MODEL

In [1]:
import pandas as pd

In [2]:
train_path = "../data/NqndMEyZakuimmFI.csv"
train_df = pd.read_csv(train_path)

In [3]:
train_info = train_df.info()
train_sample = train_df.sample(5, random_state=42)
train_distribution = train_df['fraudulent'].value_counts(normalize=True)

train_df.shape, train_info, train_sample, train_distribution

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14304 entries, 0 to 14303
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               14304 non-null  int64 
 1   title                14304 non-null  object
 2   location             14024 non-null  object
 3   department           5029 non-null   object
 4   salary_range         2283 non-null   object
 5   company_profile      11632 non-null  object
 6   description          14303 non-null  object
 7   requirements         12172 non-null  object
 8   benefits             8501 non-null   object
 9   telecommuting        14304 non-null  int64 
 10  has_company_logo     14304 non-null  int64 
 11  has_questions        14304 non-null  int64 
 12  employment_type      11547 non-null  object
 13  required_experience  8629 non-null   object
 14  required_education   7805 non-null   object
 15  industry             10378 non-null  object
 16  func

((14304, 18),
 None,
       job_id                                             title  \
 9844   16792                    Senior Developer Ruby on Rails   
 4509   12602             Beauty & Fragrance consultants needed   
 1010   14785                 IT Consultant / Software Enginner   
 7495   14229  SQL Server DBA Job opportunity at Barrington, IL   
 5173    6118                                      Web Designer   
 
                 location            department  salary_range  \
 9844      PL, WP, Poznań                   NaN           NaN   
 4509          GB, , Fife                   NaN           NaN   
 1010       GR, I, Athens  Consulting Workforce           NaN   
 7495  US, IL, Barrington                   NaN  90000-110000   
 5173            HU, BU,                    NaN           NaN   
 
                                         company_profile  \
 9844  Hello, we are Netguru and we love web developm...   
 4509  Established on the principles that full time e...   
 10

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [6]:
train_df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [14]:
target = 'fraudulent'
text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
cat_cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']
num_cols = ['telecommuting', 'has_company_logo', 'has_questions']
selected_cols = text_cols + cat_cols + num_cols + [target]

df = train_df[selected_cols].copy()
df.head()

Unnamed: 0,title,company_profile,description,requirements,benefits,employment_type,required_experience,required_education,industry,function,telecommuting,has_company_logo,has_questions,fraudulent
0,Contact Center Representatives,Tidewater Finance Co. was established in 1992 ...,"Tidewater Finance Company, located in Virginia...",The position requires the following qualificat...,Our company offers a competitive salary plus B...,Full-time,Entry level,Unspecified,Financial Services,Customer Service,0,1,0,0
1,Customer Service Associate,"Novitex Enterprise Solutions, formerly Pitney ...",The Customer Service Associate will be based i...,QualificationsMinimum of 1 year customer servi...,,Full-time,Entry level,High School or equivalent,Telecommunications,Customer Service,0,1,0,0
2,Automated Test Analyst,SilverStripe CMS &amp; Framework is an open so...,We are looking for a dedicated and passionate ...,,,Full-time,Mid-Senior level,,Information Technology and Services,,0,1,1,0
3,Inside Sales Professional-Omaha,"ABC Supply Co., Inc. is the nation’s largest w...","As a Sales Representative, you will provide as...","As a Sales Representative, you must have the a...",Your benefits package as a Sales Representativ...,Full-time,,,Building Materials,Sales,0,1,0,0
4,Content Marketing/SEO Manager,MeUndies is a lifestyle brand that is transfor...,MeUndies is a lifestyle brand that is transfor...,REQUIREMENTS/QUALIFICATIONS/PERSONAL ATTRIBUTE...,"WHY MEUNDIES?We're a fast-growing, VC-backed c...",Full-time,Mid-Senior level,Bachelor's Degree,Internet,Marketing,0,1,0,0


In [None]:
df['text'] = df[text_cols].fillna('').agg(' '.join, axis=1)


In [16]:
X = df[['text']+ cat_cols+ num_cols]
y = df[target]

In [None]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

In [18]:
class_weights

array([ 0.52545735, 10.32034632])

In [19]:
text_transformer = Pipeline(steps=[
    ('tfdif', TfidfVectorizer(max_features=5000, stop_words='english'))
])

In [20]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [25]:
preprocessor = ColumnTransformer(transformers=[
    ('text', text_transformer, 'text'),
    ('cat', cat_transformer, cat_cols)
], remainder='passthrough')

In [24]:
preprocessor

0,1,2
,transformers,"[('text', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [26]:
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(max_iter=200, class_weight=class_weights_dict))
])

In [27]:
pipeline

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{0: np.float64(0.5254573506722504), 1: np.float64(10.32034632034632)}"
,random_state,
,solver,'lbfgs'
,max_iter,200


In [29]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [30]:
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,"{0: np.float64(0.5254573506722504), 1: np.float64(10.32034632034632)}"
,random_state,
,solver,'lbfgs'
,max_iter,200


In [33]:
y_pred = pipeline.predict(X_val)
f1 = f1_score(y_val, y_pred)
report = classification_report(y_val, y_pred, output_dict=True)

f1, report

(0.6684636118598383,
 {'0': {'precision': 0.9942944085203499,
   'recall': 0.9603232916972814,
   'f1-score': 0.9770136423098487,
   'support': 2722.0},
  '1': {'precision': 0.5344827586206896,
   'recall': 0.8920863309352518,
   'f1-score': 0.6684636118598383,
   'support': 139.0},
  'accuracy': 0.9570080391471514,
  'macro avg': {'precision': 0.7643885835705198,
   'recall': 0.9262048113162666,
   'f1-score': 0.8227386270848435,
   'support': 2861.0},
  'weighted avg': {'precision': 0.9719547303183043,
   'recall': 0.9570080391471514,
   'f1-score': 0.9620229208024906,
   'support': 2861.0}})

In [34]:
import joblib
import os

In [43]:
model_dir = "../model"
os.makedirs(model_dir, exist_ok=True)

In [44]:
model_path = os.path.join(model_dir, "fraud_detector_pipeline.pkl")
joblib.dump(pipeline, model_path)

['../model\\fraud_detector_pipeline.pkl']

In [46]:
model_path

'../model\\fraud_detector_pipeline.pkl'