In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import tensorflow
import joblib
from os.path import exists
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.models import Sequential
from keras.layers import LSTM
from tensorflow.keras.layers import Embedding,  Bidirectional
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score


2022-12-10 14:36:31.403822: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Analyze Data
first we analyze data, how many fields does it have, how good they are? is there balanced data or NaN in there? how many category for categorical values? etc.

In [2]:
df = pd.read_csv("fake_job_postings.csv")
df = df.fillna("")
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
mmap = {}
for l in df['industry']:
    if l == '': continue
    if l in mmap:
        mmap[l] += 1
    else:
        mmap[l] = 1
mmap

{'Marketing and Advertising': 828,
 'Computer Software': 1376,
 'Hospital & Health Care': 497,
 'Online Media': 101,
 'Information Technology and Services': 1734,
 'Financial Services': 779,
 'Management Consulting': 130,
 'Events Services': 50,
 'Internet': 1062,
 'Facilities Services': 94,
 'Consumer Electronics': 62,
 'Telecommunications': 342,
 'Consumer Services': 358,
 'Construction': 158,
 'Oil & Energy': 287,
 'Education Management': 822,
 'Building Materials': 78,
 'Banking': 84,
 'Food & Beverages': 72,
 'Food Production': 44,
 'Health, Wellness and Fitness': 127,
 'Insurance': 123,
 'E-Learning': 139,
 'Cosmetics': 65,
 'Staffing and Recruiting': 127,
 'Venture Capital & Private Equity': 29,
 'Leisure, Travel & Tourism': 76,
 'Human Resources': 108,
 'Pharmaceuticals': 42,
 'Farming': 24,
 'Legal Services': 97,
 'Luxury Goods & Jewelry': 4,
 'Machinery': 11,
 'Real Estate': 175,
 'Mechanical or Industrial Engineering': 37,
 'Public Relations and Communications': 58,
 'Consum

In [4]:
mmap = {}
for l in df['function']:
    if l == '': continue
    if l in mmap:
        mmap[l] += 1
    else:
        mmap[l] = 1
mmap

{'Marketing': 830,
 'Customer Service': 1229,
 'Sales': 1468,
 'Health Care Provider': 338,
 'Management': 317,
 'Information Technology': 1749,
 'Other': 325,
 'Engineering': 1348,
 'Administrative': 630,
 'Design': 340,
 'Production': 116,
 'Education': 325,
 'Supply Chain': 36,
 'Business Development': 228,
 'Product Management': 114,
 'Financial Analyst': 33,
 'Consulting': 144,
 'Human Resources': 205,
 'Project Management': 183,
 'Manufacturing': 74,
 'Public Relations': 76,
 'Strategy/Planning': 46,
 'Advertising': 90,
 'Finance': 172,
 'General Business': 68,
 'Research': 50,
 'Accounting/Auditing': 212,
 'Art/Creative': 132,
 'Quality Assurance': 111,
 'Data Analyst': 82,
 'Business Analyst': 84,
 'Writing/Editing': 132,
 'Distribution': 24,
 'Science': 14,
 'Training': 38,
 'Purchasing': 15,
 'Legal': 47}

In [5]:
edu_map = {}
for l in df['required_education']:
    if l == '': continue
    if l in edu_map:
        edu_map[l] += 1
    else:
        edu_map[l] = 1
edu_map

{"Bachelor's Degree": 5145,
 "Master's Degree": 416,
 'High School or equivalent': 2080,
 'Unspecified': 1397,
 'Some College Coursework Completed': 102,
 'Vocational': 49,
 'Certification': 170,
 'Associate Degree': 274,
 'Professional': 74,
 'Doctorate': 26,
 'Some High School Coursework': 27,
 'Vocational - Degree': 6,
 'Vocational - HS Diploma': 9}

In [6]:
locNum = {}
for l in df['location']:
    frags = l.split(',')
    country = frags[0]
    if country in locNum:
        locNum[country] += 1
    else:
        locNum[country] = 1
locNum

{'US': 10656,
 'NZ': 333,
 'DE': 383,
 'GB': 2384,
 'AU': 214,
 'SG': 80,
 'IL': 72,
 'AE': 54,
 'CA': 457,
 'IN': 276,
 'EG': 52,
 'PL': 76,
 'GR': 940,
 '': 346,
 'PK': 27,
 'BE': 117,
 'BR': 36,
 'SA': 15,
 'DK': 42,
 'RU': 20,
 'ZA': 40,
 'CY': 11,
 'HK': 77,
 'TR': 17,
 'IE': 114,
 'LT': 23,
 'JP': 20,
 'NL': 127,
 'AT': 14,
 'KR': 10,
 'FR': 70,
 'EE': 72,
 'TH': 10,
 'PA': 9,
 'KE': 7,
 'MU': 14,
 'MX': 18,
 'RO': 46,
 'MY': 21,
 'FI': 29,
 'CN': 15,
 'ES': 66,
 'SE': 49,
 'CL': 2,
 'UA': 13,
 'QA': 21,
 'IT': 31,
 'LV': 6,
 'IQ': 10,
 'BG': 17,
 'PH': 132,
 'CZ': 6,
 'VI': 3,
 'MT': 13,
 'HU': 14,
 'BD': 2,
 'KW': 2,
 'LU': 9,
 'NG': 10,
 'RS': 7,
 'BY': 9,
 'VN': 4,
 'ID': 13,
 'ZM': 2,
 'NO': 8,
 'BH': 9,
 'UG': 1,
 'CH': 15,
 'TT': 4,
 'SD': 1,
 'SK': 2,
 'AR': 9,
 'TW': 4,
 'PT': 18,
 'PE': 1,
 'CO': 1,
 'IS': 2,
 'SI': 1,
 'MA': 1,
 'AM': 2,
 'TN': 2,
 'GH': 1,
 'AL': 1,
 'HR': 1,
 'CM': 1,
 'SV': 1,
 'NI': 4,
 'LK': 2,
 'JM': 1,
 'KZ': 1,
 'KH': 1}

In [7]:
salary_map = {}
for l in df['salary_range']:
    if l == '': continue
    if l in locNum:
        salary_map[l] += 1
    else:
        salary_map[l] = 1
for key in salary_map:
    print(key,':',salary_map[key],end = ' ')

20000-28000 : 1 100000-120000 : 1 120000-150000 : 1 50000-65000 : 1 40000-50000 : 1 60-80 : 1 65000-70000 : 1 75-115 : 1 75000-110000 : 1 17000-20000 : 1 16000-28000 : 1 95000-115000 : 1 15000-18000 : 1 50000-70000 : 1 45000-60000 : 1 30000-40000 : 1 70000-90000 : 1 10000-14000 : 1 50-110 : 1 28000-45000 : 1 0-34300 : 1 35000-40000 : 1 9-Dec : 1 44000-57000 : 1 18500-28000 : 1 55000-75000 : 1 30000-35000 : 1 0-0 : 1 20000-40000 : 1 360000-600000 : 1 50000-80000 : 1 80000-100000 : 1 52000-78000 : 1 15750-15750 : 1 40000-65000 : 1 45000-50000 : 1 30000-37000 : 1 45000-67000 : 1 35000-100000 : 1 180000-216000 : 1 45000-65000 : 1 28000-32000 : 1 0-1000 : 1 36000-40000 : 1 80000-110000 : 1 35000-73000 : 1 19000-19000 : 1 60000-120000 : 1 120000-15000000 : 1 42000-55000 : 1 90000-120000 : 1 100000-150000 : 1 28000-38000 : 1 1600-1700 : 1 50000-60000 : 1 30000-70000 : 1 32000-40000 : 1 50-100 : 1 9000-17000 : 1 23040-28800 : 1 105-110 : 1 13000-16000 : 1 100000-180000 : 1 45000-55000 : 1 9000

## Data Cleaning
Because we are operating the plane in a algebra space, we only want numerical values. For all the 17 features, I would perform data pre processing as follows:

 job id: maybe related to job created time? treat as integer
 
 title: split into several features, like intern, account, etc. As first input, could gather into text vector.
 
 location: split into US and non US group because lack of data in some minor countries, like: is_US(1 for in US 0 for not) 
 
 department, missing too much data, maybe will collect as text
 
 salary range, missing data, may use as 3, has_salary_range, salary_lower, salary_higher, treat as int.
 
 company profile: vectorize, gather as text vector.
 
 desc, req, vectorize, split, gather as text vector.
 
 benefits, missing data, may use and vectorize. If there are, gather as text vector.
 
 telecommuting	has_company_logo	has_questions, use as it is
 
 mployment_type	required_experience	required_education	industry	function	split and use, maybe one more feature for having NaN or not
 
 fraudulent: target

In [8]:
CleanDf = pd.DataFrame()

In [9]:
CleanDf['telecommuting'] = df['telecommuting']
CleanDf['has_company_logo'] = df['has_company_logo']
CleanDf['has_questions'] = df['has_questions']

In [10]:
edu_map = {}
for l in df['required_education']:
    if l == '': continue
    if l in edu_map:
        edu_map[l] += 1
    else:
        edu_map[l] = 1
add_map = {}
add_map['has_edu_require'] = []
for key in edu_map:
    add_map[key] = []
for l in df['required_education']:
    if l == '':
        add_map['has_edu_require'].append(0)
        for key in edu_map:
            add_map[key].append(0)
    else:
        add_map['has_edu_require'].append(1)
        for key in edu_map:
            if key == l: add_map[key].append(1)
            else: add_map[key].append(0)
for key in add_map:
    CleanDf[key] = add_map[key]

In [11]:
salary_label = []
lower_salary = []
upper_salary = []
for l in df['salary_range']:
    try:
        frags = l.split('-')
        a = int(frags[0])
        b = int(frags[1])
        salary_label.append(1)
        lower_salary.append(a)
        upper_salary.append(b)
    except:
        salary_label.append(0)
        lower_salary.append(0)
        upper_salary.append(0)
# print(salary_label,lower_salary,upper_salary)
# min max normalization
minimum = min(lower_salary)
diff = max(lower_salary) - min(lower_salary)
for i in range(len(lower_salary)): lower_salary[i] = (lower_salary[i] - minimum)/diff

minimum = min(upper_salary)
diff = max(upper_salary) - min(upper_salary)
for i in range(len(upper_salary)): upper_salary[i] = (upper_salary[i] - minimum)/diff

CleanDf['salary_label'] = salary_label
CleanDf['lower_salary'] = lower_salary
CleanDf['upper_salary'] = upper_salary

In [12]:
is_US = []
for l in df['location']:
    frags = l.split(',')
    if frags[0] == 'US': is_US.append(1)
    else: is_US.append(0)
CleanDf['in_US'] = is_US

In [13]:
# IT / CS industry?

In [14]:
exp_map = {}
for l in df['required_experience']:
    if l == '': continue
    if l in exp_map:
        exp_map[l] += 1
    else:
        exp_map[l] = 1
add_map = {}
add_map['has_exp_require'] = []
for key in exp_map:
    add_map[key] = []
for l in df['required_experience']:
    if l == '':
        add_map['has_exp_require'].append(0)
        for key in exp_map:
            add_map[key].append(0)
    else:
        add_map['has_exp_require'].append(1)
        for key in exp_map:
            if key == l: add_map[key].append(1)
            else: add_map[key].append(0)
for key in add_map:
    CleanDf[key] = add_map[key]

In [15]:
func_map = {}
for l in df['function']:
    if l == '': continue
    if l in func_map:
        func_map[l] += 1
    else:
        func_map[l] = 1
add_map = {}
add_map['has_function'] = []
for key in func_map:
    add_map[key] = []
for l in df['function']:
    if l == '':
        add_map['has_function'].append(0)
        for key in func_map:
            add_map[key].append(0)
    else:
        add_map['has_function'].append(1)
        for key in func_map:
            if key == l: add_map[key].append(1)
            else: add_map[key].append(0)
for key in add_map:
    CleanDf[key] = add_map[key]

In [16]:
CleanDf['fraudulent'] = df['fraudulent']

In [17]:
CleanDf

Unnamed: 0,telecommuting,has_company_logo,has_questions,has_edu_require,Bachelor's Degree,Master's Degree,High School or equivalent,Unspecified,Some College Coursework Completed,Vocational,...,Quality Assurance,Data Analyst,Business Analyst,Writing/Editing,Distribution,Science,Training,Purchasing,Legal,fraudulent
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17876,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17878,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Vectorize 
The description should contain very much detail, we will need to vectorize it. One possible solution is TF-idf vectorization, but this method will calculate the frequency only while ignoring the connection between words during NLP processing. I prefer to use tensorflow Tokenizer to vectorize description to catch words order infomation. Preparing for a LSTM network analysis.

In [18]:
df['text'] = df['title'] + " " + df['department'] + \
             " " + df['company_profile'] + " " + \
             df['description'] + " " + \
             df['requirements'] + " " +\
             df['benefits'] + " " +\
             df['function'] + " " +\
             df['required_experience']+ " "+\
             df["required_education"]+ " " +\
             df["industry"]
df_last = df.drop(columns = ['job_id','title','location','department', 'telecommuting',
                             'salary_range','company_profile','description','requirements','benefits','employment_type',
                             'required_experience','required_education','industry','function'])
print(df_last.isna().sum())
df_last.head()

has_company_logo    0
has_questions       0
fraudulent          0
text                0
dtype: int64


Unnamed: 0,has_company_logo,has_questions,fraudulent,text
0,1,0,0,"Marketing Intern Marketing We're Food52, and w..."
1,1,0,0,Customer Service - Cloud Video Production Succ...
2,1,0,0,Commissioning Machinery Assistant (CMA) Valor...
3,1,0,0,Account Executive - Washington DC Sales Our pa...
4,1,1,0,Bill Review Manager SpotSource Solutions LLC ...


In [19]:
def text_clean():
    text_list = []
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    for text in df_last.text:
        text = re.sub("[^a-zA-Z]", " ", text)
        text = text.lower()
        text = text.strip()
        text = nltk.word_tokenize(text)
        text = [word for word in text if not word in set(stopwords.words("english"))] # dropping stopwords
        lemma = nltk.WordNetLemmatizer()
        text = [lemma.lemmatize(word) for word in text]
        text = " ".join(text)
        text = text.replace('  ',' ')
        text_list.append(text)
    dfData = pd.DataFrame()
    dfData['data'] = text_list
    dfData['target'] = df['fraudulent']
    dfData.to_csv('washtext.csv')
if exists('./washtext.csv'): dfData = pd.read_csv('washtext.csv')
else: dfData = text_clean()
dfData = dfData[['data','target']]
text_list = dfData['data']

In [20]:
dfData

Unnamed: 0,data,target
0,marketing intern marketing food created ground...,0
1,customer service cloud video production succes...,0
2,commissioning machinery assistant cma valor se...,0
3,account executive washington dc sale passion i...,0
4,bill review manager spotsource solution llc gl...,0
...,...,...
17875,account director distribution sale vend lookin...,0
17876,payroll accountant accounting weblinc e commer...,0
17877,project cost control staff engineer cost contr...,0
17878,graphic designer nemsia studio looking experie...,0


In [21]:
max_features = 10000
max_vecLen = max([len(i.split(' ')) for i in dfData['data']])
t = Tokenizer(num_words = max_features)
# fit the tokenizer on the documents
t.fit_on_texts(text_list)
encoded_docs = t.texts_to_sequences(text_list)
joblib.dump(t, './tokinezer_file') # For future Use, if we have more instances coming..

['./tokinezer_file']

In [22]:
embedded_docs=pad_sequences(encoded_docs,padding='pre',maxlen=max_vecLen)
print(embedded_docs)# all reviews must be same lenght. we equals all reviews lenght
embedded_docs.shape

[[   0    0    0 ...  132   26  667]
 [   0    0    0 ...  580   26  280]
 [   0    0    0 ...  568   24 1515]
 ...
 [   0    0    0 ...   10  646   11]
 [   0    0    0 ...   57  869   28]
 [   0    0    0 ...   27  122   36]]


(17880, 1426)

## Data Integration and Split
We will unite all data we have, split into train test set. then we use train set to produce a LSTM Network and get it's prediction to the whole set, use this as a parameter to out SVM or ANN followed.

In [23]:
dfWhole = CleanDf
dfWhole['text'] = text_list
WholeX = dfWhole.drop('fraudulent',axis = 1)
WholeY = dfWhole['fraudulent']

In [24]:
x_train_w, x_test_w, y_train, y_test = train_test_split(WholeX, WholeY, test_size= 0.1, random_state= 1)
# print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sum(y_train), sum(y_test))

In [25]:
x_train = pad_sequences(t.texts_to_sequences(x_train_w['text']),padding='pre',maxlen=max_vecLen) # for text
x_test = pad_sequences(t.texts_to_sequences(x_test_w['text']),padding='pre',maxlen=max_vecLen)

## LSTM Network analysis
those "data" vectors are transformed text while keeping the original data order. Different length of that will result in a empty training prefix "0". Therefore those data are ready for Long Short Term Memory network analysis.

In [26]:
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(max_features,embedding_vector_features,input_length=max_vecLen))
model1.add(Bidirectional(LSTM(20)))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

2022-12-10 14:36:49.001179: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1426, 40)          400000    
                                                                 
 bidirectional (Bidirectiona  (None, 40)               9760      
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 41        
                                                                 
Total params: 409,801
Trainable params: 409,801
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

cp = ModelCheckpoint("./bidirectional_model/" ,save_best_only = True)## creaitng model checkpoint
hist = model1.fit(x_train, y_train, epochs = 4, batch_size = 64,  callbacks = cp, validation_data=(x_test,y_test))

Epoch 1/4



INFO:tensorflow:Assets written to: ./bidirectional_model/assets


INFO:tensorflow:Assets written to: ./bidirectional_model/assets


Epoch 2/4



INFO:tensorflow:Assets written to: ./bidirectional_model/assets


INFO:tensorflow:Assets written to: ./bidirectional_model/assets


Epoch 3/4



INFO:tensorflow:Assets written to: ./bidirectional_model/assets


INFO:tensorflow:Assets written to: ./bidirectional_model/assets


Epoch 4/4


In [28]:
def eval_metrics(actual, prediction):
    print("Accuracy Score: {}".format(accuracy_score(actual, prediction)))
    print("Recall Score: {}".format(recall_score(actual, prediction)))
    print("f1 Score: {}".format(f1_score(actual, prediction)))
print("Test set:")
model1 = load_model("./bidirectional_model/")
pred = model1.predict(x_test)
pred = (pred >= 0.5)
eval_metrics(y_test, pred)
print("\n Train set:")
predTrain = model1.predict(x_train)
predTrain = (predTrain >= 0.5)
eval_metrics(y_train, predTrain)

Test set:
Accuracy Score: 0.9737136465324385
Recall Score: 0.5764705882352941
f1 Score: 0.6758620689655171

 Train set:
Accuracy Score: 0.993661446681581
Recall Score: 0.9014084507042254
f1 Score: 0.9324503311258279


## SVM Predict
we get a SVM as the last step predicting, see if we can get higher f1 score by this.
This SVM will use kernel we choose to optimize the outcome, while given all parameters in our cleaned dataframe and the LSTM prediction value

In [29]:
x = CleanDf.drop(["fraudulent"],axis = 1)
y = CleanDf['fraudulent']
x_train1 = x_train_w.drop(["text"],axis=1)
x_train1['LSTM'] = model1.predict(x_train)
x_test1 = x_test_w.drop(["text"],axis=1)
x_test1['LSTM'] = model1.predict(x_test)
y_train1 = y_train
y_test1 = y_test
print(x_train1.shape, x_test1.shape, y_train1.shape, y_test1.shape)



In [30]:
svc=SVC(kernel='poly') 
svc.fit(x_train1,y_train1)
y_pred=svc.predict(x_test1)
eval_metrics(y_test1, y_pred)

Accuracy Score: 0.9776286353467561
Recall Score: 0.5764705882352941
f1 Score: 0.7101449275362318


In [31]:
# Training score
y_pred=svc.predict(x_train1)
eval_metrics(y_train1, y_pred)

Accuracy Score: 0.9909271687795178
Recall Score: 0.8220230473751601
f1 Score: 0.8979020979020979


In [34]:
from sklearn import metrics
print("area under curve (auc): ", metrics.roc_auc_score(y_train1, y_pred))

area under curve (auc):  0.9107829298661444
