In [None]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords
from IPython.core.debugger import set_trace

In [None]:
class my_model():
    """Text classifier for job-posting records: TF-IDF features + a shallow decision tree.

    The input frame is expected to carry the columns listed in ``_TEXT_COLUMNS``.
    They are cleaned, stripped of stop words and merged into one document per
    row, which is then vectorized with TF-IDF and fed to a
    ``DecisionTreeClassifier``.

    Attributes set by ``fit``:
        X_data, Y_data: the raw training inputs, kept for reference.
        vectorizer: the ``TfidfVectorizer`` fitted on the training split.
        tree: the fitted ``DecisionTreeClassifier``.
        all_gensim_stop_words: the stop-word collection used for cleaning
            (defaults to gensim's ``STOPWORDS``; may be assigned beforehand
            to override it).
    """

    # Columns merged, in this order, into the single text document per row.
    _TEXT_COLUMNS = ('title', 'location', 'description', 'requirements', 'has_company_logo')
    # Placeholder text used when a column has a missing value.
    _FILL_VALUES = {
        'location': 'none',
        'description': 'not specified',
        'requirements': 'not specified',
    }
    # Columns from which web tags are stripped.
    _HTML_COLUMNS = ('title', 'description', 'requirements')

    def fit(self, X, y):
        """Train the vectorizer and the tree, and print the hold-out accuracy.

        Args:
            X: DataFrame containing the columns in ``_TEXT_COLUMNS``. It is
                not modified.
            y: labels aligned with the rows of ``X``.

        Returns:
            None.
        """
        # do not exceed 29 mins
        self.X_data = X
        self.Y_data = y

        x_train, x_test, y_train, y_test = self.pre_process(X, y)
        # Fixed seed (same value as the split) so repeated runs pick the same tree.
        self.tree = DecisionTreeClassifier(max_depth=2, random_state=7)
        self.tree.fit(x_train, y_train)
        print("Score:", self.tree.score(x_test, y_test))

        return

    def predict(self, X):
        """Predict a label for every row of ``X`` with the already-fitted model.

        The same text cleaning used in ``fit`` is applied, and the rows are
        projected onto the vocabulary learned during training. The model is
        NOT refitted and no rows are held out.

        Args:
            X: DataFrame containing the columns in ``_TEXT_COLUMNS``. It is
                not modified.

        Returns:
            Array of predicted labels, one per row of ``X``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if not hasattr(self, 'tree') or not hasattr(self, 'vectorizer'):
            raise RuntimeError("my_model.predict() called before fit()")
        documents = self._build_text(X)
        return self.tree.predict(self.vectorizer.transform(documents))

    def pre_process(self, X, y):
        """Clean ``X``, split it 70/30 and TF-IDF-vectorize both parts.

        The vectorizer is fitted on the training part only and stored on
        ``self.vectorizer`` so that ``predict`` can reuse the same vocabulary.

        Args:
            X: DataFrame containing the columns in ``_TEXT_COLUMNS``. It is
                not modified.
            y: labels aligned with the rows of ``X``.

        Returns:
            Tuple ``(x_train_vector, x_test_vector, y_train, y_test)``.
        """
        documents = self._build_text(X)

        # Split the per-row documents (a Series). Handing a DataFrame to the
        # vectorizer would make it iterate over column labels instead of rows.
        text_train, text_test, y_train, y_test = train_test_split(
            documents, y, test_size=0.3, shuffle=True, random_state=7)

        self.vectorizer = TfidfVectorizer()
        x_train_vector = self.vectorizer.fit_transform(text_train)
        x_test_vector = self.vectorizer.transform(text_test)

        return x_train_vector, x_test_vector, y_train, y_test

    def clean_all_columns(self, X, column_name):
        """Lower-case ``X[column_name]`` and drop stop words, in place.

        Missing values are treated as empty strings. Whitespace is normalized
        to single spaces as a by-product of the split/join.
        """
        stop_words = self._ensure_stop_words()
        X[column_name] = X[column_name].fillna('').astype(str).apply(
            lambda value: " ".join(
                token for token in value.lower().split() if token not in stop_words))

    def _ensure_stop_words(self):
        """Return the stop-word collection, defaulting to gensim's STOPWORDS."""
        if not hasattr(self, 'all_gensim_stop_words'):
            self.all_gensim_stop_words = STOPWORDS
        return self.all_gensim_stop_words

    def _build_text(self, X):
        """Return one cleaned text document per row of ``X`` as a Series named 'text'."""
        # Work on a copy so the caller's frame keeps its columns and values.
        frame = X[list(self._TEXT_COLUMNS)].copy()

        # Fill each column from itself (requirements must not be overwritten
        # with the description).
        for column, placeholder in self._FILL_VALUES.items():
            frame[column] = frame[column].fillna(placeholder)

        # Map has_company_logo 1/0 to 't'/'f'; any other value becomes empty text.
        frame['has_company_logo'] = frame['has_company_logo'].map({1: 't', 0: 'f'}).fillna('')

        for column in self._TEXT_COLUMNS:
            frame[column] = frame[column].fillna('').astype(str)

        # Remove web tags. regex=True is required: since pandas 2.0 the default
        # treats the pattern as a literal string.
        for column in self._HTML_COLUMNS:
            frame[column] = frame[column].str.replace(r'<[^>]*>', '', regex=True)

        for column in self._TEXT_COLUMNS:
            # Non-word characters become spaces; surplus whitespace is removed
            # by the split/join in clean_all_columns.
            frame[column] = frame[column].str.replace(r'\W', ' ', regex=True)
            self.clean_all_columns(frame, column)

        # Combine the columns into a single document per row.
        documents = frame[self._TEXT_COLUMNS[0]]
        for column in self._TEXT_COLUMNS[1:]:
            documents = documents + ' ' + frame[column]
        return documents.rename('text')

In [None]:

# Wall-clock timer covering both data loading and training.
start = time.time()

# Read the training set, replacing missing values with empty strings.
data = pd.read_csv("../data/job_train.csv").fillna("")

# Separate the target from the features; the telecommuting and
# has_questions columns are not used by the model.
y = data["fraudulent"]
X = data.drop(columns=['fraudulent', 'telecommuting', 'has_questions'])

# Fit the classifier (prints the hold-out score).
clf = my_model()
clf.fit(X, y)

# Report the elapsed time in minutes.
runtime = (time.time() - start) / 60.0
print("this is run time:", runtime)

# predictions = clf.predict(X)
# print(predictions)