In [1]:
import time
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import resample 
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords
from IPython.core.debugger import set_trace
import warnings

sys.path.insert(0, '..')
from assignment8.my_evaluation import my_evaluation

In [2]:
class my_model():
    def fit(self, X, y):
        # do not exceed 29 mins
        self.y_data_class = y
        X = self.clean_training_data(X)
        
        self.preprocessor = TfidfVectorizer(stop_words='english', norm='l2', use_idf=False, smooth_idf=False)
        XX = self.preprocessor.fit_transform(X["description"])
        
        clf = AdaBoostClassifier()
        abc_grid = {'n_estimators':[20,50,75,100,500],
                   'learning_rate':[.001,.01,.1]}
        self.search = RandomizedSearchCV(clf, abc_grid, random_state=0, n_jobs=-1, verbose=5)
        
        self.search.fit(XX, y)
        return

    def predict(self, X):
        # remember to apply the same preprocessing in fit() on test data before making predictions
        X = self.clean_training_data(X)
        XX = self.preprocessor.transform(X["description"])
        predictions = self.clf.predict(XX)
        return predictions
    
    def clean_training_data(self, data_frame):
        warnings.filterwarnings(action='ignore')
#         print(data_frame.columns)
#         set_trace()
        #fillna to location column
        data_frame['location'] = data_frame.location.fillna('none')

        #fillna to description column
        data_frame['description'] = data_frame.description.fillna('not specified')

        #fillna to requirements column
        data_frame['requirements'] = data_frame.description.fillna('not specified')
        
        #drop unnecassary columns
        data_frame.drop(['telecommuting','has_questions'],axis = 1, inplace = True)  
        
        #mapping fraudulent to T and F, where there is  0 and 1 respectively
        data_frame['has_company_logo'] = data_frame.has_company_logo.map({1 : 't', 0 : 'f'})
        
        #remove any unnecassary web tags in the data set
        data_frame['title'] = data_frame.title.str.replace(r'<[^>]*>', '')
        data_frame['description'] = data_frame.description.str.replace(r'<[^>]*>', '')
        data_frame['requirements'] = data_frame.requirements.str.replace(r'<[^>]*>', '')
        
        
        # removing the characters in data set that are not words and has white spaces 
        for column in data_frame.columns:
            data_frame[column] = data_frame[column].str.replace(r'\W', ' ').str.replace(r'\s$','')
            
        
        # mapping back the columns to original binary values
        #data_frame['has_company_logo'] = data_frame.has_company_logo.map({'t': 1, 'f':0})
        
        self.all_genism_stop_words = STOPWORDS
        
        text_columns = list(data_frame.columns.values)
        
        for columns in text_columns:
            self.remove_stopwords_from_data_train(data_frame,columns)
            
        # as 1 and 0 values in the fraudulent class is highly unbalanced
        # true = 0 and fake = 1
        # 0 : 1 == 8484 : 456
#         X_Class_1 = data_frame.loc[y_data_frame == 1]
#         X_Class_0 = data_frame.loc[y_data_frame == 0]

#         Class_0_count, Class_1_count = y_data_frame.value_counts()

#         Class_0_undersampling = X_Class_0.sample(Class_1_count)    
#         #print(Class_0_undersampling.shape)
#         #set_trace()

#         data_frame_undersample = pd.concat([Class_0_undersampling, X_Class_1], axis=0)
        
        
#         y_Class_1 = y_data_frame.loc[y_data_frame == 1]
#         y_Class_0 = y_data_frame.loc[y_data_frame == 0]
        
#         Class_0_count, Class_1_count = y_data_frame.value_counts()

#         Y_Class_0_undersampling = y_Class_0.sample(Class_1_count)    

#         y_data_frame_undersample = pd.concat([Y_Class_0_undersampling, y_Class_1], axis=0)
        
        
        

        return data_frame
    
    def remove_stopwords_from_data_train(self,data_frame, column_name):
        data_frame[column_name] = data_frame[column_name].apply(lambda x: " ".join([i for i in x.lower().split() if i not in self.all_genism_stop_words]))


In [8]:
def test(data):
    y = data["fraudulent"]
    X = data.drop(['fraudulent'], axis=1)
    split_point = int(0.8 * len(y))
    X_train = X.iloc[:split_point]
    X_test = X.iloc[split_point:]
    y_train = y.iloc[:split_point]
    y_test = y.iloc[split_point:]
    clf = my_model()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    eval = my_evaluation(predictions, y_test)
    f1 = eval.f1(target=1)
    return f1


if __name__ == "__main__":
    start = time.time()
    # Load data
    data = pd.read_csv("../data/job_train.csv")
    # Replace missing values with empty stringsF1 score: 0.699387
    data = data.fillna("")
    f1 = test(data)
    print("F1 score: %f" % f1)
    runtime = (time.time() - start) / 60.0
    print(runtime)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 16.2min finished


AttributeError: 'my_model' object has no attribute 'clf'