In [4]:
import time
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import resample 
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords
from IPython.core.debugger import set_trace
import warnings

sys.path.insert(0, '..')
from assignment8.my_evaluation import my_evaluation

In [5]:
class my_model():
    def fit(self, X, y):
        # do not exceed 29 mins
        self.y_data_class = y
        X = self.clean_training_data(X)
        
        X_fraud = X.loc[y == 1]
        y_fraud = y.loc[y == 1]

        X_real = X.loc[y == 0]
        y_real = y.loc[y == 0]

        X_real = resample(X_real, n_samples=len(X_fraud))
        y_real = resample(y_real, n_samples=len(y_real))

        X_balanced = pd.concat([X_real, X_fraud])
        Y_balanced = pd.concat([y_real, y_fraud])
        
        self.preprocessor = TfidfVectorizer(stop_words='english', norm='l2', use_idf=False, smooth_idf=False)
        XX_desc = self.preprocessor.fit_transform(X["description"])
        
        XX_req = self.preprocessor.fit_transform(X["requirements"])
        
        df1 = pd.DataFrame(XX_desc.toarray())
        df2 = pd.DataFrame(XX_req.toarray())
        
        self.clf = PassiveAggressiveClassifier(class_weight="balanced", random_state=10, C = 0.5)
        self.clf.fit(XX, y)
        return

    def vectorize(self, X, y):
        description_X = X.description
        X_train, X_test, y_train, y_test = train_test_split(description_X, y, test_size = 0.33, shuffle = True)
        count_vec = CountVectorizer().fit(X_train)
        
        X_desc_train = pd.DataFrame(count_vec.transform(X_train).todense(),
                                   columns = count_vec.get_feature_names())
        
        X_desc_test = pd.DataFrame(count_vec.transform(X_test).todense(),
                                   columns = count_vec.get_feature_names())
        
        
        requirement_X = X.requirements
        X_train, X_test, y_train, y_test = train_test_split(requirement_X, y, test_size = 0.33, shuffle = True)
        X_req_train = pd.DataFrame(count_vec.transform(X_train).todense(),
                                   columns = count_vec.get_feature_names())
        
        X_req_test = pd.DataFrame(count_vec.transform(X_test).todense(),
                                   columns = count_vec.get_feature_names())
        
                
        #concatenate all the vectorized data frames 
        training = pd.concat([X_desc_train, X_req_train], axis=1)
        testing = pd.concat([X_desc_test,X_req_test], axis=1)
        
        return training, testing
        
        
        
        
    def predict(self, X):
        # remember to apply the same preprocessing in fit() on test data before making predictions
        X = self.clean_training_data(X)
        XX = self.preprocessor.transform(X["description"])
        predictions = self.clf.predict(XX)
        return predictions
    
    def clean_training_data(self, data_frame):
        warnings.filterwarnings(action='ignore')
        #fillna to location column
        data_frame['location'] = data_frame.location.fillna('none')

        #fillna to description column
        data_frame['description'] = data_frame.description.fillna('not specified')

        #fillna to requirements column
        data_frame['requirements'] = data_frame.description.fillna('not specified')
        
        #drop unnecassary columns
        data_frame.drop(['telecommuting','has_questions'],axis = 1, inplace = True)  
        
        #mapping fraudulent to T and F, where there is  0 and 1 respectively
        data_frame['has_company_logo'] = data_frame.has_company_logo.map({1 : 't', 0 : 'f'})
        
        #remove any unnecassary web tags in the data set
        data_frame['title'] = data_frame.title.str.replace(r'<[^>]*>', '')
        data_frame['description'] = data_frame.description.str.replace(r'<[^>]*>', '')
        data_frame['requirements'] = data_frame.requirements.str.replace(r'<[^>]*>', '')
        
        
        # removing the characters in data set that are not words and has white spaces 
        for column in data_frame.columns:
            data_frame[column] = data_frame[column].str.replace(r'\W', ' ').str.replace(r'\s$','')
            
        
        # mapping back the columns to original binary values
        #data_frame['has_company_logo'] = data_frame.has_company_logo.map({'t': 1, 'f':0})
        
        self.all_genism_stop_words = STOPWORDS
        
        text_columns = list(data_frame.columns.values)
        
        for columns in text_columns:
            self.remove_stopwords_from_data_train(data_frame,columns)
        
        return data_frame
    
    def remove_stopwords_from_data_train(self,data_frame, column_name):
        data_frame[column_name] = data_frame[column_name].apply(lambda x: " ".join([i for i in x.lower().split() if i not in self.all_genism_stop_words]))


In [6]:
def test(data):
    y = data["fraudulent"]
    X = data.drop(['fraudulent'], axis=1)
    split_point = int(0.8 * len(y))
    X_train = X.iloc[:split_point]
    X_test = X.iloc[split_point:]
    y_train = y.iloc[:split_point]
    y_test = y.iloc[split_point:]
    clf = my_model()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    eval = my_evaluation(predictions, y_test)
    f1 = eval.f1(target=1)
    return f1


if __name__ == "__main__":
    start = time.time()
    # Load data
    data = pd.read_csv("../data/job_train.csv")
    # Replace missing values with empty stringsF1 score: 0.699387
    data = data.fillna("")
    f1 = test(data)
    print("F1 score: %f" % f1)
    runtime = (time.time() - start) / 60.0
    print(runtime)


TypeError: cannot concatenate object of type '<class 'scipy.sparse.csr.csr_matrix'>'; only Series and DataFrame objs are valid

In [None]:
X_fraud = X.loc[y == 1]
y_fraud = y.loc[y == 1]

X_real = X.loc[y == 0]
y_real = y.loc[y == 0]

X_real = resample(X_real, n_samples=len(X_fraud))
y_real = resample(y_real, n_samples=len(y_real))

X_balanced = pd.concat([X_real, X_fraud])
Y_balanced = pd.concat([y_real, y_fraud])