In [108]:
import os
import numpy as np
import pandas as pd
import random
import pyarrow.parquet as pq
from pyspark import SparkContext
import math
# feature selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
# SMOTE
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Part1. Feature Selection
``` 1. Input: randomly select 300 files from 3000 files to perform feature selection. ```

The output file is called 'TrueData_rand_select.csv'

In [32]:
def random_select(save_path,n_file_select,output):
    np.random.seed(0)
    file_len=0
    filenames=[]
    filenames=[filename for filename in os.listdir(save_path) if filename.endswith(".parquet")]
    selected_file=np.random.choice(filenames, n_file_select)
    print(f'randomly select {len(selected_file)} files out of {len(filenames)} files')
    
    first=True
    for filename in os.listdir(save_path):
        if filename in selected_file: 
            path=save_path+'/'+filename
            if first:
                tb=pq.read_table(path).to_pandas()
                first=False
            else:
                tb=tb.append(pq.read_table(path).to_pandas())  
    tb.to_csv(output)

random_select('/Users/pushin/Desktop/td_dataset',300,'TrueData_rand_select.csv')

```2. Filter & Wrapper Method```

The input file is called 'TrueData_rand_select.csv'

a. Filter - ChiSquare test
https://machinelearningmastery.com/feature-selection-with-categorical-data/

In [38]:
def filter_method(input_file,threshold):
    rand_df=pd.read_csv(input_file)
    X,y=rand_df.loc[:,'&PIZZA_180':],rand_df['desired_outcome']

    fs1 = SelectKBest(score_func=chi2, k='all')
    fs1.fit(X, y)
    feature_score=sorted(list(zip(X.head().columns,fs1.scores_)),key=lambda x: x[1],reverse=True)
    indep_feat=[k for k,v in feature_score if v>threshold]
    fea_n=len(indep_feat)
    print(fea_n)
    fs = SelectKBest(score_func=chi2, k=fea_n)
    fs.fit(X, y)
    X_fs = fs.transform(X)
    print(X_fs.shape)
    return X_fs,y,indep_feat
X_fs,y,indep_feat=filter_method('TrueData_rand_select.csv',60)

216
(92078, 216)


b. Wrapper - Forward Selection
https://www.kdnuggets.com/2018/06/step-forward-feature-selection-python.html

In [222]:
def Wrapper_method(X_fs,y,n_features,metric,indep_feat):
    # Build RF classifier to use in feature selection
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    # Build step forward feature selection
    sfs3 = sfs(clf,
               k_features=n_features,
               forward=True,
               floating=False,
               verbose=2,
               scoring=metric,
               cv=3)

    # Perform SFFS
    sfs3 = sfs3.fit(X_fs, y)
    
    feat_cols=list(sfs3.k_feature_idx_)
    selected_feat=list(np.array(indep_feat)[feat_cols])+['desired outcome']
    return selected_feat
    
selected_feat=Wrapper_method(X_fs,y,40,'recall',indep_feat)
print(selected_feat)

['APP201882', 'APP201971', 'APP203911', 'APP205410', 'APP205601', 'APP207449', 'APP207825', 'APP208412', 'BEST_MASSAGE_INTERNATIONAL_90', 'ATLANTA_CUSTOM_SHIRT_90', 'CAVE_CREEK_GOLF_COURSE_180', 'CORNER_BAKERY_CAFE_90', 'EASTMORELAND_GOLF_COURSE_90', 'DELAWARE_CANAL_STATE_PARK_30', 'CHARLOTTE_COUNTRY_CLUB_90', 'EARTHBOUND_TRADING_CO__90', 'DALLAS_NATIONAL_GOLF_CLUB_180', 'FRESHII_30', 'FOUR_WINDS_CARRIAGE_COMPANY_30', 'GEORGIA_CAPITOL_MUSEUM_90', 'GOODWILL_INDUSTRIES_90', 'HRD100031', 'HRD100101', 'JACKSON_HEWITT_TAX_SERVICE_90', 'MRG100044', 'PRD000006', 'PILOT_FLYING_J_180', 'SPEEDWAY_180', 'STATE_GA', 'STATE_VA', 'STATE_WA', 'STATE_OR', 'STATE_NE', 'STATE_SC', 'STATE_AR', 'STATE_NV', 'STATE_MS', 'SUNOCO_180', 'TARGET_90', 'TD_BANK_30']


In [None]:
def feat_data_extract(save_path):
    first=1
    files=os.listdir(save_path)
    for filename in files:

        if filename.endswith(".parquet"): 
            path=save_path+'/'+filename
            if first==1:
                tb=pq.read_table(path)
                tb=tb.select([c for c in tb.to_pandas().columns if c in selected_feat]).to_pandas()
                first=0
            else:
                new=pq.read_table(path).select([c for c in tb.columns if c in selected_feat]).to_pandas()
                tb=tb.append(new)
    return tb
feat_data=feat_data_extract('/Users/pushin/Desktop/td_dataset')
feat_data.to_csv('data_recall40.csv')

# Part2/3. Model Building/Evaluation
```1. Input: 3000 files with selected features to perform model building.```

The input file is called 'data_recall40.csv'

In [209]:
df=pd.read_csv('data_recall40.csv').iloc[:, 1:]
l = list(df.columns)
l.remove('desired_outcome')
print(len(l))
X=df.loc[:,l]
y=df.loc[:,'desired_outcome']

40


```2. Create the model list and evaluation function```

In [210]:
# Create model list
model_ls=[LogisticRegression(solver='lbfgs'),DecisionTreeClassifier()]
for depth in [2,3,4]:
    for estimator in [100,200,500,1000]:
        RF = RandomForestClassifier(n_estimators=estimator, random_state=42, max_depth= depth)
        model_ls.append(RF)

def model_evaluation(X,y,n_cv,model_ls):
    precision_ls,recall_ls,accuracy_ls=[],[],[]
    for m in model_ls:
        scores = cross_validate(m, X, y, scoring=['recall','precision','accuracy'],cv=n_cv)
        recall_ls.append(sum(scores['test_recall'])/n_cv)
        precision_ls.append(sum(scores['test_precision'])/n_cv)
        accuracy_ls.append(sum(scores['test_accuracy'])/n_cv)
        
    df = pd.DataFrame()
    df['model'],df['recall'],df['precision'],df['accuracy']= model_ls,recall_ls,precision_ls,accuracy_ls
    return df

- Data without preprocessing

In [211]:
df_without_preprocessing = model_evaluation(X,y,3,model_ls)
df_without_preprocessing

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Unnamed: 0,model,recall,precision,accuracy
0,LogisticRegression(),0.046888,0.571172,0.942871
1,DecisionTreeClassifier(),0.05156,0.591388,0.943114
2,"RandomForestClassifier(max_depth=2, random_sta...",0.0,0.0,0.942193
3,"RandomForestClassifier(max_depth=2, n_estimato...",0.0,0.0,0.942193
4,"RandomForestClassifier(max_depth=2, n_estimato...",0.0,0.0,0.942193
5,"RandomForestClassifier(max_depth=2, n_estimato...",0.0,0.0,0.942193
6,"RandomForestClassifier(max_depth=3, random_sta...",3.6e-05,0.666667,0.942195
7,"RandomForestClassifier(max_depth=3, n_estimato...",3.6e-05,0.666667,0.942195
8,"RandomForestClassifier(max_depth=3, n_estimato...",9.1e-05,1.0,0.942198
9,"RandomForestClassifier(max_depth=3, n_estimato...",0.000127,1.0,0.9422


In [218]:
df_without_preprocessing.to_csv('df_without_preprocessing.csv')

- Random Undersampling

In [212]:
def partition (list_in, n):
    random.shuffle(list_in)
    return [list_in[i::n] for i in range(n)]

X=df.loc[:,l]
y=df.loc[:,'desired_outcome']
ind=list(X.index)

first_batch=partition (ind, 3)[0]
second_batch=partition (ind, 3)[1]
third_batch=partition (ind, 3)[2]

In [213]:
X_train_ls=[X.iloc[first_batch+second_batch,:],X.iloc[first_batch+third_batch,:],X.iloc[second_batch+third_batch,:]]
X_test_ls=[X.iloc[third_batch,:],X.iloc[second_batch,:],X.iloc[first_batch,:]]
y_df=pd.DataFrame(y)
y_train_ls=[y_df.iloc[first_batch+second_batch,:]['desired_outcome'],y_df.iloc[first_batch+third_batch,:]['desired_outcome'],y_df.iloc[second_batch+third_batch,:]['desired_outcome']]
y_test_ls=[y_df.iloc[third_batch,:]['desired_outcome'],y_df.iloc[second_batch,:]['desired_outcome'],y_df.iloc[first_batch,:]['desired_outcome']]

split_dic={'X_train_ls':X_train_ls,'X_test_ls':X_test_ls,'y_train_ls':y_train_ls,'y_test_ls':y_test_ls}

In [214]:
def Undersampling(split_dic,model_ls):
    precision_ls,recall_ls,accuracy_ls=[],[],[]
    for model in model_ls:
        p_ls,r_ls,a_ls=[],[],[]
        for i in range(3):
            X_train, X_test, y_train, y_test = split_dic['X_train_ls'][i],split_dic['X_test_ls'][i],split_dic['y_train_ls'][i],split_dic['y_test_ls'][i]
            # selected= 1 index + selected 0 index
            ind_ls_zeros=list(y_train[y_train==0].index)
            num_ones=y_train.sum()
            rand_ind_zeros=random.sample(ind_ls_zeros,num_ones) # y_train.sum() = 49801
            selected_sample=list(y_train[y_train==1].index)+rand_ind_zeros
            balance_X_train=X_train.loc[selected_sample,:]
            balance_y_train=pd.DataFrame(y_train).loc[selected_sample,:]
            model.fit(balance_X_train, balance_y_train)
            y_pred = model.predict(X_test)
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            precision=round(tp/(tp+fp),2)
            recall=round(tp/(fn+tp),2)
            accuracy=round((tp+tn)/(tn+fp+fn+tp),2)
            p_ls.append(precision)
            r_ls.append(recall)
            a_ls.append(accuracy)

        precision_ls.append(sum(p_ls)/3)
        recall_ls.append(sum(r_ls)/3)
        accuracy_ls.append(sum(a_ls)/3)
    df= pd.DataFrame()
    df['model'],df['recall'],df['precision'],df['accuracy']= model_ls,recall_ls,precision_ls,accuracy_ls
    return df

In [215]:
df_Undersampling = Undersampling(split_dic,model_ls)
df_Undersampling

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  


Unnamed: 0,model,recall,precision,accuracy
0,LogisticRegression(),0.433333,0.21,0.873333
1,DecisionTreeClassifier(),0.43,0.213333,0.873333
2,"(DecisionTreeClassifier(max_depth=2, max_featu...",0.38,0.22,0.88
3,"(DecisionTreeClassifier(max_depth=2, max_featu...",0.383333,0.216667,0.883333
4,"(DecisionTreeClassifier(max_depth=2, max_featu...",0.38,0.22,0.883333
5,"(DecisionTreeClassifier(max_depth=2, max_featu...",0.376667,0.22,0.883333
6,"(DecisionTreeClassifier(max_depth=3, max_featu...",0.38,0.22,0.88
7,"(DecisionTreeClassifier(max_depth=3, max_featu...",0.386667,0.213333,0.88
8,"(DecisionTreeClassifier(max_depth=3, max_featu...",0.386667,0.22,0.883333
9,"(DecisionTreeClassifier(max_depth=3, max_featu...",0.386667,0.216667,0.88


In [219]:
df_Undersampling.to_csv('df_Undersampling.csv')

- SMOTE: Oversample then undersample

In [216]:
def SMOTE_Over_Under(split_dic,model_ls):
    precision_ls,recall_ls,accuracy_ls=[],[],[]
    for model in model_ls:
        p_ls,r_ls,a_ls=[],[],[]
        for i in range(3):
            X_train, X_test, y_train, y_test = split_dic['X_train_ls'][i],split_dic['X_test_ls'][i],split_dic['y_train_ls'][i],split_dic['y_test_ls'][i]
            over = SMOTE(sampling_strategy=0.1)
            under = RandomUnderSampler(sampling_strategy=0.5)
            steps = [('o', over), ('u', under)]
            pipeline = Pipeline(steps=steps)
            # transform the dataset
            X_smote2, y_smote2 = pipeline.fit_resample(X_train,y_train)

            model.fit(X_smote2, y_smote2)
            y_pred = model.predict(X_test)
            # print(confusion_matrix(y_test, y_pred))
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            precision=round(tp/(tp+fp),2)
            recall=round(tp/(fn+tp),2)
            accuracy=round((tp+tn)/(tn+fp+fn+tp),2)

            p_ls.append(precision)
            r_ls.append(recall)
            a_ls.append(accuracy)

        precision_ls.append(sum(p_ls)/3)
        recall_ls.append(sum(r_ls)/3)
        accuracy_ls.append(sum(a_ls)/3)
    df= pd.DataFrame()
    df['model'],df['recall'],df['precision'],df['accuracy']= model_ls,recall_ls,precision_ls,accuracy_ls
    return df

In [None]:
df_SMOTE_over_under=SMOTE_Over_Under(split_dic,model_ls)
df_SMOTE_over_under