In [76]:
import pandas as pd
import numpy as np
from typing import Dict, Tuple
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
import sys
import os
import nltk
import csv
import string
import re
from dateutil.parser import parse
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simranbawkar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/simranbawkar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
class RuleAugmentedRFE(RandomForestClassifier):
    """Classifier that answers with hand-written rules first and a base model otherwise.

    ``rules`` maps a column name to a list of ``(operator, value, outcome)``
    tuples. For every row where ``column <operator> value`` holds, the
    prediction is ``outcome``; rows that no rule covers are passed to
    ``base_model``. Supported operators are ``"="``, ``"<"`` and ``"<="``.

    Note: ``RandomForestClassifier.__init__`` is intentionally not called, so
    the forest hyper-parameters live on ``base_model`` only, not on this
    wrapper.
    """

    # Comparison implementations for each supported rule operator.
    _RULE_OPERATORS = {
        "=": lambda column, value: column == value,
        "<": lambda column, value: column < value,
        "<=": lambda column, value: column <= value,
    }

    def __init__(self, base_model: RandomForestClassifier, rules: Dict, **base_params):
        """Store the wrapped model and rules, forwarding ``base_params`` to the model.

        Args:
            base_model: Estimator used for rows that no rule decides.
            rules: ``{column: [(operator, value, outcome), ...]}``.
            **base_params: Passed to ``base_model.set_params``.
        """
        self.base_model = base_model
        self.rules = rules
        self.base_model.set_params(**base_params)

    def __repr__(self):
        return "Rule Augmented Estimator:\n\n\t Base Model: {}\n\t Rules: {}".format(self.base_model, self.rules)

    def __str__(self):
        # Previously returned the bound method itself, which made str()/print() raise TypeError.
        return repr(self)

    def _rule_mask(self, column: pd.Series, rule):
        """Return the boolean mask of rows matched by ``rule``, or None if the operator is unknown."""
        compare = self._RULE_OPERATORS.get(rule[0])
        if compare is None:
            return None
        return compare(column, rule[1])

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs):
        """Fit the base model on the rows that are not already decided by a rule.

        Args:
            X: Feature frame.
            y: Labels, indexed like ``X``.
            **kwargs: Forwarded to ``base_model.fit``.
        Returns:
            self, so calls can be chained.
        """
        train_x, train_y = self.getBaseModelData(X, y)
        self.base_model.fit(train_x, train_y, **kwargs)
        print('model fit')
        return self

    def getBaseModelData(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """Drop every row that a rule would decide and return the remaining training data.

        All operators that ``predict`` applies are honoured here, so the base
        model is trained on exactly the rows it will be asked to predict.
        Rules for columns missing from ``X`` are skipped; rules with an unknown
        operator are reported and ignored.

        Returns:
            ``(train_x, train_y)``, both re-indexed from 0.
        """
        keep = np.ones(len(X), dtype=bool)
        for category, rules in self.rules.items():
            if category not in X.columns:
                continue
            for rule in rules:
                matched = self._rule_mask(X[category], rule)
                if matched is None:
                    print("Invalid rule detected {}".format(rule))
                    continue
                keep &= ~np.asarray(matched, dtype=bool)
        train_x = X.loc[keep]
        # Labels are selected by index label so X and y stay paired.
        train_y = y.loc[train_x.index]
        # reset_index returns a new object; the result has to be used.
        return train_x.reset_index(drop=True), train_y.reset_index(drop=True)

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict with the rules where they apply and with the base model elsewhere.

        When several rules match the same row, the rule applied last wins.

        Returns:
            One prediction per row of ``X``, in row order.
        """
        p_X = X.copy()
        p_X['prediction'] = np.nan
        for category, rules in self.rules.items():
            if category not in X.columns:
                continue
            for rule in rules:
                matched = self._rule_mask(p_X[category], rule)
                if matched is not None:
                    p_X.loc[matched, 'prediction'] = rule[2]
        undecided = p_X['prediction'].isna()
        if undecided.any():
            base_X = p_X.loc[undecided].drop(columns='prediction')
            p_X.loc[undecided, 'prediction'] = self.base_model.predict(base_X)
        # Always return, including when the rules decided every row.
        return p_X['prediction'].values

    def get_params(self, deep: bool = True) -> Dict:
        """Return the model's and base model's parameters.
        Args:
            deep: Whether to recursively return the base model's parameters.
        Returns
            Dict: The model's parameters.
        """
        # Only attributes that __init__ actually sets are reported.
        params = {'base_model': self.base_model,
                  'rules': self.rules
                 }

        params.update(self.base_model.get_params(deep=deep))
        return params

    def set_params(self, **params):
        """Sets parameters for the model and base model.
        Args:
            **params: Optional keyword arguments. ``base_model`` and ``rules``
                are set on this object; everything else goes to the base model.
        Returns:
            self, so calls can be chained.
        """
        if 'base_model' in params:
            self.base_model = params.pop('base_model')

        if 'rules' in params:
            self.rules = params.pop('rules')

        self.base_model.set_params(**params)
        return self
        
        
            

In [132]:
class Features:
    """Builds per-file feature rows from the first line of delimited text files.

    Rows are accumulated in ``self.feature_list`` by ``featureList`` and turned
    into a DataFrame by ``featureVector``. ``constructFeatureFile`` and
    ``constructFeatureVector`` start from an empty list on every call.
    """

    # Column order of each row appended by featureList.
    FEATURE_COLUMNS = ['num_numerics', 'num_null', 'num_alphs', 'bool_specialChar', 'bool_space']

    def __init__(self):
        self.feature_list = []

    def isNumeric(self, char):
        """Return True if the whole string is a signed integer/decimal or consists only of digits."""
        # fullmatch anchors both alternatives; values such as "1.5abc" or "1.2.3" are not numeric.
        if re.fullmatch(r"[-+]?(?:\d*\.\d+|\d+)", char):
            return True
        # Covers digit characters that \d-based matching above did not accept; "" is not numeric.
        return char.isdigit()

    def isOnlyAlpha(self, char):
        """Return True if the whole string consists of ASCII letters."""
        return re.fullmatch(r"[A-Za-z]+", char) is not None

    def isBlank(self, each_element):
        """Return True for None, empty, whitespace-only, 'UNKNOWN' or 'NULL' values."""
        if not each_element:
            return True
        return each_element.isspace() or each_element in ('UNKNOWN', 'NULL')

    def specialCharacters(self, char):
        """Return True if the string contains at least one word character (``\\w``).

        NOTE(review): despite the name, this reports the presence of word
        characters, not special characters. The behaviour is kept as-is because
        the ``bool_specialChar`` feature is built from it.
        """
        return re.search(r'\w', char) is not None

    def isDate(self, string, fuzzy=False):
        """
        Return whether the string can be interpreted as a date.

        :param string: str, string to check for date
        :param fuzzy: bool, ignore unknown tokens in string if True
        """
        try:
            parse(string, fuzzy=fuzzy)
            return True

        # dateutil raises OverflowError when the parsed date does not fit the platform's integer range.
        except (ValueError, OverflowError):
            return False

    def noSpecialCharacters(self, char):
        """Return the number of characters that are neither letters nor digits.

        Each non-word character and each underscore counts exactly once
        (hyphens used to be counted twice and underscore runs only once).
        """
        return len(re.findall(r'\W|_', char))

    def isSpace(self, char):
        """Return True if the string contains a space character."""
        return " " in char

    def tokenize(self, string, separator=',', quote='"'):
        """
        Split a comma separated string into a List of strings.

        Separator characters inside the quotes are ignored. The quote
        characters themselves are removed from every returned chunk.

        :param string: A string to be split into chunks
        :param separator: A separator character
        :param quote: A character to define beginning and end of the quoted string
        :return: A list of strings, one element for every chunk
        """
        chunks = []
        chunk = ''
        in_quotes = False

        for character in string:
            if character == separator and not in_quotes:
                chunks.append(chunk.replace(quote, ''))
                chunk = ''
            else:
                chunk += character
                if character == quote:
                    in_quotes = not in_quotes

        chunks.append(chunk.replace(quote, ''))
        return chunks

    def _firstLine(self, file_name):
        """Return the first line of the file without its line terminator ('' for an empty file)."""
        # Only one line is read and the handle is closed even on error.
        with open(file_name, 'r') as handle:
            return handle.readline().rstrip('\r\n')

    def featureList(self, file_name):
        """Append one feature row for the first line of ``file_name`` to ``self.feature_list``.

        ``num_numerics``, ``num_null`` and ``num_alphs`` are 1 if any element
        of the line is numeric, blank or non-numeric text respectively.
        """
        num_numerics = 0
        num_alphs = 0
        num_null = 0
        bool_specialChar = 0
        bool_space = False

        for each_element in self.tokenize(self._firstLine(file_name)):
            if self.isBlank(each_element):
                num_null = 1
            elif not self.isNumeric(each_element):
                num_alphs = 1
            else:
                num_numerics = 1
            # NOTE(review): both flags are reassigned on every iteration, so they
            # describe only the last element of the line. Kept as-is because the
            # trained model depends on these values; confirm whether "any" was meant.
            bool_specialChar = 1 if self.specialCharacters(each_element) else 0
            bool_space = self.isSpace(each_element)
        self.feature_list.append([num_numerics, num_null, num_alphs, bool_specialChar, bool_space])

    def featureVector(self) -> pd.DataFrame:
        """Return the accumulated feature rows as a DataFrame with FEATURE_COLUMNS as columns."""
        return pd.DataFrame(self.feature_list, columns=self.FEATURE_COLUMNS)

    def constructFeatureFile(self, filename):
        """Return a one-row feature DataFrame for a single file."""
        # Start clean so rows from earlier calls on this instance are not returned as well.
        self.feature_list = []
        self.featureList(filename)
        return self.featureVector()

    def returnHeader(self, filename, pred):
        """Return the tokenized first line of ``filename`` if ``pred == 1``, else an empty list."""
        if pred == 1:
            return self.tokenize(self._firstLine(filename))
        return []

    def constructFeatureVector(self, csvfilename, data_dir='data/'):
        """Build the labelled feature table for every file listed in a CSV.

        :param csvfilename: CSV with a ``file_name`` and a ``has_header`` column
        :param data_dir: directory that the listed file names are relative to
        :return: feature DataFrame with an added ``has_header`` column
                 (1 where the CSV says 'yes', otherwise 0)
        """
        df = pd.read_csv(csvfilename)
        # Start clean; leftover rows would not line up with header_flags below.
        self.feature_list = []
        header_flags = []

        for file_name, header in zip(df['file_name'], df['has_header']):
            self.featureList(os.path.join(data_dir, file_name))
            header_flags.append(1 if header == 'yes' else 0)
        df_feature = self.featureVector()
        df_feature['has_header'] = header_flags
        return df_feature
        
 

In [62]:

# Build the labelled feature table from the files listed in header_information.csv.
data = Features().constructFeatureVector('header_information.csv')

print(data)

     num_numerics  num_null  num_alphs  bool_specialChar  bool_space  \
0               0         0          1                 1       False   
1               0         0          1                 1       False   
2               0         0          1                 1       False   
3               0         0          1                 1       False   
4               1         0          1                 1       False   
..            ...       ...        ...               ...         ...   
395             0         0          1                 1       False   
396             0         0          1                 1       False   
397             0         0          1                 1       False   
398             1         1          1                 0       False   
399             1         1          1                 1        True   

     has_header  
0             1  
1             1  
2             1  
3             1  
4             0  
..          ...  
395      

In [63]:
# Separate the feature columns (X) from the has_header label column (y).
X = data.drop(columns = ['has_header'])
y = data['has_header']
print(X)

     num_numerics  num_null  num_alphs  bool_specialChar  bool_space
0               0         0          1                 1       False
1               0         0          1                 1       False
2               0         0          1                 1       False
3               0         0          1                 1       False
4               1         0          1                 1       False
..            ...       ...        ...               ...         ...
395             0         0          1                 1       False
396             0         0          1                 1       False
397             0         0          1                 1       False
398             1         1          1                 0       False
399             1         1          1                 1        True

[400 rows x 5 columns]


In [64]:
# Show the has_header labels.
print(y)

0      1
1      1
2      1
3      1
4      0
      ..
395    1
396    1
397    1
398    0
399    0
Name: has_header, Length: 400, dtype: int64


In [65]:
# Hold out 20% of the rows for evaluation. The split is seeded so the metrics
# reported further down can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [67]:
# Show the training features; the index keeps the row labels from X.
print(X_train)

     num_numerics  num_null  num_alphs  bool_specialChar  bool_space
261             0         0          1                 1       False
118             0         0          1                 1       False
48              0         0          1                 1       False
156             0         1          1                 0       False
211             0         1          0                 0       False
..            ...       ...        ...               ...         ...
353             0         0          1                 1       False
186             1         1          1                 1        True
111             0         0          1                 1       False
332             0         0          1                 1       False
396             0         0          1                 1       False

[320 rows x 5 columns]


In [49]:
# Check that the training features are still a DataFrame.
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so that repeated runs train the same forest.
model = RandomForestClassifier(n_estimators=50,
                               bootstrap = True,
                               max_features = 'sqrt',
                               random_state = 42)
# Rule format: {column: [(operator, value, outcome), ...]}.
# NOTE(review): the key " num_null" has a leading space, so it does not match the
# "num_null" feature column and this rule is never applied. Confirm the intended
# rule before fixing the key, since activating it changes the predictions.
rules = {" num_null": [
                            ("=", 0, 0)
                         ]}
hybrid_model = RuleAugmentedRFE(model, rules)
hybrid_model.fit(X_train, y_train)
predictions = hybrid_model.predict(X_train)

In [None]:
rf_predictions = hybrid_model.predict(X_test)
# predict() returns one predicted label per test row (not class probabilities).

print(rf_predictions)

In [None]:
# True labels of the test rows as a plain array, for comparison with rf_predictions.
print(np.array(y_test))

In [None]:
# Interactive inspection of the model's API; not needed for the analysis itself.
help(hybrid_model)

In [98]:
# NOTE: roc_auc_score is imported here but not used in this cell.
from sklearn.metrics import roc_auc_score

# RuleAugmentedRFE does not define score() itself, so this is the inherited method.
hybrid_model.score(X_test, y_test)

0.975

In [106]:
# Predictions and labels are 0/1 class labels, so each absolute error is either
# 0 (correct) or 1 (wrong) and the mean is the fraction of misclassified rows.
errors = abs(rf_predictions - y_test)
print('Metrics for Random Forest Trained on Original Data')
# No unit is printed: the value is a plain fraction, not a temperature.
print('Average absolute error:', round(np.mean(errors), 2))
print(y_test)
# Regression-style error metrics computed on the class labels (y_true first, y_pred second).
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_predictions))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rf_predictions))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_predictions)))

Metrics for Random Forest Trained on Original Data
Average absolute error: 0.03 degrees.
169    1
185    0
164    0
250    1
83     1
      ..
221    1
356    1
154    1
66     1
94     1
Name: has_header, Length: 80, dtype: int64
Mean Absolute Error: 0.025
Mean Squared Error: 0.025
Root Mean Squared Error: 0.15811388300841897


In [107]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Classification metrics on the held-out rows: confusion matrix,
# per-class report and overall accuracy.
print(confusion_matrix(y_test,rf_predictions))
print(classification_report(y_test,rf_predictions))
print(accuracy_score(y_test, rf_predictions))

[[19  2]
 [ 0 59]]
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        21
           1       0.97      1.00      0.98        59

    accuracy                           0.97        80
   macro avg       0.98      0.95      0.97        80
weighted avg       0.98      0.97      0.97        80

0.975


In [138]:
def extract_header(file_name):
    """Predict whether a file has a header row and return that row if it does.

    Uses the notebook-level ``hybrid_model``, which must already be fitted.

    Args:
        file_name: Path of the delimited text file to inspect.
    Returns:
        ``{"has_header": "yes" | "no", "header": [column names] or []}``.
    """
    extractor = Features()
    features = extractor.constructFeatureFile(file_name)
    # predict() returns one value per feature row; take that value as a scalar
    # instead of comparing and branching on the whole array.
    prediction = hybrid_model.predict(features)[0]
    has_header = bool(prediction == 1)
    return {
        "has_header": "yes" if has_header else "no",
        "header": extractor.returnHeader(file_name, 1 if has_header else 0),
    }
    

In [140]:
# Example: run header detection on a single file from the data directory.
extract_header('data/058c1376-87a3-4ae3-8b1a-2d2adfb21a33.txt')



{'has_header': 'no', 'header': []}