In [1]:
import csv
import string
import pickle
import textstat
import numpy as np
import pandas as pd

In [2]:
from tqdm import tqdm
from langdetect import detect
from lingfeat import extractor #from lingfeatBASE.lingfeat import extractor
from spellchecker import SpellChecker

In [3]:
# Load the pickled query table and display it; the rendered frame shows a
# `query` text column and a `class` label column.
# NOTE(review): pd.read_pickle unpickles arbitrary objects -- only load files you trust.
# NOTE(review): this path is relative to 'Data/', whereas lingFeatDictExtract reads
# from '../Data/' -- confirm which working directory the notebook expects.
data = pd.read_pickle("Data/AllQueries4746.p") 
data

Unnamed: 0,query,class
0,US civil war causes,0
1,scooter brands,0
2,scooter brands reliable,0
3,scooter,0
4,scooter cheap,0
...,...,...
4741,House of dreams,1
4742,When did Desmond doss get married,1
4743,H,1
4744,find fact about dog,1


In [4]:
# Work on a copy so the columns added by feature extraction do not end up on `data`.
all_queries = data.copy()

In [10]:
def lingFeatExtract(queries):
    """Run LingFeat on every query and store the raw feature dicts as columns.

    Parameters
    ----------
    queries : pandas.DataFrame
        Must contain a ``query`` column. Any index is accepted (rows are
        processed positionally, not by the labels 0..n-1).

    Returns
    -------
    pandas.DataFrame
        The *same* frame object that was passed in (columns are added in
        place, as before), extended with:

        * ``extract``    - the LingFeat object built from ``str(query)``
        * ``preprocess`` - result of ``.preprocess()``
        * ``EnDF_``, ``EnGF_`` (discourse)
        * ``PhrF_``, ``TrSF_``, ``POSF_`` (syntactic)
        * ``VarF_``, ``TTRF_``, ``PsyF_``, ``WorF_`` (lexico-semantic)
        * ``ShaF_``, ``TraF_`` (shallow / traditional)

        Each feature cell holds whatever the corresponding LingFeat method
        returned.

    Notes
    -----
    * Previously the ``EnDF_`` column was filled with the output of
      ``preprocess()`` and ``EnDF_()`` was never called. The preprocess output
      now lives in its own ``preprocess`` column and ``EnDF_`` holds the
      result of ``EnDF_()``.
    * ``EnGF_`` and ``TTRF_`` fall back to fixed default dicts when LingFeat
      raises for a query; every other feature group propagates the error.
    """
    all_queries = queries  # same object on purpose: callers may rely on in-place columns

    # Defaults used when LingFeat cannot compute a feature group for a query.
    EnGF_Def = {'ra_SSToT_C':0,'ra_SOToT_C':0,'ra_SXToT_C':0,'ra_SNToT_C':0,'ra_OSToT_C':0,'ra_OOToT_C':0,'ra_OXToT_C':0,'ra_ONToT_C':0,'ra_XSToT_C':0,'ra_XOToT_C':0,'ra_XXToT_C':0,'ra_XNToT_C':0,'ra_NSToT_C':0,'ra_NOToT_C':0,'ra_NXToT_C':0,'ra_NNToT_C':0,'LoCohPA_S':0,'LoCohPW_S':0,'LoCohPU_S':0,'LoCoDPA_S':0,'LoCoDPW_S':0,'LoCoDPU_S':0}
    TTRF_def = {'SimpTTR_S':1, 'CorrTTR_S':1, 'BiLoTTR_S':0, 'UberTTR_S':0, 'MTLDTTR_S':0.72}

    def as_column(values):
        """Wrap per-row results in an object Series aligned to the frame's index."""
        # dtype=object keeps dicts / extractor objects as opaque cell values.
        return pd.Series(values, index=all_queries.index, dtype=object)

    def run_feature(method_name, fallback=None):
        """Call ``method_name()`` on every extractor object, one result per row."""
        results = []
        for feat in all_queries['extract']:
            method = getattr(feat, method_name)
            if fallback is None:
                results.append(method())
                continue
            try:
                results.append(method())
            except Exception:  # not a bare except: Ctrl-C must still interrupt the run
                # Fresh copy per row so rows never share one mutable dict.
                results.append(dict(fallback))
        return as_column(results)

    # Whole-column assignment replaces the chained `df[col][i] = ...` writes,
    # which are a silent no-op under pandas Copy-on-Write.
    all_queries['extract'] = as_column(
        [extractor.pass_text(str(text)) for text in all_queries['query']]
    )

    # (column / LingFeat method name, fallback dict or None), in output order.
    # preprocess() comes first because it prepares each extractor object
    # before the feature methods are called on it.
    feature_plan = [
        ('preprocess', None),
        # -- Discourse --
        ('EnDF_', None),       # entity density
        ('EnGF_', EnGF_Def),   # entity grid
        # -- Syntactic --
        ('PhrF_', None),       # phrasal
        ('TrSF_', None),       # tree structure
        ('POSF_', None),       # part of speech
        # -- Lexico-semantic --
        ('VarF_', None),       # variation ratio
        ('TTRF_', TTRF_def),   # type-token ratio
        ('PsyF_', None),       # psycholinguistic
        ('WorF_', None),       # word familiarity
        # -- Shallow / traditional --
        ('ShaF_', None),       # shallow
        ('TraF_', None),       # traditional formulas
    ]
    for feature_name, fallback in feature_plan:
        all_queries[feature_name] = run_feature(feature_name, fallback)

    return all_queries
    

In [None]:
# lingFeatExtract adds its feature columns to the frame it receives and returns that
# same frame, so `all_queries` is extended in place by this call.
all_queries = lingFeatExtract(all_queries)

In [None]:
#***** LINGFEAT (Disco, Synt, LexSem, ShalTrad) **********

def lingFeatDictExtract(queries, csv_path='../Data/castsventrecSQS_All_lingFeat_unstructured_new.csv'):
    """Expand the stringified LingFeat dicts stored in a CSV into one wide frame.

    Parameters
    ----------
    queries
        Unused. Kept only so the function has the same call signature as the
        other feature extractors; the features are read from ``csv_path``.
    csv_path : str, optional
        CSV file with one column per feature group (``preprocess``, ``EnDF_``,
        ``EnGF_``, ``PhrF_``, ``TrSF_``, ``POSF_``, ``TTRF_``, ``VarF_``,
        ``PsyF_``, ``WorF_``, ``ShaF_``, ``TraF_``), each cell holding the
        string form of a dict. Defaults to the path previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row, indexed 0..n-1, and one column per dict key; the
        feature groups are concatenated side by side in the order listed above.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal (raised by ``ast.literal_eval``).
    KeyError
        If one of the feature-group columns is missing from the CSV.
    """
    import ast  # local import: only this function needs it

    #-- queries and extracted lingFeat
    lingFeat_data = pd.read_csv(csv_path)

    def feat_extract(dataName, featName):
        """Turn column ``featName`` of dict-strings into a frame (keys -> columns)."""
        # ast.literal_eval only accepts Python literals, so file contents are
        # parsed rather than executed (the previous eval() ran them as code).
        # It reads single-quoted dict strings directly; no quote rewriting needed.
        records = [ast.literal_eval(cell) for cell in dataName[featName]]
        # Build the frame once from all records: avoids the quadratic
        # concat-in-a-loop and yields a clean 0..n-1 index instead of every
        # row being labelled 0.
        return pd.DataFrame(records)

    feature_groups = [
        'preprocess',
        # Discourse (Disco)
        'EnDF_', 'EnGF_',
        # Syntactic (Synta)
        'PhrF_', 'TrSF_', 'POSF_',
        # Lexico Semantic (LxSem)
        'TTRF_', 'VarF_', 'PsyF_', 'WorF_',
        # Shallow Traditional (ShTra)
        'ShaF_', 'TraF_',
    ]
    frames = [feat_extract(lingFeat_data, group) for group in feature_groups]

    allLingFeat = pd.concat(frames, axis=1)

    return allLingFeat

