In [1]:
import re
import sys
import csv
import nltk
import string
import pickle
import random
import warnings
import numpy as np
import pandas as pd

In [2]:
from sklearn import set_config
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import SyllableTokenizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
stopwords = list(stopwords.words('english'))
rng = 52023

In [4]:
set_config(transform_output='pandas', display='diagram')

# Data - Search Queries

In [5]:
Queries = pd.read_pickle('../data/AllQueries4746.p')
Queries

Unnamed: 0,query,class
0,US civil war causes,0
1,scooter brands,0
2,scooter brands reliable,0
3,scooter,0
4,scooter cheap,0
...,...,...
4741,House of dreams,1
4742,When did Desmond doss get married,1
4743,H,1
4744,find fact about dog,1


# Query split 90/10

In [6]:
X, y = Queries.drop('class', axis=1), Queries['class']
X_expSet, X_pipSet, y_expSet, y_pipSet = train_test_split(X, y, 
                                                    test_size=0.1, stratify=y,
                                                   random_state=rng)

### 90% - Pipeline train set

In [7]:
# Add the class column back to experiment set
X_expSet['class'] = y_expSet
expSet = X_expSet
expSet.shape

(4271, 2)

### 10% - Pipeline test set

In [8]:
# Add the class column back to pipeline test test
X_pipSet['class'] = y_pipSet
pipSet = X_pipSet
pipSet.shape

(475, 2)

# PART A

In [9]:
queries = expSet.copy()

## i. Feature Extraction 

Import a feature extraction file

In [10]:
import featExtract as fe 

### LingFeat

In [11]:
# - - - Computed separately - - - - 

### Spelling

In [12]:
Spelling_pip = Pipeline(steps=[
    ('Spelling', FunctionTransformer(fe.Spelling)),
    
])
Spelling_pip

### Punctuation and Casing

In [13]:
Punct_Casing_pip = Pipeline(steps=[
    ('Punct_Casing', FunctionTransformer(fe.Punct_Casing)),
    
])
Punct_Casing_pip

### Concreteness

In [14]:
absConcFeat_pip = Pipeline(steps=[
    ('absConcFeat', FunctionTransformer(fe.absConcFeat)),
    
])
absConcFeat_pip

### Feature Extraction Pipeline

We add all feature extraction functions together

In [18]:
feature_extraction= FeatureUnion([
    ('absConc', absConcFeat_pip),
    ('Punct_Casing', Punct_Casing_pip),
    ('Spelling', Spelling_pip)
])

feature_extraction

In [19]:
features = feature_extraction.transform(queries)
features.head()

100%|██████████| 4271/4271 [00:04<00:00, 959.21it/s]
100%|██████████| 4271/4271 [00:00<00:00, 579962.20it/s]


Unnamed: 0,query,class,ratioAbs,ratioConc,query.1,class.1,punct,casing,query.2,class.2,kidsError,misspelledCol,oneOffError
3101,SUNY college population,0,0.0,0.666667,SUNY college population,0,0,1,SUNY college population,0,0,1,1
4524,Hfgmhvhjcfhjc facts for kids,1,0.25,0.25,Hfgmhvhjcfhjc facts for kids,1,0,1,Hfgmhvhjcfhjc facts for kids,1,0,1,-1
4441,robots on star wars,1,0.0,0.5,robots on star wars,1,0,0,robots on star wars,1,0,0,0
186,venture capital,0,0.5,0.5,venture capital,0,0,0,venture capital,0,0,0,0
3552,Times Publishing,0,0.0,0.0,Times Publishing,0,0,1,Times Publishing,0,0,0,0


## ii. Pruning

In the following block of code we do:

    - Drop the duplite
    - Substitute NaN with 0
    - Drop Disco Feature (from lingFeat)
    - Drop constant variables


In [48]:
def prune(features):
    
    # 1. Drop duplicated
#     duplicated_columns=features.columns.duplicated()
#     features.loc[:, ~duplicated_columns]
    
    features = features.loc[:,~features.columns.duplicated()]

    
#     # 2. drop Disco features
    discoFeat = ['to_EntiM_C',
            'as_EntiM_C',
            'at_EntiM_C',
            'to_UEnti_C',
            'as_UEnti_C',
            'at_UEnti_C',
            'ra_SSToT_C',
            'ra_SOToT_C',
            'ra_SXToT_C',
            'ra_SNToT_C',
            'ra_OSToT_C',
            'ra_OOToT_C',
            'ra_OXToT_C',
            'ra_ONToT_C',
            'ra_XSToT_C',
            'ra_XOToT_C',
            'ra_XXToT_C',
            'ra_XNToT_C',
            'ra_NSToT_C',
            'ra_NOToT_C',
            'ra_NXToT_C',
            'ra_NNToT_C',
            'LoCohPA_S',
            'LoCohPW_S',
            'LoCohPU_S',
            'LoCoDPA_S',
            'LoCoDPW_S',
            'LoCoDPU_S']
    if all(col in df.columns for col in discoFeat):
        features = features.drop(discoFeat, axis=1, inplace=True)
    
#     # 3. Substitute NaN with 0
    features = features.fillna(0)
    
#     # 4. Drop constant variables 
#     constant_columns = []
#     for column in features.columns:
#         if features[column].nunique() == 1:
#             constant_columns.append(column)

#     pd.DataFrame(features.drop(columns=constant_columns))
    
    return features


In [49]:
prune(features)

Unnamed: 0,query,class,ratioAbs,ratioConc,punct,casing,kidsError,misspelledCol,oneOffError
3101,SUNY college population,0,0.000000,0.666667,0,1,0,1,1
4524,Hfgmhvhjcfhjc facts for kids,1,0.250000,0.250000,0,1,0,1,-1
4441,robots on star wars,1,0.000000,0.500000,0,0,0,0,0
186,venture capital,0,0.500000,0.500000,0,0,0,0,0
3552,Times Publishing,0,0.000000,0.000000,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
4581,test again,1,0.500000,0.500000,0,0,0,0,0
3647,cutterbee,0,0.000000,0.000000,0,0,0,1,-1
4229,fun facts about c3po,1,0.500000,0.000000,0,0,0,1,1
2388,france 1998 stock market,0,0.000000,0.500000,0,0,0,0,0


In [36]:
def drop_column_if_exists(df, column):
    if column in df.columns:
        df.drop(column, axis=1, inplace=True)

# Example usage
# Create a sample DataFrame
data = {
    'Name': ['John', 'Jane', 'Michael'],
    'Age': [25, 30, 40],
    'City': ['New York', 'London', 'Tokyo']
}
df = pd.DataFrame(data)

# Specify the column to check and drop
column_to_check = 'Age','City'

# Drop the column if it exists
drop_column_if_exists(df, column_to_check)

# Print the resulting DataFrame
print(df)

      Name  Age      City
0     John   25  New York
1     Jane   30    London
2  Michael   40     Tokyo


In [38]:
dt = {
    'Name': ['John', 'Jane', 'Michael'],
    'Age': [25, 30, 40],
    'City': ['New York', 'London', 'Tokyo']
}
df = pd.DataFrame(dt)

In [47]:
a = ['Age','City']
if all(item in df.columns for item in a):
    print('yes')

yes


In [44]:
df.columns

Index(['Name', 'Age', 'City'], dtype='object')

In [None]:
if set(['A','C']).issubset(df.columns):
   df['sum'] = df['A'] + df['C'] 

In [None]:
# ----- NOT COMPLETED ---------------------------

def drop_duplicate(query):
    
    duplicated_columns=query.columns.duplicated()
    
    new_df=query.loc[:, ~duplicated_columns]
    
    return new_df