The code in this notebook combines all the extracted feature sets to allow for experimentation.

# Load Libraries

Import libraries used in this notebook.

In [9]:
import pickle
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import mutual_info_classif
from matplotlib import pyplot as plt

# Load Data Sets And Preprocess

The following block of code loads the SQS data set (queries with their class labels) as the first step towards preprocessing it for our experiments.

In [10]:
# shall be importing different datasets
# Load the SQS data set. The context manager closes the file handle as soon as
# loading finishes instead of leaving it open for the garbage collector.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with open("../Data/DataSets/SQS/SQS.p", "rb") as sqs_file:
    SQS = pickle.load(sqs_file)
# SQS = SQS.drop_duplicates(subset=['query'])
SQS.shape

(1505, 3)

In [11]:
SQS

Unnamed: 0,query,class,sID
0,becoming a fireman,0,3199
1,hotel in Pocono Mountains,0,2515
2,wedding traditions buddhism,0,2823
3,diversification in hiring,0,3033
4,traiditional swahili recipes,0,3145
...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,1,3256
1501,What is a fox's favorite kind of food?,1,2859
1502,"Show me the movie called ""The Martian""",1,3208
1503,What is the biggest rock found on Mars?,1,2676


# Load Extracted Features 

In the following blocks of code we first load all feature sets, then merge the text-based features into one dataframe, and finally join all feature sets together.

In [43]:
def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed before returning.

    NOTE: pickle can execute arbitrary code while loading; only load trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# One pickled feature set per file.
vocabFeat = _load_pickle("Pickles/VocabFeat.p")
lexFeat = _load_pickle("Pickles/LexFeat.p")
synFeat = _load_pickle("Pickles/SynFeat.p")
sPFeat = _load_pickle("Pickles/SPFeat.p")

In [44]:
synFeat

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,jj nns,jj to,nn in,nns in,in nn,dt nn,jj nn nn,nn nn nn,nn nn nns,to vb nn
0,becoming a fireman,0.0,0.0,0.333333,0.0,0.0,0.000000,0.000000,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.333333,0.0,0.0,0.0,0.0
1,hotel in Pocono Mountains,0.0,0.0,0.000000,0.0,0.0,0.250000,0.000000,0.0,0.000,...,0.0,0.0,0.250000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
2,wedding traditions buddhism,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
3,diversification in hiring,0.0,0.0,0.000000,0.0,0.0,0.333333,0.000000,0.0,0.000,...,0.0,0.0,0.333333,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
4,traiditional swahili recipes,0.0,0.0,0.000000,0.0,0.0,0.000000,0.333333,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0.0,0.0,0.181818,0.0,0.0,0.090909,0.090909,0.0,0.000,...,0.0,0.0,0.090909,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
1501,What is a fox's favorite kind of food?,0.0,0.0,0.125000,0.0,0.0,0.125000,0.125000,0.0,0.000,...,0.0,0.0,0.125000,0.0,0.125,0.125000,0.0,0.0,0.0,0.0
1502,"Show me the movie called ""The Martian""",0.0,0.0,0.285714,0.0,0.0,0.000000,0.142857,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.142857,0.0,0.0,0.0,0.0
1503,What is the biggest rock found on Mars?,0.0,0.0,0.125000,0.0,0.0,0.125000,0.000000,0.0,0.125,...,0.0,0.0,0.000000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0


Vocab

In [14]:
vocabFeat.shape

(1505, 34)

In [15]:
vocabFeat.head(3)

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,top250SterCount,top250SterRatAnt,...,com,net,org,edu,gov,http,AND,OR,quotes,inter
0,0.333333,becoming a fireman,0.666667,0.0,5.05,0.666667,2.646667,0.333333,0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.5,hotel in Pocono Mountains,0.5,0.0,6.15,0.75,3.9725,0.75,0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,wedding traditions buddhism,1.0,0.0,8.05,0.666667,4.683333,0.333333,0,0.0,...,0,0,0,0,0,0,0,0,0,0


lexFeat

In [16]:
lexFeat.shape

(1505, 27)

In [17]:
lexFeat.head(3)

Unnamed: 0,query,ld,ls1,ls2,vs1,vs2,cvs1,ndw,ttr,cttr,...,adjv,totalSyl,avgSyl,simWords,comWords,greatestSyl,leastSyl,numChars,numWords,avgLenWord
0,becoming a fireman,0.666667,0.5,0.333333,0.0,0.0,0.0,3,1.0,1.224745,...,0.0,7,2.333333,1,2,3,1,18,3,6.0
1,hotel in Pocono Mountains,0.75,0.666667,0.5,0.0,0.0,0.0,4,1.0,1.414214,...,0.0,8,2.0,3,1,3,1,25,4,6.25
2,wedding traditions buddhism,1.0,0.666667,0.666667,0.0,0.0,0.0,3,1.0,1.224745,...,0.0,7,2.333333,2,1,3,2,27,3,9.0


synFeat

In [18]:
synFeat.shape

(1505, 42)

In [19]:
synFeat.head(3)

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,nn nn,jj nn,nn nns,to vb,jj nns,jj to,nn in,nns in,in nn,dt nn
0,becoming a fireman,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
1,hotel in Pocono Mountains,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
2,wedding traditions buddhism,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
synFeat

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,nn nn,jj nn,nn nns,to vb,jj nns,jj to,nn in,nns in,in nn,dt nn
0,becoming a fireman,0.0,0.0,0.333333,0.0,0.0,0.000000,0.000000,0.0,0.000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000,0.333333
1,hotel in Pocono Mountains,0.0,0.0,0.000000,0.0,0.0,0.250000,0.000000,0.0,0.000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.250000,0.0,0.000,0.000000
2,wedding traditions buddhism,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000,0.000000
3,diversification in hiring,0.0,0.0,0.000000,0.0,0.0,0.333333,0.000000,0.0,0.000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.333333,0.0,0.000,0.000000
4,traiditional swahili recipes,0.0,0.0,0.000000,0.0,0.0,0.000000,0.333333,0.0,0.000,...,0.0,0.333333,0.333333,0.0,0.0,0.0,0.000000,0.0,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0.0,0.0,0.181818,0.0,0.0,0.090909,0.090909,0.0,0.000,...,0.0,0.090909,0.000000,0.0,0.0,0.0,0.090909,0.0,0.000,0.000000
1501,What is a fox's favorite kind of food?,0.0,0.0,0.125000,0.0,0.0,0.125000,0.125000,0.0,0.000,...,0.0,0.125000,0.000000,0.0,0.0,0.0,0.125000,0.0,0.125,0.125000
1502,"Show me the movie called ""The Martian""",0.0,0.0,0.285714,0.0,0.0,0.000000,0.142857,0.0,0.000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000,0.142857
1503,What is the biggest rock found on Mars?,0.0,0.0,0.125000,0.0,0.0,0.125000,0.000000,0.0,0.125,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000,0.000000


sPFeat

In [21]:
sPFeat.shape

(1505, 6)

In [22]:
sPFeat.head()

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError,punct,casing
0,0,becoming a fireman,0,0,0,0
1,1,hotel in Pocono Mountains,1,0,0,1
2,0,wedding traditions buddhism,0,0,0,0
3,0,diversification in hiring,0,0,0,0
4,2,traiditional swahili recipes,1,0,0,0


In [45]:
# Join the vocabulary and lexical feature sets on their row index.
# Both frames carry a 'query' column, which pandas suffixes as query_x / query_y;
# the right-hand copy is dropped and the left-hand one gets its plain name back.
SQS_Feat = (
    vocabFeat
    .merge(lexFeat, left_index=True, right_index=True)
    .drop(columns=['query_y'])
    .rename(columns={'query_x': 'query'})
)

In [46]:
SQS_Feat.shape

(1505, 60)

In [47]:
# Append the spelling features (sPFeat) by row index, keeping a single 'query' column.
SQS_Feat = (
    SQS_Feat
    .merge(sPFeat, left_index=True, right_index=True)
    .drop(columns=['query_y'])
    .rename(columns={'query_x': 'query'})
)

In [48]:
SQS_Feat.shape

(1505, 65)

In [49]:
# Append the syntactic features (synFeat) by row index, keeping a single 'query' column.
SQS_Feat = (
    SQS_Feat
    .merge(synFeat, left_index=True, right_index=True)
    .drop(columns=['query_y'])
    .rename(columns={'query_x': 'query'})
)

In [50]:
SQS_Feat.shape

(1505, 110)

In [51]:
SQS_Feat.head()

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,top250SterCount,top250SterRatAnt,...,jj nns,jj to,nn in,nns in,in nn,dt nn,jj nn nn,nn nn nn,nn nn nns,to vb nn
0,0.333333,becoming a fireman,0.666667,0.0,5.05,0.666667,2.646667,0.333333,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
1,0.5,hotel in Pocono Mountains,0.5,0.0,6.15,0.75,3.9725,0.75,0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,wedding traditions buddhism,1.0,0.0,8.05,0.666667,4.683333,0.333333,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.333333,diversification in hiring,0.666667,0.0,13.11,0.333333,5.6,0.333333,0,0.0,...,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,traiditional swahili recipes,1.0,0.0,6.95,0.333333,2.316667,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Attach the 'class' label from the SQS frame by row index.
# SQS also brings its own 'query' and an 'sID' column; neither is kept.
SQS_Feat = (
    SQS_Feat
    .merge(SQS, left_index=True, right_index=True)
    .drop(columns=['query_y', 'sID'])
    .rename(columns={'query_x': 'query'})
)

In [53]:
SQS_Feat.shape

(1505, 111)

In [54]:
(SQS_Feat['class']).value_counts()

0    1204
1     301
Name: class, dtype: int64

In [55]:
SQS_Feat.head(3)

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,top250SterCount,top250SterRatAnt,...,jj to,nn in,nns in,in nn,dt nn,jj nn nn,nn nn nn,nn nn nns,to vb nn,class
0,0.333333,becoming a fireman,0.666667,0.0,5.05,0.666667,2.646667,0.333333,0,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0
1,0.5,hotel in Pocono Mountains,0.5,0.0,6.15,0.75,3.9725,0.75,0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,wedding traditions buddhism,1.0,0.0,8.05,0.666667,4.683333,0.333333,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [56]:
SQS_Feat.tail(3)

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,top250SterCount,top250SterRatAnt,...,jj to,nn in,nns in,in nn,dt nn,jj nn nn,nn nn nn,nn nn nns,to vb nn,class
1502,0.428571,"Show me the movie called ""The Martian""",0.571429,0.0,6.21,0.714286,3.001429,0.142857,1,0.142857,...,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,1
1503,0.75,What is the biggest rock found on Mars?,0.25,0.0,10.95,0.75,3.89625,0.375,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1504,0.857143,What is the top game this week?,0.142857,0.0,5.11,0.857143,3.718571,0.285714,1,0.142857,...,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,1


In [57]:
SQS_Feat.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'top250SterCount', 'top250SterRatAnt',
       ...
       'jj to', 'nn in', 'nns in', 'in nn', 'dt nn', 'jj nn nn', 'nn nn nn',
       'nn nn nns', 'to vb nn', 'class'],
      dtype='object', length=111)

In [58]:
SQS_Feat.shape

(1505, 111)

In [59]:
# Write inside a context manager so the file is flushed and closed as soon as the
# dump finishes, rather than whenever the handle happens to be garbage collected.
with open("DataSets/SQSFeatures/SQSFeat.p", "wb") as out_file:
    pickle.dump(SQS_Feat, out_file)

In [60]:
pwd

'/Users/assoumerredempta/Documents/aSpring_2023/RYSe_Final/FeatureExtraction'

# Return Aggregated Extracted Features

The following block of code returns the extracted features aggregated with their respective data sets.

In [30]:
# pickle.dump(SWCAll, open( "DataSets/SWCFeatures/SWCFeat.p", "wb" ) )
# pickle.dump(SQSAll, open( "DataSets/SQSFeatures/SQSFeat.p", "wb" ) )

In [31]:
# SQSAll.shape

NameError: name 'SQSAll' is not defined

In [None]:
# SQSAll.isnull().sum().sum()

In [None]:
# SQSAll[' Level4'].value_counts()

In [None]:
# SQSAll['vs1'].unique()

In [None]:
# dt=SQSAll.copy()

In [None]:
# pd.Series.nunique(SQSAll[' Level0'])

In [None]:
# The merged feature table is built above as `SQS_Feat`, while this cell and the
# ones below refer to `SQSAll` / `dt`, which are only assigned in commented-out
# cells (hence the NameError on a fresh kernel). The commented-out dump of
# `SQSAll` targets the same pickle path as the dump of `SQS_Feat`, so it is
# presumably the same table under its former name -- TODO confirm.
SQSAll = SQS_Feat
dt = SQSAll.copy()

# Correlate numeric/boolean columns only: the table also holds the raw 'query'
# strings, which DataFrame.corr() rejects on pandas >= 2.0 instead of skipping.
# Columns with a single distinct value are left out (their correlation is undefined).
numericFeat = dt.select_dtypes(include=['number', 'bool'])
cormat = numericFeat.loc[:, numericFeat.nunique() != 1].corr()
# cormat.head()

In [None]:
# cormat.iloc[:5, ]

In [None]:
# varHigh=cormat['class'].abs().sort_values(ascending=False)[:20].index.tolist()
# varHigh

The following is the correlation matrix of the 20 features most strongly correlated (in absolute value) with `class`.

In [None]:
# `varHigh` is otherwise only assigned in a commented-out cell, so compute it here:
# the 20 columns with the largest absolute correlation with 'class'
# ('class' itself ranks first with correlation 1).
varHigh = cormat['class'].abs().sort_values(ascending=False)[:20].index.tolist()

plt.figure(figsize=(20,8))
sns.heatmap(cormat.loc[varHigh, varHigh], annot=True)
plt.title('Correlation matrix of top 20 most correlated features ')
plt.show()

In [None]:
# 'class' first (as in the previously pasted list), followed by the 20 columns whose
# absolute correlation with 'class' is lowest.
# The ranking used to be computed and then immediately overwritten by a hard-coded
# list of names; such a list goes stale whenever the feature sets change and makes
# cormat.loc[...] fail for any name that is no longer a column of `cormat`.
varLeast = ['class'] + cormat['class'].abs().sort_values(ascending=False)[-20:].index.tolist()

In [None]:
# Heatmap of the pairwise correlations among the `varLeast` columns.
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(cormat.loc[varLeast, varLeast], annot=True, cmap='PuRd', ax=ax)
ax.set_title('Correlation matrix of top 20 least correlated features ')
plt.show()

In [None]:
# Drop the self-correlation by label: `.iloc[1:]` removes whichever column comes
# first, which is only 'class' if 'class' happens to be the first column.
cormat['class'].drop('class').abs().sort_values(ascending=False)[:20].plot(kind='barh')

In [None]:
# Ten strongest absolute correlations with 'class'; the self-correlation is removed
# by label because `.iloc[1:]` drops the first column regardless of its name.
cormat['class'].drop('class').abs().sort_values(ascending=False)[:10]

In [None]:
# Ten weakest absolute correlations with 'class'. Dropping by label keeps the first
# feature column in the ranking (`.iloc[1:]` discarded it whenever 'class' was not first).
cormat['class'].drop('class').abs().sort_values(ascending=False)[-10:]


In [None]:
# Features whose absolute correlation with 'class' exceeds 0.3. 'class' itself is
# excluded by label rather than by position.
classCorr = cormat['class'].drop('class')
classCorr[classCorr.abs() > 0.3]

In [None]:
SQSAll['class'].value_counts()

In [None]:
SQSAll.shape

In [None]:
print('done')

Let's look at how the features are grouped: those classified as abstraction process, those classified as language development, and those not yet classified in either group (stored in `unClassFeat`).

In [None]:
# unClassFeat plus 'class': features assigned to neither the abstraction-process nor
# the language-development group. Names kept as comments are currently disabled.
unClassFeat = [
    'class',
    'coreVocab', 'nonCoreVocab', 'queryComplexity', 'stopCount',
    'com',   # 'net',
    'org',   # 'edu', 'gov',
    'http',  # 'AND',
    'OR', 'quotes', 'inter',
]

In [None]:
# Heatmap of the pairwise correlations among the unclassified features (and 'class').
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cormat.loc[unClassFeat, unClassFeat], annot=True, cmap='Purples', ax=ax)
ax.set_title('Correlation matrix of unclassified features')
plt.show()

In [None]:
# AbstrProcFeat plus 'class': the abstraction-process feature group.
# Grouped below as: surface form, single-tag columns, two-tag columns, three-tag columns.
# NOTE(review): 'jj to vb' is not among the trailing columns shown in the SQS_Feat
# output earlier in this notebook -- confirm the column exists before indexing with
# this list.
AbstrProcFeat = [
    'class',
    # surface form
    'casing', 'punct',
    # single tags
    'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
    'nn', 'nnp', 'nnps', 'nns', 'pos', 'prp', 'rb', 'rbr', 'rbs', 'rp',
    'to', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb',
    # tag pairs
    'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns',
    'jj to', 'nn in', 'nns in', 'in nn', 'dt nn',
    # tag triples
    'jj nn nn', 'nn nn nn', 'jj to vb', 'nn nn nns', 'to vb nn',
    # disabled:
    # 'level1', 'level2', 'level3', 'level4', 'level5', 'level6', 'level7', 'MeanLevel'
]

In [None]:
# Heatmap of the pairwise correlations among the abstraction-process features (and 'class').
fig, ax = plt.subplots(figsize=(40, 20))
sns.heatmap(cormat.loc[AbstrProcFeat, AbstrProcFeat], annot=True, cmap='Greens', ax=ax)
ax.set_title('Correlation matrix of abstract process features')
plt.show()

In [None]:
# LangDevFeat plus 'class': the language-development feature group.
# Names kept as comments are currently disabled.
# 'comWords' was disabled because indexing reported a missing 'comWordsgreatestSyl'
# label. That is exactly the string Python builds when two adjacent string literals
# are not separated by a comma ('comWords' 'greatestSyl'), so a dropped comma is the
# likely cause -- TODO confirm and re-enable.
LangDevFeat = [
    'class',
    'ld', 'ls1', 'ls2',
    # 'vs1', 'cvs1', 'vs2',
    'ndw', 'ttr', 'cttr', 'rttr',
    # 'logttr',
    'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv',
    'numWords', 'numChars', 'avgLenWord',
    'totalSyl', 'avgSyl', 'simWords',
    # 'comWords',
    'greatestSyl', 'leastSyl',
    'minAoA', 'maxAoA', 'ratioAoA',
    'SVEN',
    'top250SterCount', 'top250SterRatAnt', 'top250SterRatCon',
    'top250NonSterCount', 'top250NonSterRatAnt', 'top250NonSterRatCon',
    'top50SterCount', 'top50SterRatAnt', 'top50SterAntCon',
    'top50NonSterCount', 'top50NonSterRatAnt', 'top50NonSterAntCon',
    'tfidfAll', 'tfidfS', 'tfidfNS',
    'inter',
    'numSpellingErrors', 'kidsError', 'offByOne',
]

In [None]:
SQSAll['vs1'].unique()

In [None]:
SQSAll['vs2'].unique()

In [None]:
SQSAll['cvs1'].unique()

In [None]:
# plt.figure(figsize=(35,15))
# sns.heatmap(cormat.loc[LangDevFeat, LangDevFeat], annot=True, cmap='Blues')
# plt.title('Correlation matrix of language development features')
# plt.show()

### Correlation of features on str and nstr separately

In [None]:
# Split the feature table by label: class 1 -> "str", class 0 -> "nonstr".
SQSAllstr, SQSAllnonstr = (
    SQSAll.loc[SQSAll['class'].eq(label)] for label in (1, 0)
)

In [None]:
SQSAllstr.shape

In [None]:
SQSAllnonstr.shape 

In [None]:
unClassFeatNew = unClassFeat.copy() 


In [None]:
len(unClassFeatNew)

In [None]:
# Names left out for the str subset ('class' plus, per the original note, the
# features that have constant values).
unwanted1 = {'class', 'inter', 'http', 'quotes'}
unClassFeatNew1 = list(filter(lambda name: name not in unwanted1, unClassFeatNew))


In [None]:
unClassFeatNew1

In [None]:
# plt.figure(figsize=(10,6))
# sns.heatmap(cormat.loc[unClassFeatNew, unClassFeatNew], annot=True, cmap='Purples')
# plt.title('Correlation matrix of unclassified features on str')
# plt.show()

In [None]:
def _corr_of_varying(frame):
    """Correlation matrix over the numeric/boolean columns of ``frame`` that take more than one value.

    Non-numeric columns (e.g. the raw 'query' text) are excluded explicitly because
    DataFrame.corr() raises on them from pandas 2.0 onwards instead of skipping them.
    """
    numeric = frame.select_dtypes(include=['number', 'bool'])
    return numeric.loc[:, numeric.nunique() != 1].corr()


cormatStr = _corr_of_varying(SQSAllstr)
cormatNonStr = _corr_of_varying(SQSAllnonstr)


In [None]:
SQSAllstr['inter'].unique()

In [None]:
SQSAllstr['quotes'].unique()

In [None]:
SQSAllstr['http'].unique()

In [None]:
SQSAll['http'].unique()

In [None]:
# Heatmap of the unclassified-feature correlations within the str subset.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cormatStr.loc[unClassFeatNew1, unClassFeatNew1], annot=True, cmap='Purples', ax=ax)
ax.set_title('Correlation matrix of unclassified features on str')
plt.show()


In [None]:
# For the nonStr subset, leave out 'class' and 'com' (per the original note, the
# removed features have constant values).
unwanted2 = {'class', 'com'}
unClassFeatNew2 = list(filter(lambda name: name not in unwanted2, unClassFeatNew))


In [None]:
len(unClassFeatNew2)

In [None]:
# Heatmap of the unclassified-feature correlations within the nonStr subset.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cormatNonStr.loc[unClassFeatNew2, unClassFeatNew2], annot=True, cmap='Purples', ax=ax)
ax.set_title('Correlation matrix of unclassified features on nonStr')
plt.show()

### Compute selection

In [None]:
# Predictor names only: the unclassified feature list without the 'class' target.
unFeat = [name for name in unClassFeat if name != 'class']

In [None]:
# Feature selection
# Mutual information between each unclassified feature and the class label.
# The estimator is randomised, so the seed is fixed (same value as the random
# forests below) to make the ranking reproducible across runs.
mi=mutual_info_classif(SQSAll[unFeat], SQSAll['class'], random_state=42)

In [None]:
pd.Series(mi, index=unFeat).sort_values(ascending=False)

### Compute feature importance 
We look at the feature importance within each feature group (unclassified, language development, abstraction process) and then across all features together.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

#### - Unclassified features 


In [None]:
X_scaled=StandardScaler().fit_transform(SQSAll[unFeat])

In [None]:
model=RandomForestClassifier(n_estimators=200, bootstrap=False,
                             criterion='entropy', class_weight='balanced', random_state=42, n_jobs=-1)
model.fit(X_scaled, SQSAll['class'])

In [None]:
model.feature_importances_

In [None]:
pd.Series(model.feature_importances_, index=unFeat).sort_values(ascending=False)

In [None]:
model.score(X_scaled, SQSAll['class']) # NOTE: this is not right -- the score is computed on the same data the model was fitted on; the data needs to be split into train and test

#### - LangDevFeat

In [None]:
# Rebuild the list without the target instead of calling .remove('class'): the
# in-place removal raised ValueError when this cell was run a second time.
LangDevFeat = [feat for feat in LangDevFeat if feat != 'class']

In [None]:
X_scaled=StandardScaler().fit_transform(SQSAll[LangDevFeat])

In [None]:
model=RandomForestClassifier(n_estimators=200, bootstrap=False,
                             criterion='entropy', class_weight='balanced', random_state=42, n_jobs=-1)
model.fit(X_scaled, SQSAll['class'])

In [None]:
pd.Series(model.feature_importances_, index=LangDevFeat).sort_values(ascending=False)


#### - AbstrProcFeat

In [None]:
# Rebuild the list without the target instead of calling .remove('class'): the
# in-place removal raised ValueError when this cell was run a second time.
AbstrProcFeat = [feat for feat in AbstrProcFeat if feat != 'class']

In [None]:
X_scaled=StandardScaler().fit_transform(SQSAll[AbstrProcFeat])

In [None]:
model=RandomForestClassifier(n_estimators=200, bootstrap=False,
                             criterion='entropy', class_weight='balanced', random_state=42, n_jobs=-1)
model.fit(X_scaled, SQSAll['class'])

In [None]:
pd.Series(model.feature_importances_, index=AbstrProcFeat).sort_values(ascending=False)

#### All features

In [None]:
# Concatenate the three groups, keeping the first occurrence of each name.
# 'inter' is listed in both the unclassified and the language-development group;
# without de-duplication it would be selected twice, giving the model a duplicated
# column and the importance table a repeated index label.
allFeat = list(dict.fromkeys(unFeat + LangDevFeat + AbstrProcFeat))
# len(allFeat)

In [None]:
X_scaled=StandardScaler().fit_transform(SQSAll[allFeat])

In [None]:
# The model parameter values are from the best hyperparameters from 'ModelTry_TYSe.ipyn' file
# random_state is fixed (same seed as the per-group models above) so that the
# feature importances reported below are reproducible from run to run.
model=RandomForestClassifier(n_estimators=50, criterion='entropy',
                             max_depth=None, max_features=0.75, min_samples_split=3,
                             random_state=42)
model.fit(X_scaled, SQSAll['class'])

In [None]:
featImp_allFeat = pd.Series(model.feature_importances_, index=allFeat).sort_values(ascending=False)
featImp_allFeat[:80]

In [None]:
df_featImp=pd.DataFrame({'features':featImp_allFeat.index, 'values':featImp_allFeat.values})
df_featImp

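The three feature lists contribute only 101 names here, yet there are supposed to be 118 features. Where are the other 17, and what happened to them? Explanation: `allFeat` is built from the three feature lists defined above, and 17 names are commented out in those lists — `net`, `edu`, `gov`, `AND` (unclassified); `vs1`, `cvs1`, `vs2`, `logttr`, `comWords` (language development); `level1`–`level7` and `MeanLevel` (abstraction process). Note also that `inter` is listed in both the unclassified and the language-development lists, so the 101 names cover 100 distinct features.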

In [None]:
print('done')

In [None]:
pwd

## Data simple exploration

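In this section we compare simple query statistics (number of words, number of characters, average word length, casing) and the most frequent terms between the two classes.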

In [None]:
SQSAll.shape # all queries 


In [None]:
(SQSAll['class']==1).value_counts()

In [None]:
# Same split as earlier in the notebook: class 1 -> "str", class 0 -> "nonstr".
isStrMask = SQSAll['class'] == 1
isNonStrMask = SQSAll['class'] == 0
SQSAllstr = SQSAll[isStrMask]
SQSAllnonstr = SQSAll[isNonStrMask]

In [None]:
from matplotlib import pyplot as plt
import numpy as np

# Set the default figure size BEFORE the figure is created. It used to be assigned
# after plt.subplot(), so it only took effect for figures created later and the
# first run of this cell was drawn at the previous default size.
plt.rcParams['figure.figsize'] = [13, 6]
fig, (axStr, axNonStr) = plt.subplots(1, 2)

# Left panel: words per query for class 1 (str).
SQSAllstr['numWords'].hist(ax=axStr)
axStr.set_title("Distribution of number of words per query - str")
axStr.set_xlabel(' ')
axStr.set_ylabel(' ')

# Right panel: words per query for class 0 (nStr).
SQSAllnonstr['numWords'].hist(ax=axNonStr)
axNonStr.set_title("Distribution of number of words per query - nStr")
axNonStr.set_xlabel(' ')
axNonStr.set_ylabel(' ')

plt.show()

In [None]:
from matplotlib import pyplot as plt
import numpy as np

# Set the default figure size BEFORE the figure is created; assigning it after
# plt.subplot() only affects figures created afterwards.
plt.rcParams['figure.figsize'] = [13, 6]
fig, (axStr, axNonStr) = plt.subplots(1, 2)

# Left panel: characters per query for class 1 (str).
SQSAllstr['numChars'].hist(ax=axStr)
axStr.set_title("Distribution of number of char per query - str")
axStr.set_xlabel(' ')
axStr.set_ylabel(' ')

# Right panel: characters per query for class 0 (nStr).
SQSAllnonstr['numChars'].hist(ax=axNonStr)
axNonStr.set_title("Distribution of number of char per query - nStr")
axNonStr.set_xlabel(' ')
axNonStr.set_ylabel(' ')

plt.show()

##### -- str

In [None]:
SQSAllstr['numWords'].describe()

In [None]:
SQSAllstr['numChars'].describe()

In [None]:
SQSAllstr['avgLenWord'].describe()

##### -- nonstr

In [None]:
SQSAllnonstr['numWords'].describe()

In [None]:
SQSAllnonstr['numChars'].describe()

In [None]:
SQSAllnonstr['avgLenWord'].describe()

In [None]:
pd.DataFrame({
    'casing': SQSAll['casing'].value_counts(),
    'class': SQSAll['class'].value_counts()
})

In [None]:
pd.crosstab(SQSAll['class'], SQSAll['casing'])

In [None]:
SQSAll['class'] == 0

In [None]:
SQS.shape

In [None]:
SQS

In [None]:
adult = SQS[SQS['class'] == 0]
adult

In [None]:
kids = SQS[SQS['class'] == 1]
kids

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Separate vocabularies and term counts for the class-0 (`adult`) and class-1 (`kids`) queries.
vec_adults=CountVectorizer()
vec_kids=CountVectorizer()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary, as an array.
X_adults=vec_adults.fit_transform(adult['query'])
adults_feats=vec_adults.get_feature_names_out()
X_kids=vec_kids.fit_transform(kids['query'])
kids_feats=vec_kids.get_feature_names_out()

# Dense document-term tables: one row per query, one column per vocabulary term.
adults_counts_df = pd.DataFrame(X_adults.toarray(), columns=adults_feats)
kids_counts_df = pd.DataFrame(X_kids.toarray(), columns=kids_feats)

In [None]:
ac_df = adults_counts_df.sum().sort_values(ascending=False) # ac_df = adults counts 
ac_df.head(30) # for adults

In [None]:
sns.histplot(ac_df, kde=True)
plt.show()

In [None]:
kc_df = kids_counts_df.sum().sort_values(ascending=False)
kc_df.head(30) # For kids

In [None]:
sns.histplot(kc_df, kde=True)
plt.show()

In [None]:
kids.shape

### Without stopwords

In [None]:
# Same term counts as above, with scikit-learn's built-in English stop-word list removed.
vec_adults=CountVectorizer(stop_words='english')
vec_kids=CountVectorizer(stop_words='english')


# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary, as an array.
X_adults=vec_adults.fit_transform(adult['query'])
adults_feats=vec_adults.get_feature_names_out()
X_kids=vec_kids.fit_transform(kids['query'])
kids_feats=vec_kids.get_feature_names_out()

# Dense document-term tables: one row per query, one column per vocabulary term.
adults_counts_df = pd.DataFrame(X_adults.toarray(), columns=adults_feats)
kids_counts_df = pd.DataFrame(X_kids.toarray(), columns=kids_feats)

In [None]:
adults_counts_df.sum().sort_values(ascending=False)

In [None]:
kids_c_df_ns = kids_counts_df.sum().sort_values(ascending=False) # not stopwords
kids_c_df_ns

In [None]:
All