In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats

In [3]:
# Read the combined essay-feature table; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../../csv/combined_3.csv')

In [7]:
# Report how many columns the loaded frame has.
print(f"The number of columns in the CSV file is: {len(df.columns)}")

The number of columns in the CSV file is: 37


In [7]:
# Keep the column index around and show it under a short heading.
columns_list = df.columns
print("Columns of the DataFrame:", columns_list, sep="\n")

Columns of the DataFrame:
Index(['essay', 'topic', 'para_count', 'word_count', 'sentence_count',
       'sentence_len(mean)', 'sentence_len(max)', 'sentence_len(min)',
       'avg_stop_word', 'unique_words', 'noun', 'verb', 'adj', 'pron',
       'modal verb', 'occurrance_of_most_freq_word', 'rate_of_occurrance',
       'avg sentence complexity', 'lexical diversity', 'word_sim_p_count',
       'word_sim_p_score', 'word_sim_n_count', 'word_sim_n_score',
       'word_sim_score', 'flesch grade', 'Automated Readability',
       'coleman_liau', 'positive_sentiment', 'negative_sentiment',
       'neutral_sentiment', 'discourse_marker', 'grammar', 'sw_rate',
       'sum_repeated_ngrams', 'count_repeated_ngrams', 'max_occurrence_ngrams',
       'label'],
      dtype='object')


Split features and targets

In [5]:
# Feature matrix: every column except the raw essay text, the topic and the
# target. For the 37-column layout printed above this is exactly
# df.iloc[:, 2:36], but selecting by name cannot silently pick up the wrong
# columns (e.g. the label) if the frame's layout ever changes.
X = df.drop(columns=['essay', 'topic', 'label'])  # independent columns

# Variant of X without 'word_sim_n_score'; this is the frame handed to the
# chi-square selector further down.
# NOTE(review): presumably dropped because chi2 requires non-negative
# inputs and this score can be negative - confirm.
x = X.drop(columns=['word_sim_n_score'])

# Target vector.
y = df['label']

# Information Gain

In [6]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each feature in X and the label y.
# mutual_info_classif is randomised (it adds small noise to continuous
# features), so the seed is fixed to make the scores - and the ranking built
# from them - reproducible across kernel restarts.
mutual_info = mutual_info_classif(X, y, random_state=0)
mutual_info

array([0.32336219, 0.26857238, 0.27727292, 0.08613382, 0.1303849 ,
       0.04739233, 0.18368021, 0.09137363, 0.18256583, 0.04049477,
       0.02322626, 0.21799095, 0.00419879, 0.17749116, 0.        ,
       0.09826652, 0.28048213, 0.29481579, 0.20859401, 0.01683277,
       0.        , 0.20981876, 0.10633491, 0.2104771 , 0.31534954,
       0.05951109, 0.00883636, 0.04414126, 0.20798496, 0.27342711,
       0.27823962, 0.24898913, 0.21145121, 0.16941967])

In [7]:
# Label each score with its feature name, then list the features from most
# to least informative.
mutual_info = pd.Series(np.asarray(mutual_info), index=X.columns)
mutual_info.sort_values(ascending=False)

para_count                      0.323362
coleman_liau                    0.315350
word_sim_p_count                0.294816
lexical diversity               0.280482
sw_rate                         0.278240
sentence_count                  0.277273
grammar                         0.273427
word_count                      0.268572
sum_repeated_ngrams             0.248989
pron                            0.217991
count_repeated_ngrams           0.211451
Automated Readability           0.210477
word_sim_score                  0.209819
word_sim_p_score                0.208594
discourse_marker                0.207985
avg_stop_word                   0.183680
noun                            0.182566
occurrance_of_most_freq_word    0.177491
max_occurrence_ngrams           0.169420
sentence_len(max)               0.130385
flesch grade                    0.106335
avg sentence complexity         0.098267
unique_words                    0.091374
sentence_len(mean)              0.086134
positive_sentime

# Wrapper method

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

Forward selection (start from no features and add one at a time)

In [None]:
# Sequential forward selection of 15 features, scored by 4-fold
# cross-validated accuracy of a 100-tree random forest.
sfs = SFS(
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0),
    k_features=15,
    forward=True,
    floating=False,
    scoring='accuracy',
    cv=4,
    verbose=2,
    n_jobs=-1,
)
# fit() returns the selector itself, so `sfs` stays bound to the fitted object.
sfs = sfs.fit(X, y)

In [29]:
# Names of the features kept by the forward selector.
sfs.k_feature_names_

('para_count',
 'sentence_count',
 'sentence_len(max)',
 'sentence_len(min)',
 'noun',
 'verb',
 'pron',
 'modal verb',
 'avg sentence complexity',
 'lexical diversity',
 'Automated Readability',
 'negative_sentiment',
 'grammar',
 'sw_rate',
 'count_repeated_ngrams')

In [30]:
# Cross-validated score of the selected subset (accuracy, as set by
# scoring= when `sfs` was built).
sfs.k_score_

0.9833333333333334

In [None]:
# Tabulate the forward selector's metric dict; transposed so that each entry
# of the dict becomes a row.
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Backward selection (start from all features and remove one at a time)

In [None]:
# Sequential backward selection down to 15 features, scored by 4-fold
# cross-validated accuracy of a 100-tree random forest.
sfs1 = SFS(
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0),
    k_features=15,
    forward=False,
    floating=False,
    scoring='accuracy',
    cv=4,
    verbose=2,
    n_jobs=-1,
)
# fit() returns the selector itself, so `sfs1` stays bound to the fitted object.
sfs1 = sfs1.fit(X, y)

In [33]:
# Names of the features kept by the backward selector.
sfs1.k_feature_names_

('para_count',
 'sentence_len(mean)',
 'sentence_len(max)',
 'sentence_len(min)',
 'avg_stop_word',
 'verb',
 'pron',
 'modal verb',
 'occurrance_of_most_freq_word',
 'lexical diversity',
 'word_sim_n_count',
 'flesch grade',
 'grammar',
 'sw_rate',
 'sum_repeated_ngrams')

In [None]:
# Cross-validated score of the subset kept by the backward selector
# (accuracy, as set by scoring= when `sfs1` was built).
sfs1.k_score_

# chi-square

In [8]:
# Score every feature in x against y with the chi-squared statistic; the
# selector is configured to keep the 10 highest-scoring features.
bestfeatures = SelectKBest(chi2, k=10)
fit = bestfeatures.fit(x, y)

In [9]:
# Wrap the feature names and their chi-squared scores in single-column
# frames so they can be placed side by side afterwards.
dfcolumns = pd.DataFrame(x.columns)
dfscores = pd.DataFrame(fit.scores_)

In [10]:
# Put names and scores next to each other, then give the two columns
# readable labels.
featureScores = pd.concat([dfcolumns, dfscores], axis="columns")
featureScores.columns = ["features", "Score"]

In [13]:
# Show up to 30 features with the highest chi-squared score, best first.
print(featureScores.nlargest(n=30, columns='Score'))

                        features        Score
17              word_sim_p_count  1397.485748
1                     word_count  1324.487872
30           sum_repeated_ngrams  1089.031149
31         count_repeated_ngrams   477.000000
28                       grammar   325.716141
20                word_sim_score   313.725139
18              word_sim_p_score   312.001158
27              discourse_marker   297.474466
7                   unique_words   218.667883
13  occurrance_of_most_freq_word   150.968049
4              sentence_len(max)    80.929551
2                 sentence_count    78.741866
23                  coleman_liau    57.216683
32         max_occurrence_ngrams    40.438980
0                     para_count    29.803353
6                  avg_stop_word    29.766527
22         Automated Readability    26.683763
19              word_sim_n_count    26.504225
21                  flesch grade    24.052903
3             sentence_len(mean)     4.371057
24            positive_sentiment  

# Wilcoxon rank-sum statistic

In [None]:
import pandas as pd
from scipy.stats import ranksums

In [None]:
# Reload the feature table for the rank-sum analysis (this rebinds `df`).
# NOTE(review): this path is '../csv/...' while the load at the top of the
# notebook uses '../../csv/...' - confirm which one matches the notebook's
# location.
df=pd.read_csv('../csv/combined_3.csv')

In [None]:
# Drop the raw essay text and the topic column before testing.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(columns=['essay', 'topic'], errors='ignore')

In [None]:

# Split the rows by class label so each feature can be compared between
# the two groups.
group_0 = df[df['label'] == 0]
group_1 = df[df['label'] == 1]

# Wilcoxon rank-sum p-value for every column other than the label.
# The label is excluded by name (matching how it is filtered above) instead
# of assuming it is the last column, and the result's .pvalue is read
# directly so nothing is assigned to `_` (IPython's last-output variable).
p_values = {
    column: ranksums(group_0[column], group_1[column]).pvalue
    for column in df.columns.drop('label')
}

# Sort the dictionary based on p-values (smallest first)
sorted_p_values = sorted(p_values.items(), key=lambda item: item[1])

# Print or manipulate the sorted list of features and p-values
for feature, p_value in sorted_p_values:
    print(f"Feature: {feature}, p-value: {p_value}")


In [None]:
# Re-sort here so this cell only needs `p_values` from the kernel state.
sorted_p_values = sorted(p_values.items(), key=lambda item: item[1])

# Print the features with their rank (rank 1 = smallest p-value).
# enumerate replaces the hand-maintained counter, and the unused p-value is
# bound to a private name rather than `_`, which IPython uses for the last
# cell output.
for rank, (feature, _p_value) in enumerate(sorted_p_values, start=1):
    print(f"Rank: {rank}, Feature: {feature}")