In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats

In [2]:
df=pd.read_csv('../../csv/combined_3.csv')

In [7]:
print(f"The number of columns in the CSV file is: {df.shape[1]}")

The number of columns in the CSV file is: 37


In [7]:
columns_list = df.columns
print("Columns of the DataFrame:")
print(columns_list)

Columns of the DataFrame:
Index(['essay', 'topic', 'para_count', 'word_count', 'sentence_count',
       'sentence_len(mean)', 'sentence_len(max)', 'sentence_len(min)',
       'avg_stop_word', 'unique_words', 'noun', 'verb', 'adj', 'pron',
       'modal verb', 'occurrance_of_most_freq_word', 'rate_of_occurrance',
       'avg sentence complexity', 'lexical diversity', 'word_sim_p_count',
       'word_sim_p_score', 'word_sim_n_count', 'word_sim_n_score',
       'word_sim_score', 'flesch grade', 'Automated Readability',
       'coleman_liau', 'positive_sentiment', 'negative_sentiment',
       'neutral_sentiment', 'discourse_marker', 'grammar', 'sw_rate',
       'sum_repeated_ngrams', 'count_repeated_ngrams', 'max_occurrence_ngrams',
       'label'],
      dtype='object')


Split features and targets

In [24]:
X = df.iloc[:,2:36]  #independent columns
x = X.drop(columns=['word_sim_n_score'])
y = df.iloc[:,-1]   

# Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif
# determine the mutual information
mutual_info = mutual_info_classif(X, y)
mutual_info

In [None]:
mutual_info = pd.Series(mutual_info)
mutual_info.index = X.columns
mutual_info.sort_values(ascending=False)

# Wrapper method

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

forward

In [None]:
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs = -1),
         k_features = 15,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = 4,
          n_jobs= -1
         ).fit(X, y)

In [29]:
sfs.k_feature_names_

('para_count',
 'sentence_count',
 'sentence_len(max)',
 'sentence_len(min)',
 'noun',
 'verb',
 'pron',
 'modal verb',
 'avg sentence complexity',
 'lexical diversity',
 'Automated Readability',
 'negative_sentiment',
 'grammar',
 'sw_rate',
 'count_repeated_ngrams')

In [30]:
sfs.k_score_

0.9833333333333334

In [None]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

backward

In [None]:
sfs1 = SFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs = -1),
         k_features = 15,
          forward= False,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = 4,
          n_jobs= -1
         ).fit(X, y)

In [33]:
sfs1.k_feature_names_

('para_count',
 'sentence_len(mean)',
 'sentence_len(max)',
 'sentence_len(min)',
 'avg_stop_word',
 'verb',
 'pron',
 'modal verb',
 'occurrance_of_most_freq_word',
 'lexical diversity',
 'word_sim_n_count',
 'flesch grade',
 'grammar',
 'sw_rate',
 'sum_repeated_ngrams')

In [None]:
sfs1.k_score_

# chi-square

In [35]:
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(x,y)

In [36]:
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)

In [38]:
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['features','Score']  #naming the dataframe columns

In [39]:
print(featureScores.nlargest(10,'Score')) 

                        features        Score
17              word_sim_p_count  1397.485748
1                     word_count  1324.487872
30           sum_repeated_ngrams  1089.031149
31         count_repeated_ngrams   477.000000
28                       grammar   325.716141
20                word_sim_score   313.725139
18              word_sim_p_score   312.001158
27              discourse_marker   297.474466
7                   unique_words   218.667883
13  occurrance_of_most_freq_word   150.968049


# Wilcoxon rank-sum statistic

In [None]:
# Split the DataFrame based on the 'label' column
df1 = df[df['label'] == 1]
df2 = df[df['label'] == 0]

# Display the resulting DataFrames
print("DataFrame 1 (label=1):")
print(df1)

print("\nDataFrame 2 (label=0):")
print(df2)

In [4]:
# Drop the target column 'Class' from both DataFrames
data1 = df1.drop(columns=['label'])
data2 = df2.drop(columns=['label'])

# Perform the Wilcoxon rank-sum test for each feature
results = {}
for feature in data1.columns:
    statistic, p_value = stats.ranksums(data1[feature], data2[feature])
    results[feature] = {'Wilcoxon rank-sum statistic': statistic, 'P-value': p_value}

sorted_results = sorted(results.items(), key=lambda x: x[1]['Wilcoxon rank-sum statistic'], reverse=True)

# Display the results for each feature
for feature, result in results.items():
    print(f"Feature: {feature}")
    print(f"Wilcoxon rank-sum statistic: {result['Wilcoxon rank-sum statistic']}")
    print(f"P-value: {result['P-value']}")

# Set your significance level (alpha) and make a decision
alpha = 0.05

significant_features = [feature for feature, result in results.items() if result['P-value'] < alpha]
print("Significant features:", significant_features)
print("number of significant features ",len(significant_features))

Feature: essay
Wilcoxon rank-sum statistic: -1.0302842880902894
P-value: 0.3028765724597571
Feature: topic
Wilcoxon rank-sum statistic: 0.06123136595885441
P-value: 0.9511749501677654
Feature: para_count
Wilcoxon rank-sum statistic: 12.332795774104046
P-value: 6.031701026206971e-35
Feature: word_count
Wilcoxon rank-sum statistic: 11.143443046185862
P-value: 7.708060460101128e-29
Feature: sentence_count
Wilcoxon rank-sum statistic: 9.836952052955088
P-value: 7.803927086263536e-23
Feature: sentence_len(mean)
Wilcoxon rank-sum statistic: -2.698839010469072
P-value: 0.006958182757862988
Feature: sentence_len(max)
Wilcoxon rank-sum statistic: -5.859575498932111
P-value: 4.640518741546976e-09
Feature: sentence_len(min)
Wilcoxon rank-sum statistic: 1.6399357143762745
P-value: 0.10101853406053538
Feature: avg_stop_word
Wilcoxon rank-sum statistic: -7.956749783022874
P-value: 1.7661737798124844e-15
Feature: unique_words
Wilcoxon rank-sum statistic: 7.337780540177933
P-value: 2.1716486733373905e