In [None]:
import ast
import math
import re
import string
from collections import Counter, defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:

# Fetch the tokenizer models and stop-word lists used by word_tokenize,
# sent_tokenize and stopwords.words below.
# NOTE(review): recent NLTK releases load the Punkt models from the separate
# 'punkt_tab' resource, so it is requested alongside the legacy 'punkt'
# package; on older releases the extra request should just be reported as
# unavailable — confirm against the pinned NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
df = pd.read_csv('/content/df_balanced_unshuffled_4columns.csv')

In [None]:
# Each 'text' cell is parsed as a Python literal and its items are joined with
# single spaces. ast.literal_eval accepts literals only, so — unlike eval —
# nothing embedded in the CSV can be executed while loading the data.
# NOTE(review): presumably every cell is the repr of a list of strings; a cell
# that is not a pure literal will now raise instead of being evaluated.
df['text'] = df['text'].apply(lambda x: ' '.join(ast.literal_eval(x)))

In [None]:
# Built once: deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the stop-word corpus is read once, not once per row.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize raw text into a space-joined string of filtered tokens.

    The text is lowercased, bracketed spans (``[...]``, matched non-greedily
    and never across a line break) are removed, every character in
    ``string.punctuation`` is deleted, the remainder is split with
    ``word_tokenize`` and English stop words are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # The punctuation table already covers < > \ / , ' and ", so no separate
    # regex pass is needed for those characters.
    # NOTE(review): sentence-ending punctuation is removed here as well, so any
    # sentence splitting applied to the returned text has no terminators left.
    text = text.translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

In [None]:
# Clean every document in place; the raw joined text is overwritten.
df['text'] = df['text'].map(preprocess_text)

In [None]:
############################################################
############################################################
#PERPLEXITY#
############################################################
############################################################

In [None]:
perplexity_df = df.copy()

In [None]:
# Per-document list of sentence strings, as split by sent_tokenize.
perplexity_df['sentences'] = perplexity_df['text'].map(sent_tokenize)

In [None]:
# Flatten every document's sentences into one stream, then tokenize each
# sentence, giving a corpus-wide list of token lists.
_flat_sentences = (
    sentence
    for sentences in perplexity_df['sentences']
    for sentence in sentences
)
all_sentences = list(map(word_tokenize, _flat_sentences))

In [None]:
# Adjacent token pairs, collected sentence by sentence so that no pair spans
# two sentences.
bigrams = []
for _tokens in all_sentences:
    bigrams.extend(zip(_tokens[:-1], _tokens[1:]))

# Corpus-wide counts read by calculate_perplexity.
bigram_freq = Counter(bigrams)
unigram_freq = Counter()
for _tokens in all_sentences:
    unigram_freq.update(_tokens)

def calculate_perplexity(sentence):
    """Score one sentence with the add-one smoothed bigram counts.

    Each adjacent token pair (w1, w2) gets probability
    ``(bigram_freq[(w1, w2)] + 1) / (unigram_freq[w1] + vocab_size)`` where
    ``vocab_size`` is the number of distinct keys in ``unigram_freq``. The
    result is ``exp(-mean(log(p)))`` over those probabilities.

    Args:
        sentence: Sentence text; it is tokenized with ``word_tokenize``.

    Returns:
        The perplexity, or ``np.nan`` when the sentence yields no token pair
        (fewer than two tokens).
    """
    words = word_tokenize(sentence)
    vocab_size = len(unigram_freq)

    pair_probs = [
        (bigram_freq[(prev, nxt)] + 1) / (unigram_freq[prev] + vocab_size)
        for prev, nxt in zip(words[:-1], words[1:])
    ]

    if not pair_probs:
        return np.nan
    return np.exp(-np.mean(np.log(pair_probs)))


def tokenize_sentences(text):
    """Split ``text`` into sentences and return them as a labelled Series.

    Args:
        text: The document string passed to ``sent_tokenize``.

    Returns:
        A ``pd.Series`` of the sentences whose index labels are
        ``sentence1``, ``sentence2``, ... in order of appearance.
    """
    parts = sent_tokenize(text)
    labels = []
    for position in range(1, len(parts) + 1):
        labels.append(f'sentence{position}')
    return pd.Series(parts, index=labels)

# One row per document with a "sentenceN" column per sentence, placed next to
# the document label.
tokenized_df = perplexity_df['text'].apply(tokenize_sentences)
_label_frame = perplexity_df.loc[:, ['label']]
final_df_perplexity = pd.concat([tokenized_df, _label_frame], axis='columns')


def apply_perplexity(row):
    """Average the sentence perplexities found in one row.

    Every entry whose index label starts with ``'sentence'`` and whose value is
    not missing is scored with ``calculate_perplexity``. Scores that come back
    as NaN (sentences with no token pair) are left out so that one unscorable
    sentence does not turn the whole document's mean into NaN.

    Args:
        row: A ``pd.Series`` with string index labels, typically one row of the
            per-sentence frame.

    Returns:
        A one-element ``pd.Series`` with key ``'mean_perplexity'``; the value
        is ``np.nan`` when the row has no scorable sentence.
    """
    scores = []
    for col in row.index:
        if not col.startswith('sentence'):
            continue
        sentence = row[col]
        if pd.isna(sentence):
            continue
        score = calculate_perplexity(sentence)
        if not pd.isna(score):
            scores.append(score)

    # Guard explicitly: np.mean([]) also yields NaN but emits RuntimeWarnings.
    mean_perplexity = float(np.mean(scores)) if scores else np.nan
    return pd.Series({'mean_perplexity': mean_perplexity})

# Score every row of the per-sentence frame.
perplexity_results = final_df_perplexity.apply(apply_perplexity, axis='columns')
perplexity_results_df = pd.DataFrame(perplexity_results, columns=['mean_perplexity'])

# final_df_perplexity is rebound here: from now on it holds only the document
# text and its mean perplexity (the sentence columns and label are not kept).
final_df_perplexity = pd.concat(
    [perplexity_df['text'], perplexity_results_df],
    axis='columns',
)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [None]:
final_df_perplexity.head(500)

Unnamed: 0,text,mean_perplexity,median_perplexity,std_perplexity
0,windows aligned structural spars aircraft fuse...,33594.317670,33594.317670,0.0
1,tried bacon really fucking good widely acknowl...,28637.472335,28637.472335,0.0
2,opinion common nowadays two three decades ago ...,47428.782080,47428.782080,0.0
3,walk cop tell take ecstasy every day prove thi...,18049.460322,18049.460322,0.0
4,compressor ac nt burn fuel driven car engine v...,20317.841977,20317.841977,0.0
...,...,...,...,...
495,closest thing looking would forex exchanges cu...,31130.108295,31130.108295,0.0
496,bit switch zero 1 byte 8 bits byte would look ...,17278.289371,17278.289371,0.0
497,community policing mostly staff check articles...,32802.851508,32802.851508,0.0
498,real simple definition analogy present value w...,23489.076846,23489.076846,0.0


In [None]:
# Persist the perplexity features without the row index.
output_path = '/content/final_df_perplexity.csv'
final_df_perplexity.to_csv(path_or_buf=output_path, index=False)

In [None]:
############################################################
############################################################
#BURSTINESS#
############################################################
############################################################

In [None]:
burstiness_df = df.copy()

In [None]:
# Keep only the columns this section needs. The explicit .copy() makes the
# result an independent frame, so the feature columns assigned to it later do
# not raise pandas' SettingWithCopyWarning.
burstiness_df = burstiness_df[['text', 'label']].copy()
burstiness_df.head(2)

Unnamed: 0,text,label
0,windows aligned structural spars aircraft fuse...,0
1,tried bacon really fucking good widely acknowl...,0


In [None]:
burstiness_df.shape

(47734, 2)

In [None]:
def compute_burstiness(text):
    """Return ``(variance - mean) / (variance + mean)`` of word frequencies.

    ``text`` is cleaned with ``preprocess_text`` and split on whitespace; the
    occurrences of each distinct word are counted, and the score is computed
    from the mean and (population) variance of those counts.

    Args:
        text: The document string.

    Returns:
        The score as a float, or ``np.nan`` when no word survives cleaning.
    """
    # preprocess_text returns a single space-joined string; split it so that
    # words are counted rather than individual characters.
    words = preprocess_text(text).split()
    if not words:
        # Nothing to count: report "undefined" without numpy's empty-slice warnings.
        return np.nan

    frequencies = np.array(list(Counter(words).values()))
    mean_freq = np.mean(frequencies)
    variance = np.var(frequencies)
    # Every count is at least 1, so mean_freq > 0 and the denominator is non-zero.
    return (variance - mean_freq) / (variance + mean_freq)

# First burstiness feature: one score per document.
burstiness_df['burstiness1'] = burstiness_df['text'].map(compute_burstiness)

  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [None]:
burstiness_df.head(2);

In [None]:
burstiness_df.head(2);

In [None]:
def compute_burstiness(text):
    """Return ``variance / mean**2`` of the word frequencies in ``text``.

    The text is split on whitespace, the occurrences of each distinct word are
    counted, and the (population) variance of those counts is divided by the
    square of their mean.

    Args:
        text: The document string; it is used as given, without extra cleaning.

    Returns:
        The score as a float, or ``np.nan`` when ``text`` contains no words.
    """
    words = text.split()
    if not words:
        # Nothing to count: report "undefined" without numpy's empty-slice warnings.
        return np.nan

    frequencies = np.array(list(Counter(words).values()))
    mean_freq = np.mean(frequencies)
    variance = np.var(frequencies)
    return variance / mean_freq**2

# Second burstiness feature, computed with the compute_burstiness defined in this cell.
burstiness_df['burstiness2'] = burstiness_df['text'].map(compute_burstiness)

  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [None]:
burstiness_df.head()

Unnamed: 0,text,label,burstiness1,burstiness2
0,windows aligned structural spars aircraft fuse...,0,0.871553,0.261974
1,tried bacon really fucking good widely acknowl...,0,0.578668,0.172023
2,opinion common nowadays two three decades ago ...,0,0.874788,0.162393
3,walk cop tell take ecstasy every day prove thi...,0,0.886011,0.118802
4,compressor ac nt burn fuel driven car engine v...,0,0.939385,0.917867


In [None]:
# Persist the burstiness features without the row index.
output_path = '/content/final_df_burstiness.csv'
burstiness_df.to_csv(path_or_buf=output_path, index=False)

In [None]:
############################################################
############################################################
#READABILITY SCORE#
############################################################
############################################################

In [None]:
readabilityScore_df = df.copy()

In [None]:
readabilityScore_df.head()

Unnamed: 0.1,Unnamed: 0,text,avg line length,word density,label
0,0,windows aligned structural spars aircraft fuse...,26.333333,66.455696,0
1,1,tried bacon really fucking good widely acknowl...,13.8,68.115942,0
2,2,opinion common nowadays two three decades ago ...,25.222222,55.947137,0
3,3,walk cop tell take ecstasy every day prove thi...,22.454545,56.680162,0
4,4,compressor ac nt burn fuel driven car engine v...,23.0,44.927536,0


In [None]:
# Keep only the columns this section needs. The explicit .copy() makes the
# result an independent frame, so the score columns assigned to it later do
# not raise pandas' SettingWithCopyWarning.
readabilityScore_df = readabilityScore_df[['text', 'label']].copy()
readabilityScore_df.head(2)

Unnamed: 0,text,label
0,windows aligned structural spars aircraft fuse...,0
1,tried bacon really fucking good widely acknowl...,0


In [None]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` (heuristic, never below 1).

    Each run of consecutive vowel letters (a, e, i, o, u, y; case-insensitive)
    counts as one syllable, so "book" is 1 rather than 2. A single trailing
    "e" is treated as silent ("make" -> 1) unless the word ends in "le"
    ("table" -> 2) or that "e" is the word's only vowel group ("the" -> 1).

    Args:
        word: A single token.

    Returns:
        The estimated syllable count as an int, at least 1 (also for tokens
        without any vowel letter, including the empty string).
    """
    vowel_groups = re.findall(r'[aeiouy]+', word, re.I)
    count = len(vowel_groups)

    lowered = word.lower()
    has_silent_final_e = (
        count > 1
        and lowered.endswith('e')
        and not lowered.endswith('le')
        and vowel_groups[-1].lower() == 'e'
    )
    if has_silent_final_e:
        count -= 1

    return max(1, count)

def flesch_kincaid_grade(text):
    """Compute ``0.39 * words/sentences + 11.8 * syllables/words - 15.59``.

    Words come from ``word_tokenize``, sentences from ``sent_tokenize`` and
    syllables from ``count_syllables`` summed over every word token.

    Args:
        text: The document string.

    Returns:
        The score as a float, or ``0`` when the text has no words or no
        sentences.
    """
    tokens = word_tokenize(text)
    word_total = len(tokens)
    sentence_total = len(sent_tokenize(text))

    if not (sentence_total and word_total):
        return 0

    syllable_total = 0
    for token in tokens:
        syllable_total += count_syllables(token)

    words_per_sentence = word_total / sentence_total
    syllables_per_word = syllable_total / word_total
    return 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

def gunning_fog(text):
    """Compute ``0.4 * (words/sentences + 100 * complex_words/words)``.

    A word token is "complex" when ``count_syllables`` reports more than two
    syllables for it.

    Args:
        text: The document string.

    Returns:
        The score as a float, or ``0`` when the text has no words or no
        sentences.
    """
    tokens = word_tokenize(text)
    sentence_total = len(sent_tokenize(text))
    word_total = len(tokens)

    if sentence_total == 0 or word_total == 0:
        return 0

    hard_word_total = sum(1 for token in tokens if count_syllables(token) > 2)
    return 0.4 * ((word_total / sentence_total) + 100 * (hard_word_total / word_total))

def smog_index(text):
    """Compute ``1.0430 * sqrt(polysyllables * 30 / sentences) + 3.1291``.

    Polysyllables are word tokens for which ``count_syllables`` reports more
    than two syllables.

    Args:
        text: The document string.

    Returns:
        The score as a float, or ``0`` when the text has fewer than 30
        sentences.
    """
    sentence_total = len(sent_tokenize(text))
    if sentence_total < 30:
        return 0

    polysyllable_total = sum(
        1 for token in word_tokenize(text) if count_syllables(token) > 2
    )
    return 1.0430 * math.sqrt(polysyllable_total * (30 / sentence_total)) + 3.1291

def calculate_readability_scores(text):
    """Return the three readability scores for ``text`` as one tuple.

    Args:
        text: The document string.

    Returns:
        ``(flesch_kincaid_grade, gunning_fog, smog_index)`` for ``text``, in
        that order.
    """
    return (
        flesch_kincaid_grade(text),
        gunning_fog(text),
        smog_index(text),
    )

# Score every document once, then unzip the (fk, gf, smog) tuples into columns.
_score_tuples = readabilityScore_df['text'].apply(calculate_readability_scores)
_fk_scores, _gf_scores, _smog_scores = zip(*_score_tuples)
readabilityScore_df['flesch_kincaid_score'] = _fk_scores
readabilityScore_df['gunning_fog_score'] = _gf_scores
readabilityScore_df['smog_index_score'] = _smog_scores



In [None]:
readabilityScore_df.head()
### smog_index_score is 0 because each row has fewer than 30 sentences, so smog_index() takes its early-return branch.

Unnamed: 0,text,label,flesch_kincaid_score,gunning_fog_score,smog_index_score
0,windows aligned structural spars aircraft fuse...,0,39.775821,42.322388,0
1,tried bacon really fucking good widely acknowl...,0,21.597391,23.113043,0
2,opinion common nowadays two three decades ago ...,0,45.540256,49.661538,0
3,walk cop tell take ecstasy every day prove thi...,0,43.805,47.927273,0
4,compressor ac nt burn fuel driven car engine v...,0,67.532667,70.666667,0


In [None]:
# Persist the readability features without the row index.
output_path = '/content/final_df_readabilityScore.csv'
readabilityScore_df.to_csv(path_or_buf=output_path, index=False)

In [None]:
############################################################
############################################################
#DATASET PREPARATION#
############################################################
############################################################

In [None]:
# Persist the preprocessed base frame without the row index.
output_path = '/content/df.csv'
df.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Reload the four CSVs written above: base frame, perplexity, burstiness and
# readability features, in that order.
df1, df2, df3, df4 = map(
    pd.read_csv,
    (
        "/content/df.csv",
        "/content/final_df_perplexity.csv",
        "/content/final_df_burstiness.csv",
        "/content/final_df_readabilityScore.csv",
    ),
)

In [None]:
df1.head()

Unnamed: 0.1,Unnamed: 0,text,avg line length,word density,label
0,0,windows aligned structural spars aircraft fuse...,26.333333,66.455696,0
1,1,tried bacon really fucking good widely acknowl...,13.8,68.115942,0
2,2,opinion common nowadays two three decades ago ...,25.222222,55.947137,0
3,3,walk cop tell take ecstasy every day prove thi...,22.454545,56.680162,0
4,4,compressor ac nt burn fuel driven car engine v...,23.0,44.927536,0


In [None]:
df2.head()

Unnamed: 0,text,mean_perplexity,median_perplexity,std_perplexity
0,windows aligned structural spars aircraft fuse...,33594.31767,33594.31767,0.0
1,tried bacon really fucking good widely acknowl...,28637.472335,28637.472335,0.0
2,opinion common nowadays two three decades ago ...,47428.78208,47428.78208,0.0
3,walk cop tell take ecstasy every day prove thi...,18049.460322,18049.460322,0.0
4,compressor ac nt burn fuel driven car engine v...,20317.841977,20317.841977,0.0


In [None]:
df3.head()

Unnamed: 0,text,label,burstiness1,burstiness2
0,windows aligned structural spars aircraft fuse...,0,0.871553,0.261974
1,tried bacon really fucking good widely acknowl...,0,0.578668,0.172023
2,opinion common nowadays two three decades ago ...,0,0.874788,0.162393
3,walk cop tell take ecstasy every day prove thi...,0,0.886011,0.118802
4,compressor ac nt burn fuel driven car engine v...,0,0.939385,0.917867


In [None]:
df4.head()

Unnamed: 0,text,label,flesch_kincaid_score,gunning_fog_score,smog_index_score
0,windows aligned structural spars aircraft fuse...,0,39.775821,42.322388,0
1,tried bacon really fucking good widely acknowl...,0,21.597391,23.113043,0
2,opinion common nowadays two three decades ago ...,0,45.540256,49.661538,0
3,walk cop tell take ecstasy every day prove thi...,0,43.805,47.927273,0
4,compressor ac nt burn fuel driven car engine v...,0,67.532667,70.666667,0


In [None]:
# Pick the feature columns contributed by each reloaded frame.
columns_from_df1 = df1.loc[:, ['text', 'avg line length', 'word density']]
columns_from_df2 = df2.loc[:, ['mean_perplexity']]
columns_from_df3 = df3.loc[:, ['burstiness1', 'burstiness2']]
columns_from_df4 = df4.loc[:, ['flesch_kincaid_score', 'gunning_fog_score']]
label_column = df1.loc[:, ['label']]

# Column-wise concat aligns the pieces on their row index; smog_index_score is
# deliberately not selected, and the label goes last.
_model_parts = [
    columns_from_df1,
    columns_from_df2,
    columns_from_df3,
    columns_from_df4,
    label_column,
]
correct_model_df = pd.concat(_model_parts, axis='columns')

output_path = '/content/correct_model_df.csv'
correct_model_df.to_csv(path_or_buf=output_path, index=False)


In [None]:
correct_model_df.head()

Unnamed: 0,text,avg line length,word density,mean_perplexity,burstiness1,burstiness2,flesch_kincaid_score,gunning_fog_score,label
0,windows aligned structural spars aircraft fuse...,26.333333,66.455696,33594.31767,0.871553,0.261974,39.775821,42.322388,0
1,tried bacon really fucking good widely acknowl...,13.8,68.115942,28637.472335,0.578668,0.172023,21.597391,23.113043,0
2,opinion common nowadays two three decades ago ...,25.222222,55.947137,47428.78208,0.874788,0.162393,45.540256,49.661538,0
3,walk cop tell take ecstasy every day prove thi...,22.454545,56.680162,18049.460322,0.886011,0.118802,43.805,47.927273,0
4,compressor ac nt burn fuel driven car engine v...,23.0,44.927536,20317.841977,0.939385,0.917867,67.532667,70.666667,0


In [None]:
from google.colab import files

# Write a copy into the working directory and hand it to the Colab download helper.
_download_name = 'final_df_perplexity.csv'
final_df_perplexity.to_csv(_download_name, index=False)
files.download(_download_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Write a copy into the working directory and hand it to the Colab download helper.
_download_name = 'final_df_burstiness.csv'
burstiness_df.to_csv(_download_name, index=False)
files.download(_download_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Write a copy into the working directory and hand it to the Colab download helper.
_download_name = 'final_df_readabilityScore.csv'
readabilityScore_df.to_csv(_download_name, index=False)
files.download(_download_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>