# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from textstat import flesch_reading_ease
import textstat

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score, recall_score

  from pandas.core import (


In [2]:
#df=pd.read_csv(r'train_essays.csv')

In [3]:
#df['generated'].value_counts()

# Loading the dataset

In [4]:
df=pd.read_csv(r'merged_essays.csv')

In [5]:
df

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1483,106,1,The American electoral system is often critici...,1
1484,107,1,"The electoral college, with its red and blue s...",1
1485,108,1,The American political landscape is often pain...,1
1486,109,1,"The American heartland, vast and diverse, is o...",1


In [6]:
df['generated'].value_counts()

generated
0    1375
1     113
Name: count, dtype: int64

Here, we're using merged_essays because it has more number of AI generated essays which makes the model not biased

In [7]:
# Creating a new column indicating human or AI
df['source'] = np.where(df['generated'] == 0, 'human', 'AI')

# Select a random sample of 113 human-generated texts
human_texts = df[df['source'] == 'human'].sample(n=113, random_state=42)

# Select all AI-generated texts
AI_texts = df[df['source'] == 'AI']

# Concatenate the selected human-generated texts with all AI-generated texts
matched_df = pd.concat([human_texts, AI_texts])

# Shuffle the DataFrame to mix human and AI texts
matched_df = matched_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Now matched_df contains an equal number of human and AI generated texts


In [8]:
matched_df

Unnamed: 0,id,prompt_id,text,generated,source
0,25939bac,1,"Greetings Mr. State Senator, Looking over the ...",0,human
1,69,1,The electoral college is a product of compromi...,1,AI
2,5,0,Car-free cities represent more than a transpor...,1,AI
3,92,0,Cities are often the epicenters of environment...,1,AI
4,33,0,Traffic congestion is a ubiquitous problem in ...,1,AI
...,...,...,...,...,...
221,6b3d3d11,0,The advantages of limiting car usage would be ...,0,human
222,daf3cd80,0,The debate on the pros and cons of car usage h...,0,human
223,0a13d187,0,Have you ever wondered what will happen if we ...,0,human
224,64,1,The American political landscape is often pain...,1,AI


# Text Preprocessing

In [9]:
#Lemmatization
wc=WordNetLemmatizer()
corpus=[]
new_sw=[]
#preprocessing all the data
for i in range(0,226):
    review=matched_df['text'][i]
    review=re.sub('[^a-zA-Z]'," ",review)
    review=review.lower()
    #tokenization
    review=review.split()
    #removing stop words and applying stemming
    d=[]
    for word in review:
        if(word not in new_sw):
            d.append(wc.lemmatize(word))
    review=" ".join(d)
#all the preprocessed review is saved in corpus
    corpus.append(review)

In [10]:
corpus

['greeting mr state senator looking over the electoral college there ha come to be a few interesting detail yes the process is beautifully thought out but a we look at it deeper it ha it flaw the founding father have given u the electoral college which anarchist are very unappreciative veiwing it a a nondemocratic way to vote if the electoral college system go then american can have their full constitutional right using this process voter are not voting for the elector of their choice voter are voting on higher qualified voter to vote for the elector the whole process is a violator of the amendment entitled to voting having this maze of voting for voter what if a voter confuses of who the elector is voting for so their vote isint actually going to the candidate of their choice in before the election of john f kennedy the segregationist of the louisiana legislator came very close to replacing all of the democratic elector with elector who opposed john f kennedy so all the voter in favor

# Feature Engineering

To differentiate between human-written and AI-generated essays: we're performing feature engineering.

Feature engineering involves creating new features from the existing data to improve the performance of machine learning models. Here are some feature engineering ideas we're implementing on our dataset: Text Length, Lexical Diversity, Flesch Reading Ease.


Text Length (text_length):  

Text length refers to the number of words or characters in a piece of text.  
In our dataset, the text_length column contains the length of each essay in terms of the number of words.


Lexical Diversity (lexical_diversity): 

Lexical diversity measures the variety of unique words used in a piece of text relative to the total number of words.  
It is calculated as the ratio of the number of unique words to the total number of words.  
A higher lexical diversity score indicates a greater variety of words used in the text.     
In our dataset, the lexical_diversity column contains the lexical diversity score for each essay.  


Flesch Reading Ease (flesch_reading_ease):  

Flesch Reading Ease is a readability metric that estimates how easy or difficult it is to read a piece of text.  
It is calculated based on the average sentence length and the average number of syllables per word.  
Higher Flesch Reading Ease scores indicate easier-to-read text, while lower scores indicate more difficult text.   Scores typically range from 0 to 100, with higher scores indicating better readability.   
In our dataset, the flesch_reading_ease column contains the Flesch Reading Ease score for each essay.  


Understanding these features can provide insights into the characteristics of the essays in our dataset, such as their length, complexity, and readability.   


# Code:

In [11]:
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# from textstat import flesch_reading_ease
# import textstat

# Feature Engineering: Text Length
matched_df['text_length'] = matched_df['text'].apply(lambda x: len(x))

# Feature Engineering: Lexical Diversity
def calculate_lexical_diversity(text):
    tokens = word_tokenize(text.lower())
    unique_tokens = set(tokens)
    return len(unique_tokens) / len(tokens) if len(tokens) > 0 else 0

matched_df['lexical_diversity'] = matched_df['text'].apply(calculate_lexical_diversity)

# Feature Engineering: Readability Metrics
matched_df['flesch_reading_ease'] = matched_df['text'].apply(lambda x: flesch_reading_ease(x))


In [12]:
matched_df['text_length']

0      2311
1      1274
2      2072
3      1596
4       542
       ... 
221    2839
222    1795
223    2695
224    1956
225    1492
Name: text_length, Length: 226, dtype: int64

In [13]:
matched_df['lexical_diversity']

0      0.431871
1      0.623810
2      0.475884
3      0.629771
4      0.733333
         ...   
221    0.471002
222    0.473538
223    0.402174
224    0.522523
225    0.496689
Name: lexical_diversity, Length: 226, dtype: float64

In [14]:
matched_df['flesch_reading_ease']

0      60.04
1      30.40
2      24.17
3      38.21
4      34.46
       ...  
221    66.78
222    67.15
223    69.52
224    44.44
225    79.19
Name: flesch_reading_ease, Length: 226, dtype: float64

In [15]:
# Create new DataFrame with selected columns
new_df = matched_df[['text', 'generated', 'source', 'text_length', 'lexical_diversity', 'flesch_reading_ease']]

In [16]:
new_df

Unnamed: 0,text,generated,source,text_length,lexical_diversity,flesch_reading_ease
0,"Greetings Mr. State Senator, Looking over the ...",0,human,2311,0.431871,60.04
1,The electoral college is a product of compromi...,1,AI,1274,0.623810,30.40
2,Car-free cities represent more than a transpor...,1,AI,2072,0.475884,24.17
3,Cities are often the epicenters of environment...,1,AI,1596,0.629771,38.21
4,Traffic congestion is a ubiquitous problem in ...,1,AI,542,0.733333,34.46
...,...,...,...,...,...,...
221,The advantages of limiting car usage would be ...,0,human,2839,0.471002,66.78
222,The debate on the pros and cons of car usage h...,0,human,1795,0.473538,67.15
223,Have you ever wondered what will happen if we ...,0,human,2695,0.402174,69.52
224,The American political landscape is often pain...,1,AI,1956,0.522523,44.44


# Understanding source & text_length

In [17]:
# Filter AI-generated text with lengths less than 500, 1000 and greater than 1000
ai_less_than_1000 = new_df[(new_df['source'] == 'AI') & (new_df['text_length'] < 1000)].shape[0]
ai_less_than_500 = new_df[(new_df['source'] == 'AI') & (new_df['text_length'] < 500)].shape[0]
ai_greater_than_1000 = new_df[(new_df['source'] == 'AI') & (new_df['text_length'] > 1000)].shape[0]

# Filter human text with lengths less than 500, 1000 and greater than 1000
human_less_than_1000 = new_df[(new_df['source'] == 'human') & (new_df['text_length'] < 1000)].shape[0]
human_less_than_500 = new_df[(new_df['source'] == 'human') & (new_df['text_length'] < 500)].shape[0]
human_greater_than_1000 = new_df[(new_df['source'] == 'human') & (new_df['text_length'] > 1000)].shape[0]

print("AI-generated text with length < 500:", ai_less_than_500)
print("AI-generated text with length < 1000:", ai_less_than_1000)
print("AI-generated text with length > 1000:", ai_greater_than_1000)

print("Human text with length < 500:", human_less_than_500)
print("Human text with length < 1000:", human_less_than_1000)
print("Human text with length > 1000:", human_greater_than_1000)


AI-generated text with length < 500: 2
AI-generated text with length < 1000: 22
AI-generated text with length > 1000: 91
Human text with length < 500: 0
Human text with length < 1000: 0
Human text with length > 1000: 113


# Observations from text_length

AI-generated Text:  

There are 22 instances of AI-generated text with lengths less than 1000 words, indicating that a significant portion of the AI-generated content falls within this range.  
Furthermore, there are 91 instances of AI-generated text with lengths greater than 1000 words, suggesting that a substantial number of AI-generated essays are longer and potentially more detailed or complex.

Human Text:  

Interestingly, there are no instances of human-generated text with lengths less than 500 or less than 1000 words. This could imply that human-generated essays tend to be longer and more substantial compared to AI-generated essays in this dataset.  
Additionally, there are 113 instances of human-generated text with lengths greater than 1000 words, indicating that longer essays are more prevalent among human-authored content.  



# Understanding source & lexical_diversity

In [18]:
# Filter human and AI data separately
human_df = new_df[new_df['source'] == 'human']
ai_df = new_df[new_df['source'] == 'AI']

# Calculate min, max, and mode (most frequent) lexical diversity for human
human_min_lexical_diversity = human_df['lexical_diversity'].min()
human_max_lexical_diversity = human_df['lexical_diversity'].max()
human_mode_lexical_diversity = human_df['lexical_diversity'].mode()[0]  # Get the first mode if multiple modes exist

# Calculate min, max, and mode (most frequent) lexical diversity for AI
ai_min_lexical_diversity = ai_df['lexical_diversity'].min()
ai_max_lexical_diversity = ai_df['lexical_diversity'].max()
ai_mode_lexical_diversity = ai_df['lexical_diversity'].mode()[0]  # Get the first mode if multiple modes exist

# Print the results
print("Human Lexical Diversity:")
print("Minimum:", human_min_lexical_diversity)
print("Maximum:", human_max_lexical_diversity)
print("Mode:", human_mode_lexical_diversity)

print("\nAI Lexical Diversity:")
print("Minimum:", ai_min_lexical_diversity)
print("Maximum:", ai_max_lexical_diversity)
print("Mode:", ai_mode_lexical_diversity)


Human Lexical Diversity:
Minimum: 0.27548543689320387
Maximum: 0.5044091710758377
Mode: 0.27548543689320387

AI Lexical Diversity:
Minimum: 0.3807829181494662
Maximum: 0.7738095238095238
Mode: 0.5


# Observations from lexical_diversity

Human Essays: 

The range of lexical diversity values for human essays is between approximately 0.2755 and 0.5044. This means that among human essays, there is a range of diversity in the vocabulary used. Some essays have a relatively low variety of unique words compared to the total number of words, while others have a higher variety.

AI Essays: 

On the other hand, the range of lexical diversity values for AI-generated essays is between approximately 0.3808 and 0.7738. This suggests that AI-generated essays tend to have a wider variation in the diversity of vocabulary compared to human essays. Some AI-generated essays have a relatively low variety of unique words, while others have a higher variety.

Considering the range of values and the fact that the maximum lexical diversity value for AI essays is higher than that for human essays, we can conclude that, on average, AI-generated essays tend to use a more diversified vocabulary compared to human-generated essays.

# Understanding source & flesch_reading_ease

In [19]:
# Defining the readability ranges
readability_ranges = {
    'Very Difficult': (0, 30),
    'Difficult': (30, 50),
    'Moderate': (50, 70),
    'Easy': (70, 90),
    'Very Easy': (90, 100)
}

# Filter AI and human data separately
ai_df = new_df[new_df['source'] == 'AI']
human_df = new_df[new_df['source'] == 'human']

# Function to classify readability based on Flesch Reading Ease score
def classify_readability(score):
    for category, (lower, upper) in readability_ranges.items():
        if lower <= score < upper:
            return category

# Apply readability classification to AI essays
ai_df['Readability'] = ai_df['flesch_reading_ease'].apply(classify_readability)

# Apply readability classification to human essays
human_df['Readability'] = human_df['flesch_reading_ease'].apply(classify_readability)

# Print counts of essays in each readability category for AI and human sources
print("AI Essays Readability:")
print(ai_df['Readability'].value_counts())

print("\nHuman Essays Readability:")
print(human_df['Readability'].value_counts())


AI Essays Readability:
Readability
Difficult         59
Very Difficult    40
Moderate          14
Name: count, dtype: int64

Human Essays Readability:
Readability
Moderate     71
Easy         38
Difficult     4
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_df['Readability'] = ai_df['flesch_reading_ease'].apply(classify_readability)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  human_df['Readability'] = human_df['flesch_reading_ease'].apply(classify_readability)


# Observations from flesch_reading_ease



In terms of readability, AI-generated essays tend to be more challenging, with a majority falling into the "Difficult" and "Very Difficult" categories. Conversely, human-generated essays are generally easier to read, with a larger proportion falling into the "Moderate" and "Easy" categories. This suggests that human-generated content may be more accessible and comprehensible to readers compared to AI-generated content.

# Models

# Naive Bayes

In [20]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score


# Feature Extraction (using TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the max_features parameter
X = tfidf_vectorizer.fit_transform(corpus)

# Splitting Data

y = matched_df['source']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Model Evaluation
y_pred_nb = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_nb)
print("Accuracy:", accuracy)

Accuracy: 0.9782608695652174


In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

          AI       1.00      0.95      0.98        22
       human       0.96      1.00      0.98        24

    accuracy                           0.98        46
   macro avg       0.98      0.98      0.98        46
weighted avg       0.98      0.98      0.98        46



# Logistic Regression

In [22]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(corpus, matched_df['source'], test_size=0.2, random_state=42)

# Vectorizing the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Training the Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train_tfidf, y_train)

# Predicting on the test set
y_pred_lr = logreg_model.predict(X_test_tfidf)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", accuracy)


Logistic Regression Accuracy: 0.9782608695652174


In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

          AI       1.00      0.95      0.98        22
       human       0.96      1.00      0.98        24

    accuracy                           0.98        46
   macro avg       0.98      0.98      0.98        46
weighted avg       0.98      0.98      0.98        46



# Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Training the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Predicting on the test set
y_pred_rf = rf_model.predict(X_test_tfidf)

# Calculating accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)



Random Forest Accuracy: 0.9782608695652174


In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

          AI       1.00      0.95      0.98        22
       human       0.96      1.00      0.98        24

    accuracy                           0.98        46
   macro avg       0.98      0.98      0.98        46
weighted avg       0.98      0.98      0.98        46



# Observations

The precision, recall, and F1-score values being the same across different models (logistic regression, random forest, naive Bayes) indicate consistent performance in classifying essays into AI and human categories. With high precision and recall values for both AI and human categories, it suggests that the models are effectively distinguishing between AI-generated and human-generated essays with minimal misclassification.