In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [2]:
# Load the three competition CSVs in order: the labelled training essays,
# the prompt metadata they were written for, and the unlabelled test essays.
df1, df2, df3 = (
    pd.read_csv(csv_name)
    for csv_name in ('train_essays.csv', 'train_prompts.csv', 'test_essays.csv')
)

In [6]:
# Print the column index of each loaded frame, in load order
# (training essays, prompts, test essays).
for frame in (df1, df2, df3):
    print(frame.columns)

Index(['id', 'prompt_id', 'text', 'generated'], dtype='object')
Index(['prompt_id', 'prompt_name', 'instructions', 'source_text'], dtype='object')
Index(['id', 'prompt_id', 'text'], dtype='object')


In [7]:

# Goal: build a classifier that tells essays written by middle and high school students apart from essays generated by a large language model (LLM). The plan is:

# Data Preprocessing:

# Merge the train_essays and train_prompts datasets on their shared prompt_id column to create a single training dataset.
# Prepare the data by handling missing values, text cleaning, and any other necessary preprocessing steps.
# Feature Engineering:

# Extract relevant features from the essay text and prompt instructions that can be used for classification. You can consider techniques like TF-IDF, Word Embeddings, or other NLP feature extraction methods.
# Model Selection:

# Choose a suitable machine learning model for classification. Logistic Regression, Random Forest, or Support Vector Machine can be good starting points.
# You may also explore more advanced models such as neural networks (e.g., LSTM or BERT-based models) if you have a large dataset.
# Training and Evaluation:

# Split your training data into training and validation sets for model training and hyperparameter tuning.
# Train the selected model on the training data.
# Evaluate the model's performance on the validation set using appropriate evaluation metrics (e.g., accuracy, precision, recall, F1-score).
# Hyperparameter Tuning:

# Perform hyperparameter tuning to optimize the model's performance. You can use techniques like grid search or random search.
# Test Data Prediction:

# Load the test_essays.csv dataset.
# Preprocess the test data using the same preprocessing steps applied to the training data.
# Use the trained model to predict the labels for the test essays (0 for students, 1 for LLM-generated).
# Submission:

# Prepare a submission file in the required format (e.g., a CSV file with id and generated columns).
# Submit your predictions to the Kaggle competition.


In [9]:
# Attach each essay's prompt metadata by joining on the shared prompt_id.
# The inner join keeps only essays whose prompt_id also appears in the prompts table.
train_data = df1.merge(df2, how='inner', on='prompt_id')

In [10]:
# Display the merged essays + prompts frame (the notebook renders a truncated preview).
train_data

Unnamed: 0,id,prompt_id,text,generated,prompt_name,instructions,source_text
0,0059830c,0,Cars. Cars have been around since they became ...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,005db917,0,Transportation is a large necessity in most co...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
3,00940276,0,How often do you ride in a car? Do you drive a...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
...,...,...,...,...,...,...,...
1373,fc66f374,1,The Electoral College was originally establish...,0,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...
1374,fcb87d59,1,"Dear senator, I think that the presidential el...",0,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...
1375,fcd93e2d,1,The electoral college is a group of electors t...,0,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...
1376,fcfe84cb,1,An electoral College compromises between elect...,0,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...


In [11]:
# Count missing values per column to check whether any imputation/cleaning is needed.
train_data.isna().sum()

id              0
prompt_id       0
text            0
generated       0
prompt_name     0
instructions    0
source_text     0
dtype: int64

In [12]:
# Summarise the merged frame: row count, per-column non-null counts, dtypes and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1378 non-null   object
 1   prompt_id     1378 non-null   int64 
 2   text          1378 non-null   object
 3   generated     1378 non-null   int64 
 4   prompt_name   1378 non-null   object
 5   instructions  1378 non-null   object
 6   source_text   1378 non-null   object
dtypes: int64(2), object(5)
memory usage: 75.5+ KB


In [13]:
# List the column names available after the merge.
train_data.columns

Index(['id', 'prompt_id', 'text', 'generated', 'prompt_name', 'instructions',
       'source_text'],
      dtype='object')

In [14]:
# Preview the first five rows of the merged training frame.
train_data.head()

Unnamed: 0,id,prompt_id,text,generated,prompt_name,instructions,source_text
0,0059830c,0,Cars. Cars have been around since they became ...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,005db917,0,Transportation is a large necessity in most co...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
3,00940276,0,How often do you ride in a car? Do you drive a...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."


In [15]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string

In [16]:
# Show the ASCII punctuation characters defined by the standard library's `string` module.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [18]:
# `stopwords` is an NLTK corpus reader, not a list: it has no `.stopwords`
# attribute (the original cell raised AttributeError). The word list for a
# language is obtained through its `.words(<language>)` method.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    # The stopwords corpus is not installed locally yet; fetch it once and retry.
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

print(english_stopwords)