In [34]:
# Step 1: Data Preprocessing
import pandas as pd

# Load the datasets
train_essays = pd.read_csv('train_essays.csv')
test_essays = pd.read_csv('test_essays.csv')
train_prompts = pd.read_csv('train_prompts.csv')

# Merge training essays and prompts based on prompt_id
train_data = train_essays.merge(train_prompts, on='prompt_id')

# Check for missing data
train_data.isnull().sum()  # Check for missing values in the merged dataset
train_data.dropna(subset=['text'], inplace=True)  # Remove rows with missing text

In [35]:
train_data.head()

Unnamed: 0,id,prompt_id,text,generated,prompt_name,instructions,source_text
0,0059830c,0,Cars. Cars have been around since they became ...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,005db917,0,Transportation is a large necessity in most co...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
3,00940276,0,How often do you ride in a car? Do you drive a...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."


In [36]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')

In [37]:
tokenizer_ex = tokenizer(["Cars. Cars have been around since they became",
                         "what is your name"],
                         max_length=1000,padding=True,truncation=True)

In [38]:
print(tokenizer_ex.input_ids)

[[101, 3765, 1012, 3765, 2031, 2042, 2105, 2144, 2027, 2150, 102], [101, 2054, 2003, 2115, 2171, 102, 0, 0, 0, 0, 0]]


In [39]:
def tokenize(data):
    tokens_input = tokenizer(data,max_length=1000,padding=True,truncation=True)
    return tokens_input.input_ids

In [40]:
train_data['text_tokenize'] = train_data['text'].apply(tokenize)

In [41]:
train_data['prompt_name_tokenize'] = train_data['prompt_name'].apply(tokenize)

In [42]:
train_data['instructions_tokenize'] = train_data['instructions'].apply(tokenize)

In [43]:
train_data['source_text_tokenize'] = train_data['source_text'].apply(tokenize)

In [44]:
train_data.head()

Unnamed: 0,id,prompt_id,text,generated,prompt_name,instructions,source_text,text_tokenize,prompt_name_tokenize,instructions_tokenize,source_text_tokenize
0,0059830c,0,Cars. Cars have been around since they became ...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ...","[101, 3765, 1012, 3765, 2031, 2042, 2105, 2144...","[101, 2482, 1011, 2489, 3655, 102]","[101, 4339, 2019, 4654, 24759, 5162, 7062, 949...","[101, 1001, 1999, 2446, 7575, 1010, 2166, 3632..."
1,005db917,0,Transportation is a large necessity in most co...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ...","[101, 5193, 2003, 1037, 2312, 13185, 1999, 208...","[101, 2482, 1011, 2489, 3655, 102]","[101, 4339, 2019, 4654, 24759, 5162, 7062, 949...","[101, 1001, 1999, 2446, 7575, 1010, 2166, 3632..."
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ...","[101, 1000, 2637, 1005, 1055, 2293, 6771, 2007...","[101, 2482, 1011, 2489, 3655, 102]","[101, 4339, 2019, 4654, 24759, 5162, 7062, 949...","[101, 1001, 1999, 2446, 7575, 1010, 2166, 3632..."
3,00940276,0,How often do you ride in a car? Do you drive a...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ...","[101, 2129, 2411, 2079, 2017, 4536, 1999, 1037...","[101, 2482, 1011, 2489, 3655, 102]","[101, 4339, 2019, 4654, 24759, 5162, 7062, 949...","[101, 1001, 1999, 2446, 7575, 1010, 2166, 3632..."
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ...","[101, 3765, 2024, 1037, 6919, 2518, 1012, 2027...","[101, 2482, 1011, 2489, 3655, 102]","[101, 4339, 2019, 4654, 24759, 5162, 7062, 949...","[101, 1001, 1999, 2446, 7575, 1010, 2166, 3632..."


In [45]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     1378 non-null   object
 1   prompt_id              1378 non-null   int64 
 2   text                   1378 non-null   object
 3   generated              1378 non-null   int64 
 4   prompt_name            1378 non-null   object
 5   instructions           1378 non-null   object
 6   source_text            1378 non-null   object
 7   text_tokenize          1378 non-null   object
 8   prompt_name_tokenize   1378 non-null   object
 9   instructions_tokenize  1378 non-null   object
 10  source_text_tokenize   1378 non-null   object
dtypes: int64(2), object(9)
memory usage: 118.5+ KB


In [46]:
train_data['generated'].value_counts()

generated
0    1375
1       3
Name: count, dtype: int64

In [47]:
train_data['id'] = 

SyntaxError: invalid syntax (2698201068.py, line 1)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report,confusion_matrix
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
import xgboost as xgb

In [29]:
dropped_cols = ['text','generated','prompt_name','instructions','source_text']

In [30]:
X = train_data.drop(columns=dropped_cols,axis=0)
y = train_data['generated']

In [31]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=45)

In [32]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': xgb.XGBClassifier()
}

In [48]:
ss = StandardScaler()
X_train_scaled = ss.fit_transform(x_train)

NameError: name 'StandardScaler' is not defined

In [33]:
for model_name, model in models.items():
    # Fit the model
    model.fit(x_train, y_train)

    # Make predictions
    y_pred = model.predict(x_test)
    y_prob = model.predict_proba(x_test)[:,1]

    cr = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Create a PrettyTable for the classification report
    report_table = PrettyTable()
    report_table.field_names = ['Class', 'Precision', 'Recall', 'F1-Score', 'Support']
    for class_name, metrics in cr.items():
        if class_name.isdigit():  # Skip non-numeric class names (if any)
            report_table.add_row([class_name,
                                  metrics['precision'],
                                  metrics['recall'],
                                  metrics['f1-score'],
                                  metrics['support']])

    # Print results
    print(f"Model: {model_name}")
    print(f"Confusion matrix:\n{cm}")
    print("-" * 40)
    print("Classification report:")
    print(report_table)
    print("-" * 40)
    print(f"ROC AUC: {roc_auc:.2f}")
    print(f"PR AUC: {pr_auc:.2f}")
    print("-" * 40)
    print(f"Accuracy :{accuracy}")
    print(f"Precision :{precision}")
    print(f"recall :{recall}")
    print("-" * 40)

ValueError: could not convert string to float: '945de5f3'

In [13]:
# # Transform test data using the same TF-IDF vectorizer
# X_test = tfidf_vectorizer.transform(test_essays['text'])

# # Use the trained model to predict on the test data
# test_predictions = rf_classifier.predict(X_test)

# # Prepare for submission
# submission = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})

# # Save the submission to a CSV file
# submission.to_csv('essay_predictions.csv', index=False)