In [4]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Load data
train_data = pd.read_csv("train_essays.csv")
test_data = pd.read_csv("test_essays.csv")

# Custom preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization, stopwords removal, and other steps are already covered by TfidfVectorizer
    
    return text

# Split the data
X_train, X_val, y_train, y_val = train_test_split(train_data['text'], train_data['generated'], test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorizer and Support Vector Machine classifier
model = make_pipeline(
    TfidfVectorizer(preprocessor=preprocess_text),
    SVC(probability=True, random_state=42)
)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'tfidfvectorizer__max_features': [5000, 10000, None],
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Validate the model
y_val_pred = grid_search.predict(X_val)
print(classification_report(y_val, y_val_pred))

# Make predictions on the test set
test_predictions = grid_search.predict_proba(test_data['text'])[:, 1]

# Create a submission file
submission = pd.DataFrame({'id': test_data['id'], 'generated': test_predictions})
submission.to_csv("submission.csv", index=False)




Best Hyperparameters: {'svc__C': 0.1, 'svc__kernel': 'linear', 'tfidfvectorizer__max_features': 5000}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.00      0.00      0.00         1

    accuracy                           1.00       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      1.00      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
final=pd.read_csv('submission.csv')
final.head()

Unnamed: 0,id,generated
0,0000aaaa,0.012896
1,1111bbbb,0.012896
2,2222cccc,0.012896


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Load data
train_data = pd.read_csv("train_essays.csv")
test_data = pd.read_csv("test_essays.csv")

# Check the structure of the training data
print(train_data.info())

# Verify the column names
print(train_data.columns)

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization, stopwords removal, and other steps are already covered by TfidfVectorizer
    
    return text

# Check if 'generated' column is present in the training data
if 'generated' not in train_data.columns:
    raise KeyError("The 'generated' column is not present in the training data.")

# Split the data
X_train, X_val, y_train, y_val = train_test_split(train_data['text'], train_data['generated'], test_size=0.2, random_state=42)

# Rest of the code...
# Rest of the code...

# Create a pipeline with TF-IDF vectorizer and Support Vector Machine classifier
model = make_pipeline(
    TfidfVectorizer(preprocessor=preprocess_text),
    SVC(probability=True, random_state=42)
)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'tfidfvectorizer__max_features': [5000, 10000, None],
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Validate the model
y_val_pred = grid_search.predict(X_val)
print(classification_report(y_val, y_val_pred))

# Make predictions on the test set
test_predictions = grid_search.predict_proba(test_data['text'])[:, 1]

# Create a submission file
submission = pd.DataFrame({'id': test_data['id'], 'generated': test_predictions})
submission.to_csv("submission.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1378 non-null   object
 1   prompt_id  1378 non-null   int64 
 2   text       1378 non-null   object
 3   generated  1378 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 43.2+ KB
None
Index(['id', 'prompt_id', 'text', 'generated'], dtype='object')




Best Hyperparameters: {'svc__C': 0.1, 'svc__kernel': 'linear', 'tfidfvectorizer__max_features': 5000}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.00      0.00      0.00         1

    accuracy                           1.00       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      1.00      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.metrics import accuracy_score

# Assuming y_val_pred and y_val are the predicted and true labels for the validation set
accuracy = accuracy_score(y_val, y_val_pred)

print("Accuracy:", accuracy)


Accuracy: 0.9963768115942029


In [8]:
final=pd.read_csv('submission.csv')
final.head()

Unnamed: 0,id,generated
0,0000aaaa,0.012896
1,1111bbbb,0.012896
2,2222cccc,0.012896
