In [15]:
import re
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
import pandas as pd
import pickle

# Load your saved model and tokenizer
model = load_model('text_classification_model.h5')

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Function to split text into smaller segments
def split_text_into_segments(text, segment_size):
    words = text.split()
    segments = [' '.join(words[i:i + segment_size]) for i in range(0, len(words), segment_size)]
    return segments

# Function to detect AI-generated segments in the text
def detect_ai_segments(text, model, tokenizer, max_length=100, threshold=0.5):
    detected_segments = []
    text_segments = split_text_into_segments(text, max_length)
    
    for segment in text_segments:
        preprocessed_text = preprocess_text(segment)
        sequence = tokenizer.texts_to_sequences([preprocessed_text])
        padded_sequence = pad_sequences(sequence, maxlen=max_length)
        prediction = model.predict(padded_sequence)[0][0]
        if prediction > threshold:
            detected_segments.append((segment, prediction))
    
    return detected_segments

# Function to get user feedback and update the dataset
def get_user_feedback_and_update(segment, prediction):
    user_feedback = input(f"Was the following segment AI-generated? (yes/no)\n'{segment}'\nPrediction: {prediction}\n")
    label = 1 if user_feedback.lower() == 'yes' else 0
    return {'text': segment, 'label': label}

# Function to retrain the model with the updated dataset
def retrain_model(data, tokenizer):
    max_length = 100
    tokenizer.fit_on_texts(data['text'])
    X = tokenizer.texts_to_sequences(data['text'])
    X = pad_sequences(X, maxlen=max_length)
    
    y = data['label']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define the model
    embedding_dim = 100
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_length, trainable=True))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
    
    # Save the updated model
    model.save('text_classification_model.h5')

    # Save the updated tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle)

# Load the initial dataset (assuming you have a CSV file with initial data)
data = pd.read_csv('initial_dataset.csv')

# Example long text
input_text = """
Artificial intelligence has made significant strides in recent years.
Generated by AI: The rapid advancement of technology is reshaping industries.
Innovation in technology has transformed our daily lives, making tasks easier and more efficient.
AI text: The robot moved quickly, navigating through the obstacles with ease.
AI output: The software program executed flawlessly, completing tasks efficiently.
Human-written: Music is a universal language that transcends borders.
Exercise is crucial for maintaining physical and mental health.
The internet has revolutionized communication and access to information.
Education is a powerful tool for personal and societal growth.
Traveling allows us to experience new cultures and perspectives.
"""

# Detect AI-generated segments
detected_segments = detect_ai_segments(input_text, model, tokenizer)

# Print detected segments and get user feedback
new_data = []

for segment, prediction in detected_segments:
    feedback = get_user_feedback_and_update(segment, prediction)
    new_data.append(feedback)

# Update the dataset with new feedback
new_data_df = pd.DataFrame(new_data)
data = pd.concat([data, new_data_df], ignore_index=True)

# Save the updated dataset
data.to_csv('updated_dataset.csv', index=False)

# Retrain the model with the updated dataset
retrain_model(data, tokenizer)




Was the following segment AI-generated? (yes/no)
'Artificial intelligence has made significant strides in recent years. Generated by AI: The rapid advancement of technology is reshaping industries. Innovation in technology has transformed our daily lives, making tasks easier and more efficient. AI text: The robot moved quickly, navigating through the obstacles with ease. AI output: The software program executed flawlessly, completing tasks efficiently. Human-written: Music is a universal language that transcends borders. Exercise is crucial for maintaining physical and mental health. The internet has revolutionized communication and access to information. Education is a powerful tool for personal and societal growth. Traveling allows us to experience new cultures'
Prediction: 0.5414994359016418
 yes


NameError: name 'train_test_split' is not defined