In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import Word2Vec
import tensorflow as tf
from sklearn.model_selection import train_test_split


ModuleNotFoundError: No module named 'tensorflow'

In [8]:
# Load the final.csv file into a pandas data frame
df =  pd.read_csv('final.csv', index_col=0)
df = df.sample(n=3000, random_state=42)

In [9]:
# Preprocess the text data
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
df['processed_text'] = df['text'].apply(lambda x: ' '.join([word.lower() for word in tokenizer.tokenize(x) if word.lower() not in stop_words]))


In [10]:

# Create a bag-of-words representation of the tokenized text data
vectorizer = CountVectorizer()
doc_term_matrix = vectorizer.fit_transform(df['processed_text'])


In [11]:
# Apply LDA to extract topics and their associated keywords
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_model.fit(doc_term_matrix)
for index, topic in enumerate(lda_model.components_):
    print(f'Top 10 words for topic {index}')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])


Top 10 words for topic 0


AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [None]:

# Assign a genre to each book based on the most frequent topic assigned to the book
df['genre'] = df.apply(lambda x: lda_model.transform(vectorizer.transform([x['processed_text']])).argmax(), axis=1)


In [None]:
# Create word embeddings of the preprocessed text data using Word2Vec
sentences = [tokenizer.tokenize(text.lower()) for text in df['text']]
word2vec_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)


In [None]:

# Train a machine learning model to classify the books into their respective authorship groups based on their word embeddings
X = [word2vec_model.wv[word] for text in df['processed_text'] for word in tokenizer.tokenize(text.lower()) if word in word2vec_model.wv]
y = df['author_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(100,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(df['author_name'].unique()), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


In [1]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)


ModuleNotFoundError: No module named 'tensorflow'