In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec

In [2]:
# Load the dataset
df1 = pd.read_excel('bng2eng2/train/ConscientiousnessTrain.xlsx')
df2 = pd.read_excel('bng2eng2/train/AgreeablenessTrain.xlsx')
df3 = pd.read_excel('bng2eng2/train/NeuroticismTrain.xlsx')
df4 = pd.read_excel('bng2eng2/train/ExtroversionTrain.xlsx')
df5 = pd.read_excel('bng2eng2/train/OpennessTrain.xlsx')
train_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
train_df = train_df.drop("status", axis='columns')

df6 = pd.read_excel('bng2eng2/test/ConscientiousnessTest.xlsx')
df7 = pd.read_excel('bng2eng2/test/AgreeablenessTest.xlsx')
df8 = pd.read_excel('bng2eng2/test/NeuroticismTest.xlsx')
df9 = pd.read_excel('bng2eng2/test/ExtroversionTest.xlsx')
df10 = pd.read_excel('bng2eng2/test/OpennessTest.xlsx')
test_df = pd.concat([df6, df7, df8, df9, df10], ignore_index=True)
test_df = test_df.drop("status", axis='columns')

# Data preprocessing
# Convert text to lowercase
train_df['status_text'] = train_df['status_text'].apply(lambda x: x.lower())
test_df['status_text'] = test_df['status_text'].apply(lambda y: str(y).lower())

# Remove stopwords
stop_words = set(stopwords.words('english'))
train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
test_df['status_text'] = test_df['status_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Tokenization
train_df['status_text'] = train_df['status_text'].apply(lambda x: word_tokenize(x))
test_df['status_text'] = test_df['status_text'].apply(lambda x: word_tokenize(x))

# Stemming
stemmer = PorterStemmer()
train_df['status_text'] = train_df['status_text'].apply(lambda x: [stemmer.stem(word) for word in x])
test_df['status_text'] = test_df['status_text'].apply(lambda x: [stemmer.stem(word) for word in x])

# Convert list of tokens back to text
train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join(x))
test_df['status_text'] = test_df['status_text'].apply(lambda x: ' '.join(x))

In [5]:
#----------------------------------------------------------------------------------------------------
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=train_df['status_text'], vector_size=100, window=5, min_count=1, sg=1)

# Function to convert a sentence to its vector representation using Word2Vec
def sentence_to_vec(sentence):
    word_vectors = [word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv]
    if not word_vectors:
        # Return zero vector if no words are present in the Word2Vec model
        return np.zeros(word2vec_model.vector_size)
    sentence_vector = np.mean(word_vectors, axis=0)
    return sentence_vector

# Convert training and testing data to Word2Vec vectors
X_train_word2vec = np.array([sentence_to_vec(sentence) for sentence in train_df['status_text']])
X_test_word2vec = np.array([sentence_to_vec(sentence) for sentence in test_df['status_text']])

# # Build a Multinomial Naive Bayes model using Word2Vec features
# mnb_word2vec_model = MultinomialNB()
# mnb_word2vec_model.fit(X_train_word2vec, train_df['label'])

# # Evaluate the model on the testing set using Word2Vec features
# y_pred_word2vec = mnb_word2vec_model.predict(X_test_word2vec)
# accuracy_word2vec = accuracy_score(test_df['label'], y_pred_word2vec)
# print("Accuracy using Word2Vec features:", accuracy_word2vec)

# Build a logistic regression model using Word2Vec features
logreg_word2vec_model = LogisticRegression()
logreg_word2vec_model.fit(X_train_word2vec, train_df['label'])

# Evaluate the model on the testing set using Word2Vec features
y_pred_logreg_word2vec = logreg_word2vec_model.predict(X_test_word2vec)
accuracy_logreg_word2vec = accuracy_score(test_df['label'], y_pred_logreg_word2vec)
print("Accuracy using logistic regression and Word2Vec features:", accuracy_logreg_word2vec)
#-------------------------------------------------------------------------------------------------

Accuracy using logistic regression and Word2Vec features: 0.23217247097844113


In [8]:
# Feature extraction using bag-of-words model
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(train_df['status_text'])
X_test_counts = count_vectorizer.transform(test_df['status_text'])

# Feature extraction using TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['status_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['status_text'])

# Build a Multinomial Naive Bayes model using bag-of-words features
mnb_counts_model = MultinomialNB()
mnb_counts_model.fit(X_train_counts, train_df['label'])

# Evaluate the model on the testing set using bag-of-words features
y_pred_counts = mnb_counts_model.predict(X_test_counts)

accuracy_counts = accuracy_score(test_df['label'], y_pred_counts)
print("Accuracy using bag-of-words features:", accuracy_counts)

# Build a Multinomial Naive Bayes model using TF-IDF features
mnb_tfidf_model = MultinomialNB()
mnb_tfidf_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set using TF-IDF features
y_pred_tfidf = mnb_tfidf_model.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(test_df['label'], y_pred_tfidf)
print("Accuracy using TF-IDF features:", accuracy_tfidf)

# Build a logistic regression model using TF-IDF features
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set using TF-IDF features
y_pred_logreg = logreg_model.predict(X_test_tfidf)
accuracy_logreg = accuracy_score(test_df['label'], y_pred_logreg)
print("Accuracy using logistic regression and TF-IDF features:", accuracy_logreg)

Accuracy using bag-of-words features: 0.3449419568822554
Accuracy using TF-IDF features: 0.32172470978441126
Accuracy using logistic regression and TF-IDF features: 0.31840796019900497


In [9]:
# Preprocess the text input
input_text = "I have a solution for this problem"
input_text = input_text.lower()
input_text = ' '.join([word for word in input_text.split() if word not in stop_words])
input_text = word_tokenize(input_text)
input_text = [stemmer.stem(word) for word in input_text]
input_text = ' '.join(input_text)

# Extract features from the preprocessed text input
X_input = tfidf_vectorizer.transform([input_text])

# Predict the label of the input text using the logistic regression model
y_pred_input = logreg_model.predict(X_input)[0]

# Print the predicted label
print("Predicted label for input text:", y_pred_input)

# Convert the preprocessed input to Word2Vec vector representation
input_vector = sentence_to_vec(input_text)

# Reshape the input vector to match the shape expected by the logistic regression model
input_vector = input_vector.reshape(1, -1)

# Make predictions on the input using the logistic regression model
predictions = logreg_word2vec_model.predict(input_vector)

logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, train_df['label'])

rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train_tfidf, train_df['label'])

# Ensemble the models
ensemble_model = VotingClassifier(estimators=[('mnb_counts', mnb_counts_model), 
                                               ('mnb_tfidf', mnb_tfidf_model), 
                                               ('logreg_tfidf', logreg_model),
                                               ('logreg_w2v', logreg_word2vec_model),  
                                               ('rf', rf_model)],                                                 
                                   voting='hard')

ensemble_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set
y_pred_ensemble = ensemble_model.predict(X_test_tfidf)
accuracy_ensemble = accuracy_score(test_df['label'], y_pred_ensemble)
print("Accuracy using ensemble of models:", accuracy_ensemble)

# Preprocess the text input
inp_txt = "I have a solution for this problem"
inp_txt = inp_txt.lower()
inp_txt = ' '.join([word for word in inp_txt.split() if word not in stop_words])
inp_txt = word_tokenize(inp_txt)
inp_txt = [stemmer.stem(word) for word in inp_txt]
inp_txt = ' '.join(inp_txt)

# Extract features from the preprocessed text input
X_inp = tfidf_vectorizer.transform([inp_txt])

# Predict the label of the input text using the logistic regression model
y_pred_inp = ensemble_model.predict(X_inp)[0]

# Print the predicted label
print("Predicted label for input text:", y_pred_inp)

Predicted label for input text: Neuroticism
Accuracy using ensemble of models: 0.32172470978441126
Predicted label for input text: Neuroticism


## Attempting GloVe

link: [https://nlp.stanford.edu/projects/glove/](https://nlp.stanford.edu/projects/glove/)

In [None]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# from gensim.models import KeyedVectors

# # Convert the pre-trained GloVe embeddings to Word2Vec format
# glove_file = 'path_to_glove_file'
# word2vec_output_file = 'path_to_word2vec_output_file'
# glove2word2vec(glove_file, word2vec_output_file)

# # Load the pre-trained GloVe Word2Vec model
# glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file)

# # Function to convert a sentence to its vector representation using GloVe
# def sentence_to_vec(sentence):
#     word_vectors = [glove_model[word] for word in sentence if word in glove_model]
#     if not word_vectors:
#         # Return zero vector if no words are present in the GloVe model
#         return np.zeros(glove_model.vector_size)
#     sentence_vector = np.mean(word_vectors, axis=0)
#     return sentence_vector

# # Convert training and testing data to GloVe vectors
# X_train_glove = np.array([sentence_to_vec(sentence) for sentence in train_df['status_text']])
# X_test_glove = np.array([sentence_to_vec(sentence) for sentence in test_df['status_text']])

# # Build a Multinomial Naive Bayes model using GloVe features
# mnb_glove_model = MultinomialNB()
# mnb_glove_model.fit(X_train_glove, train_df['label'])

# # Evaluate the model on the testing set using GloVe features
# y_pred_glove = mnb_glove_model.predict(X_test_glove)
# accuracy_glove = accuracy_score(test_df['label'], y_pred_glove)
# print("Accuracy using GloVe features:", accuracy_glove)

# # Build a logistic regression model using GloVe features
# logreg_glove_model = LogisticRegression()
# logreg_glove_model.fit(X_train_glove, train_df['label'])

# # Evaluate the model on the testing set using GloVe features
# y_pred_logreg_glove = logreg_glove_model.predict(X_test_glove)
# accuracy_logreg_glove = accuracy_score(test_df['label'], y_pred_logreg_glove)
# print("Accuracy using logistic regression and GloVe features:", accuracy_logreg_glove)

## Attempting Deep Learning

In [6]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.layers import Embedding, LSTM
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Convert the sentences to sequences of word indices
# tokenizer = word2vec_model.wv.get_keras_embedding(train_embeddings=False).tokenizer
# sequences_train = tokenizer.texts_to_sequences(train_df['status_text'])
# sequences_test = tokenizer.texts_to_sequences(test_df['status_text'])

# # Pad sequences to have the same length
# max_sequence_length = 100
# X_train = pad_sequences(sequences_train, maxlen=max_sequence_length)
# X_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

# # Create the deep learning model
# model = Sequential()
# model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, weights=[word2vec_model.wv.vectors], input_length=max_sequence_length, trainable=False))
# model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(units=1, activation='sigmoid'))

# # Compile the model
# model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# # Train the model
# model.fit(X_train, train_df['label'], batch_size=64, epochs=10, validation_split=0.2)

# # Evaluate the model on the testing set
# loss, accuracy = model.evaluate(X_test, test_df['label'])
# print("Accuracy using deep learning and Word2Vec features:", accuracy)


ModuleNotFoundError: No module named 'tensorflow'