In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the dataset
df1 = pd.read_excel('bng2eng2/train/ConscientiousnessTrain.xlsx')
df2 = pd.read_excel('bng2eng2/train/AgreeablenessTrain.xlsx')
df3 = pd.read_excel('bng2eng2/train/NeuroticismTrain.xlsx')
df4 = pd.read_excel('bng2eng2/train/ExtroversionTrain.xlsx')
df5 = pd.read_excel('bng2eng2/train/OpennessTrain.xlsx')
train_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
train_df = train_df.drop("status", axis='columns')
train_df

Unnamed: 0,status_id,status_text,label
0,792,I will not eat sugar from today. I will use th...,Conscientiousness
1,921,"For now, let policymakers reserve the bandwidt...",Conscientiousness
2,922,"We can't do anything, our parents don't allow ...",Conscientiousness
3,978,Today I feel that becoming a doctor is worthwh...,Conscientiousness
4,595,"There will be no gain by forcing, the governme...",Conscientiousness
...,...,...,...
2392,2464,We were always positive. Now what is the back ...,Openness
2393,2455,"Brother if I say I am Gopalganj, and the polic...",Openness
2394,2629,I have been watching the man since 12 years,Openness
2395,2597,"It is very easy to defeat a man, but it is ver...",Openness


In [2]:
df6 = pd.read_excel('bng2eng2/test/ConscientiousnessTest.xlsx')
df7 = pd.read_excel('bng2eng2/test/AgreeablenessTest.xlsx')
df8 = pd.read_excel('bng2eng2/test/NeuroticismTest.xlsx')
df9 = pd.read_excel('bng2eng2/test/ExtroversionTest.xlsx')
df10 = pd.read_excel('bng2eng2/test/OpennessTest.xlsx')
test_df = pd.concat([df6, df7, df8, df9, df10], ignore_index=True)
test_df = test_df.drop("status", axis='columns')
test_df

Unnamed: 0,status_id,status_text,label
0,985,"Although speaking of words, the country is in ...",Conscientiousness
1,795,"Stop eating fried food , you already have gast...",Conscientiousness
2,754,Drivers and cars are responsible for this and ...,Conscientiousness
3,558,"In the name of development, one can wonder whe...",Conscientiousness
4,976,"As people die in the country, there are many c...",Conscientiousness
...,...,...,...
598,2753,The way the Prothom-alo is revealing the truth...,Openness
599,2502,They are not getting the chance to be rapist w...,Openness
600,2510,Need more vow. I need this team to shine/polis...,Openness
601,2319,"It was not right to do that, it should have be...",Openness


In [3]:
# Data preprocessing
# Convert text to lowercase
train_df['status_text'] = train_df['status_text'].apply(lambda x: x.lower())

In [4]:
print(test_df['status_text'].dtype)

object


In [5]:
test_df['status_text'] = test_df['status_text'].apply(lambda y: str(y).lower())
test_df

Unnamed: 0,status_id,status_text,label
0,985,"although speaking of words, the country is in ...",Conscientiousness
1,795,"stop eating fried food , you already have gast...",Conscientiousness
2,754,drivers and cars are responsible for this and ...,Conscientiousness
3,558,"in the name of development, one can wonder whe...",Conscientiousness
4,976,"as people die in the country, there are many c...",Conscientiousness
...,...,...,...
598,2753,the way the prothom-alo is revealing the truth...,Openness
599,2502,they are not getting the chance to be rapist w...,Openness
600,2510,need more vow. i need this team to shine/polis...,Openness
601,2319,"it was not right to do that, it should have be...",Openness


In [6]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
test_df['status_text'] = test_df['status_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
train_df

Unnamed: 0,status_id,status_text,label
0,792,"eat sugar today. use stairs get 5th floor, god...",Conscientiousness
1,921,"now, let policymakers reserve bandwidth. use l...",Conscientiousness
2,922,"can't anything, parents allow even though many...",Conscientiousness
3,978,today feel becoming doctor worthwhile. otherwi...,Conscientiousness
4,595,"gain forcing, government machinery must work p...",Conscientiousness
...,...,...,...
2392,2464,always positive. back wall? defendant this?,Openness
2393,2455,"brother say gopalganj, police says happen gopa...",Openness
2394,2629,watching man since 12 years,Openness
2395,2597,"easy defeat man, difficult win ...",Openness


In [7]:
test_df

Unnamed: 0,status_id,status_text,label
0,985,"although speaking words, country list develope...",Conscientiousness
1,795,"stop eating fried food , already gastric probl...",Conscientiousness
2,754,drivers cars responsible since value lives com...,Conscientiousness
3,558,"name development, one wonder whether natural e...",Conscientiousness
4,976,"people die country, many children country drea...",Conscientiousness
...,...,...,...
598,2753,way prothom-alo revealing truth. god knows abl...,Openness
599,2502,getting chance rapist without getting chance,Openness
600,2510,need vow. need team shine/polish continuus vic...,Openness
601,2319,"right that, said india lost final, large award...",Openness


In [8]:
# Tokenization
train_df['status_text'] = train_df['status_text'].apply(lambda x: word_tokenize(x))
test_df['status_text'] = test_df['status_text'].apply(lambda x: word_tokenize(x))
train_df

Unnamed: 0,status_id,status_text,label
0,792,"[eat, sugar, today, ., use, stairs, get, 5th, ...",Conscientiousness
1,921,"[now, ,, let, policymakers, reserve, bandwidth...",Conscientiousness
2,922,"[ca, n't, anything, ,, parents, allow, even, t...",Conscientiousness
3,978,"[today, feel, becoming, doctor, worthwhile, .,...",Conscientiousness
4,595,"[gain, forcing, ,, government, machinery, must...",Conscientiousness
...,...,...,...
2392,2464,"[always, positive, ., back, wall, ?, defendant...",Openness
2393,2455,"[brother, say, gopalganj, ,, police, says, hap...",Openness
2394,2629,"[watching, man, since, 12, years]",Openness
2395,2597,"[easy, defeat, man, ,, difficult, win, ...]",Openness


In [9]:
test_df

Unnamed: 0,status_id,status_text,label
0,985,"[although, speaking, words, ,, country, list, ...",Conscientiousness
1,795,"[stop, eating, fried, food, ,, already, gastri...",Conscientiousness
2,754,"[drivers, cars, responsible, since, value, liv...",Conscientiousness
3,558,"[name, development, ,, one, wonder, whether, n...",Conscientiousness
4,976,"[people, die, country, ,, many, children, coun...",Conscientiousness
...,...,...,...
598,2753,"[way, prothom-alo, revealing, truth, ., god, k...",Openness
599,2502,"[getting, chance, rapist, without, getting, ch...",Openness
600,2510,"[need, vow, ., need, team, shine/polish, conti...",Openness
601,2319,"[right, that, ,, said, india, lost, final, ,, ...",Openness


In [10]:
# Stemming
stemmer = PorterStemmer()
train_df['status_text'] = train_df['status_text'].apply(lambda x: [stemmer.stem(word) for word in x])
test_df['status_text'] = test_df['status_text'].apply(lambda x: [stemmer.stem(word) for word in x])
test_df

Unnamed: 0,status_id,status_text,label
0,985,"[although, speak, word, ,, countri, list, deve...",Conscientiousness
1,795,"[stop, eat, fri, food, ,, alreadi, gastric, pr...",Conscientiousness
2,754,"[driver, car, respons, sinc, valu, live, commo...",Conscientiousness
3,558,"[name, develop, ,, one, wonder, whether, natur...",Conscientiousness
4,976,"[peopl, die, countri, ,, mani, children, count...",Conscientiousness
...,...,...,...
598,2753,"[way, prothom-alo, reveal, truth, ., god, know...",Openness
599,2502,"[get, chanc, rapist, without, get, chanc]",Openness
600,2510,"[need, vow, ., need, team, shine/polish, conti...",Openness
601,2319,"[right, that, ,, said, india, lost, final, ,, ...",Openness


In [11]:
# Convert list of tokens back to text
train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join(x))
test_df['status_text'] = test_df['status_text'].apply(lambda x: ' '.join(x))
train_df

Unnamed: 0,status_id,status_text,label
0,792,"eat sugar today . use stair get 5th floor , go...",Conscientiousness
1,921,"now , let policymak reserv bandwidth . use lat...",Conscientiousness
2,922,"ca n't anyth , parent allow even though mani w...",Conscientiousness
3,978,"today feel becom doctor worthwhil . otherwis ,...",Conscientiousness
4,595,"gain forc , govern machineri must work properl...",Conscientiousness
...,...,...,...
2392,2464,alway posit . back wall ? defend thi ?,Openness
2393,2455,"brother say gopalganj , polic say happen gopal...",Openness
2394,2629,watch man sinc 12 year,Openness
2395,2597,"easi defeat man , difficult win ...",Openness


In [12]:
# Feature extraction using bag-of-words model
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(train_df['status_text'])
X_test_counts = count_vectorizer.transform(test_df['status_text'])

In [13]:
# Feature extraction using TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['status_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['status_text'])

## Multinomial Naive Bayes

#### Multinomial Naive Bayes + Bag of Words

In [14]:
# Build a Multinomial Naive Bayes model using bag-of-words features
mnb_counts_model = MultinomialNB()
mnb_counts_model.fit(X_train_counts, train_df['label'])

# Evaluate the model on the testing set using bag-of-words features
y_pred_counts = mnb_counts_model.predict(X_test_counts)

In [15]:
# Calculate accuracy
accuracy_mnb_counts = accuracy_score(test_df['label'], y_pred_counts)

# Calculate F1 score
f1_mnb_counts = f1_score(test_df['label'], y_pred_counts, average='macro')

# Calculate precision
precision_mnb_counts = precision_score(test_df['label'], y_pred_counts, average='macro')

# Calculate recall
recall_mnb_counts = recall_score(test_df['label'], y_pred_counts, average='macro')

print("Accuracy using Multinomial Naive Bayes using bag-of-words features: {:.2f}%".format(accuracy_mnb_counts*100))
print("F1 Score using Multinomial Naive Bayes using bag-of-words features: {:.2f}%".format(f1_mnb_counts*100))
print("Precision using Multinomial Naive Bayes using bag-of-words features: {:.2f}%".format(precision_mnb_counts*100))
print("Recall using Multinomial Naive Bayes using bag-of-words features: {:.2f}%".format(recall_mnb_counts*100))

Accuracy using Multinomial Naive Bayes using bag-of-words features: 34.49%
F1 Score using Multinomial Naive Bayes using bag-of-words features: 34.39%
Precision using Multinomial Naive Bayes using bag-of-words features: 34.65%
Recall using Multinomial Naive Bayes using bag-of-words features: 34.29%


#### Multinomial Naive Bayes + TF-IDF

In [16]:
# Build a Multinomial Naive Bayes model using TF-IDF features
mnb_tfidf_model = MultinomialNB()
mnb_tfidf_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set using TF-IDF features
y_pred_tfidf = mnb_tfidf_model.predict(X_test_tfidf)

In [17]:
# Calculate accuracy
accuracy_mnb_tfidf = accuracy_score(test_df['label'], y_pred_tfidf)

# Calculate F1 score
f1_mnb_tfidf = f1_score(test_df['label'], y_pred_tfidf, average='macro')

# Calculate precision
precision_mnb_tfidf = precision_score(test_df['label'], y_pred_tfidf, average='macro')

# Calculate recall
recall_mnb_tfidf = recall_score(test_df['label'], y_pred_tfidf, average='macro')

print("Accuracy using Multinomial Naive Bayes and TF-IDF features: {:.2f}%".format(accuracy_mnb_tfidf*100))
print("F1 Score using Multinomial Naive Bayes and TF-IDF features: {:.2f}%".format(f1_mnb_tfidf*100))
print("Precision using Multinomial Naive Bayes and TF-IDF features: {:.2f}%".format(precision_mnb_tfidf*100))
print("Recall using Multinomial Naive Bayes and TF-IDF features: {:.2f}%".format(recall_mnb_tfidf*100))

Accuracy using Multinomial Naive Bayes and TF-IDF features: 32.17%
F1 Score using Multinomial Naive Bayes and TF-IDF features: 30.61%
Precision using Multinomial Naive Bayes and TF-IDF features: 33.22%
Recall using Multinomial Naive Bayes and TF-IDF features: 31.01%


## Logistic Regression

#### Logistic Regression + TF-IDF

In [18]:
# Build a logistic regression model using TF-IDF features
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set using TF-IDF features
y_pred_logreg = logreg_model.predict(X_test_tfidf)

In [19]:
# Calculate accuracy
accuracy_lr_tfidf = accuracy_score(test_df['label'], y_pred_logreg)

# Calculate F1 score
f1_lr_tfidf = f1_score(test_df['label'], y_pred_logreg, average='macro')

# Calculate precision
precision_lr_tfidf = precision_score(test_df['label'], y_pred_logreg, average='macro')

# Calculate recall
recall_lr_tfidf = recall_score(test_df['label'], y_pred_logreg, average='macro')

print("Accuracy using logistic regression and TF-IDF features: {:.2f}%".format(accuracy_lr_tfidf*100))
print("F1 Score using logistic regression and TF-IDF features: {:.2f}%".format(f1_lr_tfidf*100))
print("Precision using logistic regression and TF-IDF features: {:.2f}%".format(precision_lr_tfidf*100))
print("Recall using logistic regression and TF-IDF features: {:.2f}%".format(recall_lr_tfidf*100))

Accuracy using logistic regression and TF-IDF features: 31.84%
F1 Score using logistic regression and TF-IDF features: 31.24%
Precision using logistic regression and TF-IDF features: 32.14%
Recall using logistic regression and TF-IDF features: 31.19%


#### Logistic Regression + bag-of-words

In [20]:
# Build a logistic regression model using bag-of-words features
logreg_counts_model = LogisticRegression()
logreg_counts_model.fit(X_train_counts, train_df['label'])

# Evaluate the model on the testing set using bag-of-words features
y_pred_lr_counts = logreg_counts_model.predict(X_test_counts)

# Calculate accuracy
accuracy_lr_counts = accuracy_score(test_df['label'], y_pred_lr_counts)
# Calculate F1 score
f1_lr_counts = f1_score(test_df['label'], y_pred_lr_counts, average='macro')

# Calculate precision
precision_lr_counts = precision_score(test_df['label'], y_pred_lr_counts, average='macro')

# Calculate recall
recall_lr_counts = recall_score(test_df['label'], y_pred_lr_counts, average='macro')

print("Accuracy for logistic regression + bag-of-words features: {:.2f}%".format(accuracy_lr_counts*100))
print("F1 Score for logistic regression + bag-of-words features: {:.2f}%".format(f1_lr_counts*100))
print("Precision for logistic regression + bag-of-words features: {:.2f}%".format(precision_lr_counts*100))
print("Recall for logistic regression + bag-of-words features: {:.2f}%".format(recall_lr_counts*100))

Accuracy for logistic regression + bag-of-words features: 32.34%
F1 Score for logistic regression + bag-of-words features: 32.17%
Precision for logistic regression + bag-of-words features: 32.40%
Recall for logistic regression + bag-of-words features: 32.09%


## Random Forest

#### Random Forest + TF-IDF

In [21]:
rf_model = RandomForestClassifier(n_estimators=250)
rf_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set using TF-IDF features
y_pred_rf = rf_model.predict(X_test_tfidf)

In [22]:
# Calculate accuracy
accuracy_rf_tfidf = accuracy_score(test_df['label'], y_pred_rf)

# Calculate F1 score
f1_rf_tfidf = f1_score(test_df['label'], y_pred_rf, average='macro')

# Calculate precision
precision_rf_tfidf = precision_score(test_df['label'], y_pred_rf, average='macro')

# Calculate recall
recall_rf_tfidf = recall_score(test_df['label'], y_pred_rf, average='macro')

print("Accuracy using Random Forest and TF-IDF features: {:.2f}%".format(accuracy_rf_tfidf*100))
print("F1 Score using Random Forest and TF-IDF features: {:.2f}%".format(f1_rf_tfidf*100))
print("Precision using Random Forest and TF-IDF features: {:.2f}%".format(precision_rf_tfidf*100))
print("Recall using Random Forest and TF-IDF features: {:.2f}%".format(recall_rf_tfidf*100))

Accuracy using Random Forest and TF-IDF features: 34.16%
F1 Score using Random Forest and TF-IDF features: 33.01%
Precision using Random Forest and TF-IDF features: 34.71%
Recall using Random Forest and TF-IDF features: 33.22%


#### Random Forest + bag-of-words

In [23]:
rf_model_counts = RandomForestClassifier(n_estimators=300)
rf_model_counts.fit(X_train_counts, train_df['label'])

# Evaluate the model on the testing set using TF-IDF features
y_pred_rf_counts = rf_model_counts.predict(X_test_counts)

# Calculate accuracy, F1 score, precision, recall
accuracy_rf_counts = accuracy_score(test_df['label'], y_pred_rf_counts)
f1_rf_counts = f1_score(test_df['label'], y_pred_rf_counts, average='macro')
precision_rf_counts = precision_score(test_df['label'], y_pred_rf_counts, average='macro')
recall_rf_counts = recall_score(test_df['label'], y_pred_rf_counts, average='macro')

print("Accuracy using Random Forest and TF-IDF features: {:.2f}%".format(accuracy_rf_counts*100))
print("F1 Score using Random Forest and TF-IDF features: {:.2f}%".format(f1_rf_counts*100))
print("Precision using Random Forest and TF-IDF features: {:.2f}%".format(precision_rf_counts*100))
print("Recall using Random Forest and TF-IDF features: {:.2f}%".format(recall_rf_counts*100))

Accuracy using Random Forest and TF-IDF features: 31.84%
F1 Score using Random Forest and TF-IDF features: 30.98%
Precision using Random Forest and TF-IDF features: 32.00%
Recall using Random Forest and TF-IDF features: 31.25%


In [24]:
# Preprocess the text input
input_text = "I have a solution for this problem"
input_text = input_text.lower()
input_text = ' '.join([word for word in input_text.split() if word not in stop_words])
input_text = word_tokenize(input_text)
input_text = [stemmer.stem(word) for word in input_text]
input_text = ' '.join(input_text)

# Extract features from the preprocessed text input
X_input = tfidf_vectorizer.transform([input_text])

# Predict the label of the input text using the logistic regression model
y_pred_input = logreg_model.predict(X_input)[0]

# Print the predicted label
print("Predicted label for input text:", y_pred_input)

Predicted label for input text: Neuroticism


In [25]:
# Ensemble the models
ensemble_model = VotingClassifier(estimators=[('mnb_counts', mnb_counts_model), 
                                               ('mnb_tfidf', mnb_tfidf_model), 
                                               ('logreg', logreg_model), 
                                               ('rf', rf_model)], 
                                   voting='hard')

In [26]:
ensemble_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set
y_pred_ensemble = ensemble_model.predict(X_test_tfidf)

# Calculate accuracy
accuracy_ensemble_tfidf = accuracy_score(test_df['label'], y_pred_ensemble)
print("Accuracy using ensemble of models:", accuracy_ensemble_tfidf)

# Calculate F1 score
f1_e_tfidf = f1_score(test_df['label'], y_pred_logreg, average='macro')

# Calculate precision
precision_e_tfidf = precision_score(test_df['label'], y_pred_logreg, average='macro')

# Calculate recall
recall_e_tfidf = recall_score(test_df['label'], y_pred_logreg, average='macro')

print("Accuracy using Ensemble and TF-IDF features: {:.2f}%".format(accuracy_ensemble_tfidf*100))
print("F1 Score using Ensemble and TF-IDF features: {:.2f}%".format(f1_e_tfidf*100))
print("Precision using Ensemble and TF-IDF features: {:.2f}%".format(precision_e_tfidf*100))
print("Recall using Ensemble and TF-IDF features: {:.2f}%".format(recall_e_tfidf*100))

Accuracy using ensemble of models: 0.3333333333333333
Accuracy using Ensemble and TF-IDF features: 33.33%
F1 Score using Ensemble and TF-IDF features: 31.24%
Precision using Ensemble and TF-IDF features: 32.14%
Recall using Ensemble and TF-IDF features: 31.19%


In [27]:
# Preprocess the text input
inp_txt = "I have a solution for this problem"
inp_txt = inp_txt.lower()
inp_txt = ' '.join([word for word in inp_txt.split() if word not in stop_words])
inp_txt = word_tokenize(inp_txt)
inp_txt = [stemmer.stem(word) for word in inp_txt]
inp_txt = ' '.join(inp_txt)

# Extract features from the preprocessed text input
X_inp = tfidf_vectorizer.transform([inp_txt])

# Predict the label of the input text using the logistic regression model
y_pred_inp = ensemble_model.predict(X_inp)[0]

# Print the predicted label
print("Predicted label for input text using Ensemble:", y_pred_inp)

Predicted label for input text using Ensemble: Neuroticism


#### Precision, recall, accuracy, F1 score : Comparative Study + Tabular form

In [28]:

# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# # Evaluate the performance of each model
# y_pred1 = clf1.predict(X_test)
# acc1 = accuracy_score(y_test, y_pred1)
# f1_1 = f1_score(y_test, y_pred1, average='weighted')
# prec1 = precision_score(y_test, y_pred1, average='weighted')
# rec1 = recall_score(y_test, y_pred1, average='weighted')

# y_pred2 = clf2.predict(X_test)
# acc2 = accuracy_score(y_test, y_pred2)
# f1_2 = f1_score(y_test, y_pred2, average='weighted')
# prec2 = precision_score(y_test, y_pred2, average='weighted')
# rec2 = recall_score(y_test, y_pred2, average='weighted')

# y_pred3 = clf3.predict(X_test)
# acc3 = accuracy_score(y_test, y_pred3)
# f1_3 = f1_score(y_test, y_pred3, average='weighted')
# prec3 = precision_score(y_test, y_pred3, average='weighted')
# rec3 = recall_score(y_test, y_pred3, average='weighted')

# # Evaluate the performance of the ensemble
# y_pred_ensemble = ensemble.predict(X_test)
# acc_ensemble = accuracy_score(y_test, y_pred_ensemble)
# f1_ensemble = f1_score(y_test, y_pred_ensemble, average='weighted')
# prec_ensemble = precision_score(y_test, y_pred_ensemble, average='weighted')
# rec_ensemble = recall_score(y_test, y_pred_ensemble, average='weighted')

# # Print the results
# print("Model 1 Accuracy:", acc1)
# print("Model 1 F1 Score:", f1_1)
# print("Model 1 Precision:", prec1)
# print("Model 1 Recall:", rec1)

# print("Model 2 Accuracy:", acc2)
# print("Model 2 F1 Score:", f1_2)
# print("Model 2 Precision:", prec2)
# print("Model 2 Recall:", rec2)

# print("Model 3 Accuracy:", acc3)
# print("Model 3 F1 Score:", f1_3)
# print("Model 3 Precision:", prec3)
# print("Model 3 Recall:", rec3)

# print("Ensemble Accuracy:", acc_ensemble)
# print("Ensemble F1 Score:", f1_ensemble)
# print("Ensemble Precision:", prec_ensemble)
# print("Ensemble Recall:", rec_ensemble)
