In [1]:
# nltk.download('omw-1.4')
# nltk.download('wordnet')

In [2]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df1 = pd.read_excel('bng2eng2/train/ConscientiousnessTrain.xlsx')
df2 = pd.read_excel('bng2eng2/train/AgreeablenessTrain.xlsx')
df3 = pd.read_excel('bng2eng2/train/NeuroticismTrain.xlsx')
df4 = pd.read_excel('bng2eng2/train/ExtroversionTrain.xlsx')
df5 = pd.read_excel('bng2eng2/train/OpennessTrain.xlsx')
train_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
train_df = train_df.drop("status", axis='columns')
train_df

Unnamed: 0,status_id,status_text,label
0,792,I will not eat sugar from today. I will use th...,Conscientiousness
1,921,"For now, let policymakers reserve the bandwidt...",Conscientiousness
2,922,"We can't do anything, our parents don't allow ...",Conscientiousness
3,978,Today I feel that becoming a doctor is worthwh...,Conscientiousness
4,595,"There will be no gain by forcing, the governme...",Conscientiousness
...,...,...,...
2392,2464,We were always positive. Now what is the back ...,Openness
2393,2455,"Brother if I say I am Gopalganj, and the polic...",Openness
2394,2629,I have been watching the man since 12 years,Openness
2395,2597,"It is very easy to defeat a man, but it is ver...",Openness


In [3]:
df6 = pd.read_excel('bng2eng2/test/ConscientiousnessTest.xlsx')
df7 = pd.read_excel('bng2eng2/test/AgreeablenessTest.xlsx')
df8 = pd.read_excel('bng2eng2/test/NeuroticismTest.xlsx')
df9 = pd.read_excel('bng2eng2/test/ExtroversionTest.xlsx')
df10 = pd.read_excel('bng2eng2/test/OpennessTest.xlsx')
test_df = pd.concat([df6, df7, df8, df9, df10], ignore_index=True)
test_df = test_df.drop("status", axis='columns')
test_df

Unnamed: 0,status_id,status_text,label
0,985,"Although speaking of words, the country is in ...",Conscientiousness
1,795,"Stop eating fried food , you already have gast...",Conscientiousness
2,754,Drivers and cars are responsible for this and ...,Conscientiousness
3,558,"In the name of development, one can wonder whe...",Conscientiousness
4,976,"As people die in the country, there are many c...",Conscientiousness
...,...,...,...
598,2753,The way the Prothom-alo is revealing the truth...,Openness
599,2502,They are not getting the chance to be rapist w...,Openness
600,2510,Need more vow. I need this team to shine/polis...,Openness
601,2319,"It was not right to do that, it should have be...",Openness


In [4]:
# Data preprocessing
# Convert text to lowercase
train_df['status_text'] = train_df['status_text'].apply(lambda x: x.lower())

In [5]:
print(test_df['status_text'].dtype)

object


In [6]:
test_df['status_text'] = test_df['status_text'].apply(lambda y: str(y).lower())
test_df

Unnamed: 0,status_id,status_text,label
0,985,"although speaking of words, the country is in ...",Conscientiousness
1,795,"stop eating fried food , you already have gast...",Conscientiousness
2,754,drivers and cars are responsible for this and ...,Conscientiousness
3,558,"in the name of development, one can wonder whe...",Conscientiousness
4,976,"as people die in the country, there are many c...",Conscientiousness
...,...,...,...
598,2753,the way the prothom-alo is revealing the truth...,Openness
599,2502,they are not getting the chance to be rapist w...,Openness
600,2510,need more vow. i need this team to shine/polis...,Openness
601,2319,"it was not right to do that, it should have be...",Openness


In [7]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
test_df['status_text'] = test_df['status_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
train_df

Unnamed: 0,status_id,status_text,label
0,792,"eat sugar today. use stairs get 5th floor, god...",Conscientiousness
1,921,"now, let policymakers reserve bandwidth. use l...",Conscientiousness
2,922,"can't anything, parents allow even though many...",Conscientiousness
3,978,today feel becoming doctor worthwhile. otherwi...,Conscientiousness
4,595,"gain forcing, government machinery must work p...",Conscientiousness
...,...,...,...
2392,2464,always positive. back wall? defendant this?,Openness
2393,2455,"brother say gopalganj, police says happen gopa...",Openness
2394,2629,watching man since 12 years,Openness
2395,2597,"easy defeat man, difficult win ...",Openness


In [8]:
test_df

Unnamed: 0,status_id,status_text,label
0,985,"although speaking words, country list develope...",Conscientiousness
1,795,"stop eating fried food , already gastric probl...",Conscientiousness
2,754,drivers cars responsible since value lives com...,Conscientiousness
3,558,"name development, one wonder whether natural e...",Conscientiousness
4,976,"people die country, many children country drea...",Conscientiousness
...,...,...,...
598,2753,way prothom-alo revealing truth. god knows abl...,Openness
599,2502,getting chance rapist without getting chance,Openness
600,2510,need vow. need team shine/polish continuus vic...,Openness
601,2319,"right that, said india lost final, large award...",Openness


In [9]:
# Tokenization
train_df['status_text'] = train_df['status_text'].apply(lambda x: word_tokenize(x))
test_df['status_text'] = test_df['status_text'].apply(lambda x: word_tokenize(x))
train_df

Unnamed: 0,status_id,status_text,label
0,792,"[eat, sugar, today, ., use, stairs, get, 5th, ...",Conscientiousness
1,921,"[now, ,, let, policymakers, reserve, bandwidth...",Conscientiousness
2,922,"[ca, n't, anything, ,, parents, allow, even, t...",Conscientiousness
3,978,"[today, feel, becoming, doctor, worthwhile, .,...",Conscientiousness
4,595,"[gain, forcing, ,, government, machinery, must...",Conscientiousness
...,...,...,...
2392,2464,"[always, positive, ., back, wall, ?, defendant...",Openness
2393,2455,"[brother, say, gopalganj, ,, police, says, hap...",Openness
2394,2629,"[watching, man, since, 12, years]",Openness
2395,2597,"[easy, defeat, man, ,, difficult, win, ...]",Openness


In [10]:
test_df

Unnamed: 0,status_id,status_text,label
0,985,"[although, speaking, words, ,, country, list, ...",Conscientiousness
1,795,"[stop, eating, fried, food, ,, already, gastri...",Conscientiousness
2,754,"[drivers, cars, responsible, since, value, liv...",Conscientiousness
3,558,"[name, development, ,, one, wonder, whether, n...",Conscientiousness
4,976,"[people, die, country, ,, many, children, coun...",Conscientiousness
...,...,...,...
598,2753,"[way, prothom-alo, revealing, truth, ., god, k...",Openness
599,2502,"[getting, chance, rapist, without, getting, ch...",Openness
600,2510,"[need, vow, ., need, team, shine/polish, conti...",Openness
601,2319,"[right, that, ,, said, india, lost, final, ,, ...",Openness


In [11]:
# Stemming
stemmer = PorterStemmer()
train_df['status_text'] = train_df['status_text'].apply(lambda x: [stemmer.stem(word) for word in x])
test_df['status_text'] = test_df['status_text'].apply(lambda x: [stemmer.stem(word) for word in x])

In [12]:
# # Lemmatization
# lemmatizer = WordNetLemmatizer()
# train_df['status_text'] = train_df['status_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
# test_df['status_text'] = test_df['status_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

test_df

Unnamed: 0,status_id,status_text,label
0,985,"[although, speak, word, ,, countri, list, deve...",Conscientiousness
1,795,"[stop, eat, fri, food, ,, alreadi, gastric, pr...",Conscientiousness
2,754,"[driver, car, respons, sinc, valu, live, commo...",Conscientiousness
3,558,"[name, develop, ,, one, wonder, whether, natur...",Conscientiousness
4,976,"[peopl, die, countri, ,, mani, children, count...",Conscientiousness
...,...,...,...
598,2753,"[way, prothom-alo, reveal, truth, ., god, know...",Openness
599,2502,"[get, chanc, rapist, without, get, chanc]",Openness
600,2510,"[need, vow, ., need, team, shine/polish, conti...",Openness
601,2319,"[right, that, ,, said, india, lost, final, ,, ...",Openness


In [13]:
# Convert list of tokens back to text
train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join(x))
test_df['status_text'] = test_df['status_text'].apply(lambda x: ' '.join(x))
train_df

Unnamed: 0,status_id,status_text,label
0,792,"eat sugar today . use stair get 5th floor , go...",Conscientiousness
1,921,"now , let policymak reserv bandwidth . use lat...",Conscientiousness
2,922,"ca n't anyth , parent allow even though mani w...",Conscientiousness
3,978,"today feel becom doctor worthwhil . otherwis ,...",Conscientiousness
4,595,"gain forc , govern machineri must work properl...",Conscientiousness
...,...,...,...
2392,2464,alway posit . back wall ? defend thi ?,Openness
2393,2455,"brother say gopalganj , polic say happen gopal...",Openness
2394,2629,watch man sinc 12 year,Openness
2395,2597,"easi defeat man , difficult win ...",Openness


In [14]:
# Feature extraction using bag-of-words model
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(train_df['status_text'])
X_test_counts = count_vectorizer.transform(test_df['status_text'])

In [15]:
# Feature extraction using TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['status_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['status_text'])

In [16]:
# Build a Multinomial Naive Bayes model using bag-of-words features
mnb_counts_model = MultinomialNB()
mnb_counts_model.fit(X_train_counts, train_df['label'])

MultinomialNB()

In [17]:
# Evaluate the model on the testing set using bag-of-words features
y_pred_counts = mnb_counts_model.predict(X_test_counts)

In [18]:
accuracy_counts = accuracy_score(test_df['label'], y_pred_counts)

In [19]:
print("Accuracy using bag-of-words features:", accuracy_counts)

Accuracy using bag-of-words features: 0.3449419568822554


In [20]:
# Build a Multinomial Naive Bayes model using TF-IDF features
mnb_tfidf_model = MultinomialNB()
mnb_tfidf_model.fit(X_train_tfidf, train_df['label'])

MultinomialNB()

In [21]:
# Evaluate the model on the testing set using TF-IDF features
y_pred_tfidf = mnb_tfidf_model.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(test_df['label'], y_pred_tfidf)
print("Accuracy using TF-IDF features:", accuracy_tfidf)

Accuracy using TF-IDF features: 0.32172470978441126


In [22]:
# Build a logistic regression model using TF-IDF features
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set using TF-IDF features
y_pred_logreg = logreg_model.predict(X_test_tfidf)
accuracy_logreg = accuracy_score(test_df['label'], y_pred_logreg)
print("Accuracy using logistic regression and TF-IDF features:", accuracy_logreg)


Accuracy using logistic regression and TF-IDF features: 0.31840796019900497


In [23]:
# Preprocess the text input
input_text = "I have a solution for this problem"
input_text = input_text.lower()
input_text = ' '.join([word for word in input_text.split() if word not in stop_words])
input_text = word_tokenize(input_text)
input_text = [stemmer.stem(word) for word in input_text]
# input_text = [lemmatizer.lemmatize(word) for word in input_text]
input_text = ' '.join(input_text)

# Extract features from the preprocessed text input
X_input = tfidf_vectorizer.transform([input_text])

# Predict the label of the input text using the logistic regression model
y_pred_input = logreg_model.predict(X_input)[0]

# Print the predicted label
print("Predicted label for input text:", y_pred_input)

Predicted label for input text: Neuroticism


In [24]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, train_df['label'])

LogisticRegression()

In [25]:
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train_tfidf, train_df['label'])

RandomForestClassifier()

In [26]:
# Ensemble the models
ensemble_model = VotingClassifier(estimators=[('mnb_counts', mnb_counts_model), 
                                               ('mnb_tfidf', mnb_tfidf_model), 
                                               ('logreg', logreg_model), 
                                               ('rf', rf_model)], 
                                   voting='hard')

In [27]:
ensemble_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model on the testing set
y_pred_ensemble = ensemble_model.predict(X_test_tfidf)
accuracy_ensemble = accuracy_score(test_df['label'], y_pred_ensemble)
print("Accuracy using ensemble of models:", accuracy_ensemble)

Accuracy using ensemble of models: 0.33665008291873966


In [28]:
# Preprocess the text input
inp_txt = "I have a solution for this problem"
inp_txt = inp_txt.lower()
inp_txt = ' '.join([word for word in inp_txt.split() if word not in stop_words])
inp_txt = word_tokenize(inp_txt)
inp_txt = [stemmer.stem(word) for word in inp_txt]
# inp_txt = [lemmatizer.lemmatize(word) for word in inp_txt]
inp_txt = ' '.join(inp_txt)

# Extract features from the preprocessed text input
X_inp = tfidf_vectorizer.transform([inp_txt])

# Predict the label of the input text using the logistic regression model
y_pred_inp = ensemble_model.predict(X_inp)[0]

# Print the predicted label
print("Predicted label for input text:", y_pred_inp)

Predicted label for input text: Neuroticism


#### Precision, recall, accuracy, F1 score : Comparative Study + Tabular form