In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Load the dataset: one workbook per personality trait, for each split.
train_files = (
    'bng2eng2/train/ConscientiousnessTrain.xlsx',
    'bng2eng2/train/AgreeablenessTrain.xlsx',
    'bng2eng2/train/NeuroticismTrain.xlsx',
    'bng2eng2/train/ExtroversionTrain.xlsx',
    'bng2eng2/train/OpennessTrain.xlsx',
)
test_files = (
    'bng2eng2/test/ConscientiousnessTest.xlsx',
    'bng2eng2/test/AgreeablenessTest.xlsx',
    'bng2eng2/test/NeuroticismTest.xlsx',
    'bng2eng2/test/ExtroversionTest.xlsx',
    'bng2eng2/test/OpennessTest.xlsx',
)

# Training split: stack the per-trait frames under a fresh 0..n-1 index and
# discard the "status" column, which is not used further down.
df1, df2, df3, df4, df5 = [pd.read_excel(path) for path in train_files]
train_df = pd.concat(
    [df1, df2, df3, df4, df5], ignore_index=True
).drop(columns="status")

# Test split: same treatment.
df6, df7, df8, df9, df10 = [pd.read_excel(path) for path in test_files]
test_df = pd.concat(
    [df6, df7, df8, df9, df10], ignore_index=True
).drop(columns="status")

# Data preprocessing
#
# Both splits go through ONE cleaning function so that the train and test text
# are guaranteed to be normalised the same way.
# NOTE: the NLTK stop-word list, tokenizer model and WordNet data must already
# be installed (e.g. via nltk.download) for the calls below to succeed.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _clean_status_text(raw):
    """Return ``raw`` lower-cased, stop-word filtered and lemmatized.

    Steps, in order: lower-case, drop whitespace-separated words found in
    ``stop_words``, tokenize with ``word_tokenize``, lemmatize each token with
    ``lemmatizer``, and join the tokens back with single spaces.

    Args:
        raw: One cell of the ``status_text`` column. Any non-string value is
            converted with ``str``; a missing value (NaN/None) becomes an
            empty document rather than the literal word "nan".

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if pd.isna(raw):
        return ''
    lowered = str(raw).lower()
    kept = ' '.join(word for word in lowered.split() if word not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(kept))


print(test_df['status_text'].dtype)

train_df['status_text'] = train_df['status_text'].apply(_clean_status_text)
test_df['status_text'] = test_df['status_text'].apply(_clean_status_text)

# Turn the cleaned text into two sparse feature matrices. Each vectorizer
# learns its vocabulary from the training split only and is then applied
# unchanged to the test split.
train_text = train_df['status_text']
test_text = test_df['status_text']

# Bag-of-words: raw term counts.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(train_text)
X_test_counts = count_vectorizer.transform(test_text)

# TF-IDF: term weights instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
X_test_tfidf = tfidf_vectorizer.transform(test_text)

# Three baseline classifiers, each trained on the training split and scored
# by plain accuracy on the test split.
y_train = train_df['label']
y_test = test_df['label']

# 1) Multinomial Naive Bayes on bag-of-words counts.
mnb_counts_model = MultinomialNB().fit(X_train_counts, y_train)
y_pred_counts = mnb_counts_model.predict(X_test_counts)
accuracy_counts = accuracy_score(y_test, y_pred_counts)
print("Accuracy using bag-of-words features:", accuracy_counts)

# 2) Multinomial Naive Bayes on TF-IDF weights.
mnb_tfidf_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = mnb_tfidf_model.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print("Accuracy using TF-IDF features:", accuracy_tfidf)

# 3) Logistic regression on TF-IDF weights.
logreg_model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred_logreg = logreg_model.predict(X_test_tfidf)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Accuracy using logistic regression and TF-IDF features:", accuracy_logreg)


# Classify one ad-hoc sentence with the logistic regression model.
#
# The sentence must be normalised exactly like the training corpus, otherwise
# its tokens will not line up with the vocabulary held by `tfidf_vectorizer`.
input_text = "I have a solution for this problem"
input_text = input_text.lower()
input_text = ' '.join([word for word in input_text.split() if word not in stop_words])
input_text = word_tokenize(input_text)
# Lemmatize rather than stem: the corpus is lemmatized with `lemmatizer`, and
# no `stemmer` object is ever created in this script (the old call raised
# NameError).
input_text = [lemmatizer.lemmatize(word) for word in input_text]
input_text = ' '.join(input_text)

# Extract features from the preprocessed text input
X_input = tfidf_vectorizer.transform([input_text])

# Predict the label of the input text using the logistic regression model
y_pred_input = logreg_model.predict(X_input)[0]

# Print the predicted label
print("Predicted label for input text:", y_pred_input)

# Hard-voting ensemble over three distinct classifiers on TF-IDF features.
#
# VotingClassifier.fit() clones every estimator it receives and refits the
# clones on the matrix passed to fit(). Consequences for this script:
#   * fitting the members beforehand is wasted work, so fresh, unfitted
#     estimators are handed over instead;
#   * a "bag-of-words" Naive Bayes member would silently be retrained on
#     TF-IDF and become an exact duplicate of the TF-IDF Naive Bayes member,
#     giving that one model two votes. Only one Naive Bayes member is kept.
# `logreg_model` (fitted earlier on the same data) is left untouched.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # seeded so runs are repeatable

# Ensemble the models
ensemble_model = VotingClassifier(
    estimators=[
        ('mnb_tfidf', MultinomialNB()),
        ('logreg', LogisticRegression()),
        ('rf', rf_model),
    ],
    voting='hard',
)
ensemble_model.fit(X_train_tfidf, train_df['label'])

# Keep `rf_model` usable as a fitted forest without training it a second time:
# rebind the name to the clone that the ensemble actually fitted.
rf_model = ensemble_model.named_estimators_['rf']

# Evaluate the model on the testing set
y_pred_ensemble = ensemble_model.predict(X_test_tfidf)
accuracy_ensemble = accuracy_score(test_df['label'], y_pred_ensemble)
print("Accuracy using ensemble of models:", accuracy_ensemble)

# Classify one ad-hoc sentence with the voting ensemble.
#
# The sentence must be normalised exactly like the training corpus, otherwise
# its tokens will not line up with the vocabulary held by `tfidf_vectorizer`.
inp_txt = "I have a solution for this problem"
inp_txt = inp_txt.lower()
inp_txt = ' '.join([word for word in inp_txt.split() if word not in stop_words])
inp_txt = word_tokenize(inp_txt)
# Lemmatize rather than stem: the corpus is lemmatized with `lemmatizer`, and
# no `stemmer` object is ever created in this script (the old call raised
# NameError).
inp_txt = [lemmatizer.lemmatize(word) for word in inp_txt]
inp_txt = ' '.join(inp_txt)

# Extract features from the preprocessed text input
X_inp = tfidf_vectorizer.transform([inp_txt])

# Predict the label of the input text using the ensemble model
y_pred_inp = ensemble_model.predict(X_inp)[0]

# Print the predicted label
print("Predicted label for input text:", y_pred_inp)

In [1]:
#### Precision, recall, accuracy, F1 score : Comparative Study + Tabular form

Unnamed: 0,status_id,status_text,label
0,792,I will not eat sugar from today. I will use th...,Conscientiousness
1,921,"For now, let policymakers reserve the bandwidt...",Conscientiousness
2,922,"We can't do anything, our parents don't allow ...",Conscientiousness
3,978,Today I feel that becoming a doctor is worthwh...,Conscientiousness
4,595,"There will be no gain by forcing, the governme...",Conscientiousness
...,...,...,...
2392,2464,We were always positive. Now what is the back ...,Openness
2393,2455,"Brother if I say I am Gopalganj, and the polic...",Openness
2394,2629,I have been watching the man since 12 years,Openness
2395,2597,"It is very easy to defeat a man, but it is ver...",Openness


I have Python code that trains a few models and uses an ensemble to classify textual social media posts into one of the Big Five personality traits. However, the accuracy of the ensemble is stuck at 35%, and the data available to me is limited. Could you please inspect the code and tell me which of the above hyperparameter tunings I should apply? Please help me by modifying my Python code accordingly.

The code:
