In [18]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
import joblib
import pickle

def fitting_vectorizer():
    df1 = pd.read_excel('bng2eng2/train/ConscientiousnessTrain.xlsx')
    df2 = pd.read_excel('bng2eng2/train/AgreeablenessTrain.xlsx')
    df3 = pd.read_excel('bng2eng2/train/NeuroticismTrain.xlsx')
    df4 = pd.read_excel('bng2eng2/train/ExtroversionTrain.xlsx')
    df5 = pd.read_excel('bng2eng2/train/OpennessTrain.xlsx')
    train_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
    train_df = train_df.drop("status", axis='columns')
    #print(train_df)

    # Preprocess the text input
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    train_df['status_text'] = train_df['status_text'].apply(lambda x: x.lower())
    train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    train_df['status_text'] = train_df['status_text'].apply(lambda x: word_tokenize(x))
    train_df['status_text'] = train_df['status_text'].apply(lambda x: [stemmer.stem(word) for word in x])
    train_df['status_text'] = train_df['status_text'].apply(lambda x: ' '.join(x))
    # Extract features from the preprocessed text input
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['status_text'])
    print("This is fitting_tfidf")
    print(tfidf_vectorizer)
    return tfidf_vectorizer

def preprocess_and_predict(input_text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    filename = "Ensemble_model1.joblib"

    # Preprocessing steps (customize as per your requirements)
    #preprocessed_text = input_text.lower()  # Convert to lowercase, for example
    inp_txt = input_text
    inp_txt = inp_txt.lower()
    inp_txt = ' '.join([word for word in inp_txt.split() if word not in stop_words])
    inp_txt = word_tokenize(inp_txt)
    inp_txt = [stemmer.stem(word) for word in inp_txt]
    inp_txt = ' '.join(inp_txt)

    # Feature extraction using  Bag of Words
    vectorizer_bow = CountVectorizer()

    # Load the saved fitted TF-IDF vectorizer
    try:
        vectorizer_tfidf = fitting_vectorizer()
        print("This is preprcess and predict")
        print(vectorizer_tfidf)
    except FileNotFoundError:
        print("Fitted vectorizer file not found.")
        return None
    except Exception as e:
        print(f"Error loading fitted vectorizer: {str(e)}")
        return None    
    
    # Load the .joblib model file
    try:
        my_model = joblib.load(filename)
    except FileNotFoundError:
        print("Model file not found.")
        return None
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None

    # Perform prediction using the loaded model
    try:
        # Preprocess and extract features from the input text
        features_tfidf = vectorizer_tfidf.transform([inp_txt])
        features_bow = vectorizer_bow.transform([inp_txt])

        # Combine the features if required
        #features = features_tfidf  # Customize as per your needs

        # Perform prediction
        prediction = my_model.predict(features_tfidf)
        return prediction[0]  # Assuming single prediction for a single input
    except Exception as e:
        print(f"Error during prediction: {str(e)}")
        return None
    

In [19]:
input_text = "This is an example sentence."
prediction = preprocess_and_predict(input_text)

if prediction is not None:
    print(f"The predicted class or label is: {prediction}")

This is fitting_tfidf
TfidfVectorizer()
This is preprcess and predict
TfidfVectorizer()
Error during prediction: Vocabulary not fitted or provided


In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
import joblib

def load_training_data():
    df1 = pd.read_excel('bng2eng2/train/ConscientiousnessTrain.xlsx')
    df2 = pd.read_excel('bng2eng2/train/AgreeablenessTrain.xlsx')
    df3 = pd.read_excel('bng2eng2/train/NeuroticismTrain.xlsx')
    df4 = pd.read_excel('bng2eng2/train/ExtroversionTrain.xlsx')
    df5 = pd.read_excel('bng2eng2/train/OpennessTrain.xlsx')
    train_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
    train_df = train_df.drop("status", axis='columns')
    return train_df

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = word_tokenize(text)
    text = [stemmer.stem(word) for word in text]
    text = ' '.join(text)
    return text

def extract_features(train_df):
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['status_text'])
    return tfidf_vectorizer

def load_model(filename):
    try:
        my_model = joblib.load(filename)
        return my_model
    except FileNotFoundError:
        print("Model file not found.")
        return None
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None

def predict_sentiment(input_text, tfidf_vectorizer, my_model):
    try:
        inp_txt = preprocess_text(input_text)
        features_tfidf = tfidf_vectorizer.transform([inp_txt])
        prediction = my_model.predict(features_tfidf)
        return prediction[0]
    except Exception as e:
        print(f"Error during prediction: {str(e)}")
        return None

def preprocess_and_predict(input_text):
    train_df = load_training_data()
    tfidf_vectorizer = extract_features(train_df)
    my_model = load_model("Ensemble_model1.joblib")
    if my_model is None:
        return None
    prediction = predict_sentiment(input_text, tfidf_vectorizer, my_model)
    return prediction

In [2]:
input_text = "This is an example sentence."
prediction = preprocess_and_predict(input_text)

if prediction is not None:
    print(f"The predicted class or label is: {prediction}")
elif prediction is None:
    print("No output")

Error during prediction: X has 5186 features, but MultinomialNB is expecting 3908 features as input.
No output
