In [2]:
import nltk
import pandas as pd
import spacy
import re
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTENC
from sklearn.impute import SimpleImputer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load required NLTK resources
for resource in ['vader_lexicon', 'punkt', 'stopwords']:
    try:
        nltk.data.find(f'corpora/{resource}')
    except LookupError:
        nltk.download(resource)

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Cleans and tokenizes text."""
    text = str(text).lower()
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|@\S+', '', text)  # Remove URLs and mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only alpha characters and spaces
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')  # Remove diacritics

    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(tokens)

def load_data(file_path):
    return pd.read_csv(file_path, encoding='utf-8')

def process_data():
    """Loads and processes the dataset."""
    file_path = r"/en_train.csv"
    try:
        df = load_data(file_path)
        df['text'] = df['text'].apply(preprocess_text)

        if 'binary' in df.columns:
            df['binary'] = df['binary'].map({'Hope': 1, 'Not Hope': 0}).fillna(-1).astype(int)
            df = df[df['binary'] != -1]  # Remove invalid labels

        if 'multiclass' in df.columns:
            df['multiclass'] = df['multiclass'].map({
                'Not Hope': 0, 'Generalized Hope': 1, 'Realistic Hope': 2, 'Unrealistic Hope': 3, 'Sarcasm': 4
            }).fillna(-1).astype(int)
            df = df[df['multiclass'] != -1]

        df.dropna(subset=['text', 'binary', 'multiclass'], inplace=True)
        df['text'] = df['text'].astype(str)
        return df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

def extract_sentiment_features(text):
    """Extracts sentiment polarity features."""
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    return [scores['compound'], scores['pos'], scores['neg'], scores['neu']]

def train_data(df, task='binary'):
    """Trains a model using TF-IDF and RandomForest for either binary or multi-class classification."""
    # Define the feature (X) and target (Y) columns
    X = df['text']

    # Select target based on the task
    if task == 'binary':
        Y = df['binary']
    elif task == 'multiclass':
        Y = df['multiclass']
    else:
        raise ValueError("Invalid task. Choose either 'binary' or 'multiclass'.")

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Apply TF-IDF to the text data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Extract sentiment features
    train_sentiment = pd.DataFrame(X_train.apply(extract_sentiment_features).tolist(), index=X_train.index)
    test_sentiment = pd.DataFrame(X_test.apply(extract_sentiment_features).tolist(), index=X_test.index)

    # Combine TF-IDF and sentiment features
    X_train_combined = pd.concat([pd.DataFrame(X_train_tfidf.toarray()), train_sentiment.reset_index(drop=True)], axis=1)
    X_test_combined = pd.concat([pd.DataFrame(X_test_tfidf.toarray()), test_sentiment.reset_index(drop=True)], axis=1)

    # Handle class imbalance using SMOTENC
    categorical_features = [X_train_combined.shape[1] - 4]  # Last 4 columns are sentiment features
    smote = SMOTENC(categorical_features=categorical_features, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train)

    # Fit SimpleImputer to handle missing values
    imputer = SimpleImputer(strategy="mean")
    X_train_imputed = imputer.fit_transform(X_train_resampled)
    X_test_imputed = imputer.transform(X_test_combined)

    # Train Random Forest Model
    pipeline = Pipeline([('classifier', RandomForestClassifier(random_state=42))])
    param_grid = {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20]}
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', verbose=1)
    grid_search.fit(X_train_imputed, y_train_resampled)

    print(f"Best Parameters for {task}:", grid_search.best_params_)
    print(f"Best Score for {task}:", grid_search.best_score_)

    # Evaluate Model
    y_pred = grid_search.best_estimator_.predict(X_test_imputed)
    print(f"Accuracy ({task}): {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report ({task}):\n{classification_report(y_test, y_pred)}")

    return vectorizer, imputer, grid_search.best_estimator_


def predict_label(model, text, vectorizer, imputer):
    """Predicts the label of a single text entry."""
    preprocessed_text = preprocess_text(text)
    sentiment_features = extract_sentiment_features(text)

    text_tfidf = vectorizer.transform([preprocessed_text])
    text_combined = pd.concat([pd.DataFrame(text_tfidf.toarray()), pd.DataFrame([sentiment_features])], axis=1)
    text_imputed = imputer.transform(text_combined)
    return model.predict(text_imputed)[0]

def predict_from_excel(model, excel_file, output_column, vectorizer, imputer):
    """Loads an Excel file and applies predictions."""
    df = pd.read_csv(excel_file)
    df.columns = df.columns.str.lower()
    if 'text' not in df.columns:
        raise ValueError("Excel file must contain a 'text' column.")

    df['Processed_text'] = df['text'].apply(preprocess_text)
    df[output_column] = df['Processed_text'].apply(lambda x: predict_label(model, x, vectorizer, imputer))
    df.rename(columns={'text': 'Text'}, inplace=True)
    return df


if __name__ == "__main__":
    df = process_data()
    method = int(input("How to train the model.\n1. Binary\n 2. Multiclass\n Your Response:"))
    if(method == 1):
        vec, imp, model = train_data(df.copy(),"binary")
    elif(method == 2):
        vec, imp, model = train_data(df.copy(),"multiclass")
    excel_file = "/en_test_without_labels.csv"
    df1 = predict_from_excel(model, "/en_test_without_labels.csv", "Tag", vec, imp)
    if(method == 1):
        df1['Tag'] = df1['Tag'].map({1 : 'Hope', 0 : 'Not Hope' }).fillna(-1).astype(str)
    elif(method == 2):
        df1['Tag'] = df1['Tag'].map({0 : 'Not Hope', 1 : 'Generalized Hope' , 2 : 'Realistic Hope', 3: 'Unrealistic Hope' , 4 : 'Sarcasm' }).fillna(-1).astype(str)
    del df1['Processed_text']
    df1.to_csv(excel_file, index=False)
    print(f"Predictions saved to en_test_without_labels.csv")
    print("Successfully Executed")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


How to train the model.
1. Binary
 2. Multiclass
 Your Response:2
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best Parameters for multiclass: {'classifier__max_depth': None, 'classifier__n_estimators': 200}
Best Score for multiclass: 0.8855445337241866
Accuracy (multiclass): 0.7000955109837631
Classification Report (multiclass):
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       482
           1       0.52      0.61      0.56       232
           2       0.32      0.08      0.13       102
           3       0.51      0.35      0.42        91
           4       0.91      0.90      0.91       140

    accuracy                           0.70      1047
   macro avg       0.61      0.56      0.57      1047
weighted avg       0.67      0.70      0.67      1047

Predictions saved to en_test_without_labels.csv
Successfully Executed
