In [7]:
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
df = pd.read_csv('train_data.csv')

In [8]:
def normalize_persian(text):
    # Define a function to normalize Persian characters
    persian_digits = '۰۱۲۳۴۵۶۷۸۹٦'
    english_digits = '01234567896'
    yeh_characters = ['ی', 'ي', 'ے', 'ئ', 'ى']

    text = text.translate(str.maketrans(persian_digits, english_digits))
    for char in yeh_characters:
        text = text.replace(char, 'ی')

    return text

In [9]:
def preprocess_text(text):
    # Normalize Persian characters
    text = normalize_persian(text)

    # Convert to lowercase
    text = text.lower()

    # Add more preprocessing steps as needed

    return text

In [10]:
# Preprocess the text data
df['Sentence'] = df['Sentence'].apply(preprocess_text)
encoder = OrdinalEncoder()
# Fit and transform the 'emotion' column
df['emotion'] = encoder.fit_transform(df[['Emotion']])

# Extract features using TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Sentence'])

In [11]:
param_grids = {
    'Decision Tree': {'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 15, None]},
}

In [12]:
for model_name, model_class in [
    ('Decision Tree', DecisionTreeClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing Decision Tree model...
Decision Tree - Best Hyperparameters: {'criterion': 'gini', 'max_depth': None}


In [13]:
from sklearn.metrics import accuracy_score, f1_score
model = DecisionTreeClassifier(criterion='gini', max_depth=None)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

Average Accuracy: 0.53
Average F1-score: 0.53


In [14]:
param_grids = {
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15, None]},
}

In [15]:
for model_name, model_class in [
    ('Random Forest', RandomForestClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing Random Forest model...
Random Forest - Best Hyperparameters: {'max_depth': None, 'n_estimators': 100}


In [16]:
from sklearn.metrics import accuracy_score, f1_score
model = RandomForestClassifier(n_estimators=100, max_depth=None)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

Average Accuracy: 0.58
Average F1-score: 0.57
