In [12]:
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier

In [None]:
df = pd.read_csv('train_data.csv')
df

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
plot_df = df.Emotion.value_counts()
plot_df.plot(kind="bar")

In [None]:
uniqueChars = set(''.join(df['Sentence']))
print(len(uniqueChars))
print(uniqueChars)

In [13]:
def normalize_persian(text):
    # Define a function to normalize Persian characters
    persian_digits = '۰۱۲۳۴۵۶۷۸۹٦'
    english_digits = '01234567896'
    yeh_characters = ['ی', 'ي', 'ے', 'ئ', 'ى']

    text = text.translate(str.maketrans(persian_digits, english_digits))
    for char in yeh_characters:
        text = text.replace(char, 'ی')

    return text

In [14]:
def preprocess_text(text):
    # Normalize Persian characters
    text = normalize_persian(text)

    # Convert to lowercase
    text = text.lower()

    # Add more preprocessing steps as needed

    return text

In [None]:
uniqueChars = set(''.join(df['Sentence']))
print(len(uniqueChars))
print(uniqueChars)

In [20]:
# Preprocess the text data
df['Sentence'] = df['Sentence'].apply(preprocess_text)
encoder = OrdinalEncoder()
# Fit and transform the 'emotion' column
df['emotion'] = encoder.fit_transform(df[['Emotion']])

# Extract features using TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Sentence'])
print(X_tfidf.shape)

(4924, 10159)


In [5]:
param_grids = {
    'Decision Tree': {'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 15, None]},
}

In [6]:
for model_name, model_class in [
    ('Decision Tree', DecisionTreeClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing Decision Tree model...
Decision Tree - Best Hyperparameters: {'criterion': 'gini', 'max_depth': None}


In [7]:
from sklearn.metrics import accuracy_score, f1_score
model = DecisionTreeClassifier(criterion='gini', max_depth=None)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

Average Accuracy: 0.53
Average F1-score: 0.53


In [8]:
param_grids = {
    'Random Forest': {'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15, None]},
}

In [9]:
for model_name, model_class in [
    ('Random Forest', RandomForestClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing Random Forest model...
Random Forest - Best Hyperparameters: {'max_depth': None, 'n_estimators': 100}


In [10]:
from sklearn.metrics import accuracy_score, f1_score
model = RandomForestClassifier(n_estimators=100, max_depth=None)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

Average Accuracy: 0.58
Average F1-score: 0.58


In [11]:
param_grids = {
    'AdaBoost': {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 0.5, 1.0]},
}

In [12]:
for model_name, model_class in [
    ('AdaBoost', AdaBoostClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing AdaBoost model...
AdaBoost - Best Hyperparameters: {'learning_rate': 1.0, 'n_estimators': 50}


In [13]:
from sklearn.metrics import accuracy_score, f1_score
model = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

Average Accuracy: 0.47
Average F1-score: 0.45


In [14]:
param_grids = {
    'Extra Tree': {'n_estimators': [50, 100, 150], 'criterion': ['gini', 'entropy']},
}

In [15]:
for model_name, model_class in [
    ('Extra Tree', ExtraTreesClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing Extra Tree model...
Extra Tree - Best Hyperparameters: {'criterion': 'gini', 'n_estimators': 100}


In [16]:
from sklearn.metrics import accuracy_score, f1_score
model = ExtraTreesClassifier(criterion='gini', n_estimators=100)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

Average Accuracy: 0.59
Average F1-score: 0.59


In [21]:
param_grids = {
    'LightGBM': {'num_leaves': [31, 63, 127], 'learning_rate': [0.1, 0.2, 0.3]}
}

In [22]:
for model_name, model_class in [
    ('LightGBM', LGBMClassifier)
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7474
[LightGBM] [Info] Number of data points in the train set: 3939, number of used features: 317
[LightGBM] [Info] Start training from score -1.582883
[LightGBM] [Info] Start training from score -2.665554
[LightGBM] [Info] Start training from score -1.213923
[LightGBM] [Info] Start training from score -1.360977
[LightGBM] [Info] Start training from score -1.762489
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7514
[LightGBM] [Info] Number of data points in the train set: 3939, number of used features: 318
[LightGBM] [Info] Start training from score -1.582883
[Ligh

In [23]:
from sklearn.metrics import accuracy_score, f1_score
model = LGBMClassifier(learning_rate=0.1, num_leaves=31)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7474
[LightGBM] [Info] Number of data points in the train set: 3939, number of used features: 317
[LightGBM] [Info] Start training from score -1.582883
[LightGBM] [Info] Start training from score -2.665554
[LightGBM] [Info] Start training from score -1.213923
[LightGBM] [Info] Start training from score -1.360977
[LightGBM] [Info] Start training from score -1.762489
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7514
[LightGBM] [Info] Number of data points in the train set: 3939, number of used features: 318
[LightGBM] [Info] Start 

In [24]:
param_grids = {
    'XGBoost': {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.2, 0.3]},
}

In [25]:
for model_name, model_class in [
    ('XGBoost', XGBClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing XGBoost model...
XGBoost - Best Hyperparameters: {'learning_rate': 0.3, 'max_depth': 5}


In [26]:
from sklearn.metrics import accuracy_score, f1_score
model = XGBClassifier(learning_rate=0.3, max_depth=5)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')

Average Accuracy: 0.60
Average F1-score: 0.60


In [27]:
param_grids = {
    'Gradient Boosting': {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 0.2, 0.3]},
}

In [28]:
for model_name, model_class in [
    ('Gradient Boosting', GradientBoostingClassifier),
]:
    print(f'Optimizing {model_name} model...')
    model = model_class()
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring=['accuracy', 'f1_weighted'], refit='accuracy')
    grid_search.fit(X_tfidf, df['emotion'])

    #print(f'{model_name} - Best Accuracy: {grid_search.best_score_["accuracy"]:.2f}')
    #print(f'{model_name} - Best F1-score: {grid_search.best_score_["f1_weighted"]:.2f}')
    print(f'{model_name} - Best Hyperparameters: {grid_search.best_params_}')

Optimizing Gradient Boosting model...
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Lenovo\AppData\Local\Temp\ipykernel_12176\236912663.py", line 7, in <module>
    grid_search.fit(X_tfidf, df['emotion'])
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_search.py", line 874, in fit
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_search.py", line 1388, in _run_search
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_search.py", line 821, in evaluate_candidates
    This is a method instead of a snippet in ``fit`` since it's used twice,
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Lenovo\AppData\Local\Programs\P

In [None]:
from sklearn.metrics import accuracy_score, f1_score
model = GradientBoostingClassifier(learning_rate=0.3, n_estimators=50)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, test_index in skf.split(X_tfidf, df['emotion']):
    X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = df['emotion'].iloc[train_index], df['emotion'].iloc[test_index]

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracy_scores.append(accuracy)
    f1_scores.append(f1)

print(f'Average Accuracy: {sum(accuracy_scores) / len(accuracy_scores):.2f}')
print(f'Average F1-score: {sum(f1_scores) / len(f1_scores):.2f}')