In [1]:
import pandas as pd
data = pd.read_csv('labeled-sa-dataset.csv')
data.head()

Unnamed: 0,bangla,classes,labels
0,যত ভয়ের ব্যাপারটি নিয়ে মজা করছি তত ভয় লাগছে,fear,2
1,দ্য স্যাক্রেড ব্যান্ড অফ থিবস জনের দুর্ধর...,sadness,4
2,আকবর আলীর ভবিষ্যতে কি লেখা আছে জানিনা তবে তার...,joy,3
3,আমাদের সমাজ শেষ করে দিছে যারা সেবন করে তারাও...,fear,2
4,আমার ক্ষেত্রে ও এমন হয়েছিল যেখানে ফেল যাওয়ার...,sadness,4


In [7]:
data.shape

(5667, 3)

In [8]:
data.isnull().sum()

bangla     0
classes    0
labels     0
dtype: int64

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score

In [3]:
X = data.bangla
y = data.labels

train_data, test_data, train_labels, test_labels = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [4]:
lr_model = LogisticRegressionCV(class_weight='balanced', max_iter=400)
dt_model = DecisionTreeClassifier(class_weight='balanced')
rf_model = RandomForestClassifier(class_weight='balanced')
mnb_model = MultinomialNB()
svm_model = SVC(class_weight='balanced')
knn_model = KNeighborsClassifier()
ada_model = AdaBoostClassifier()
model_names = ['Logistic Regression','Decision Tree','Random Forest','Naive Bayes', 'SVM', 'KNN', 'AdaBoost']
ml_models = [lr_model,dt_model,rf_model,mnb_model, svm_model, knn_model, ada_model]

In [5]:
def model_train(model, train_data, train_labels):
	pipe = Pipeline([
        ('FE', TfidfVectorizer(ngram_range=(1,2), use_idf = True, tokenizer=lambda x: x.split())),  
        ('Transformer', TfidfTransformer()),  
        ('M', model)
    ])
	pipe.fit(train_data, train_labels)
	return pipe

trained_ml_models = []
for i in ml_models:
    tt = model_train(i,train_data, train_labels)
    trained_ml_models.append(tt)

In [6]:
ac_list = []
pr_list = []
re_list = []
f1_list = []


def predict_results(md, x_test, y_test, name):
    pred_y = md.predict(x_test)
    ac_list.append((round(accuracy_score(y_test,pred_y),4)*100))
    pr_list.append((round(precision_score(y_test, pred_y, average='weighted'),4)*100))
    re_list.append((round(recall_score(y_test, pred_y, average='weighted'),4)*100))
    f1_list.append((round(f1_score(y_test, pred_y, average='weighted'),4)*100))

for i in range(len(trained_ml_models)):
    md = trained_ml_models[i]
    name = model_names[i]
    predict_results(md, test_data, test_labels, name)

performance_matrix = pd.DataFrame({'Accuracy':ac_list,'Precision':pr_list,
                                'Recall':re_list,'F1 Score':f1_list},
                                index =model_names)
performance_matrix

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,58.82,58.55,58.82,58.31
Decision Tree,37.48,38.4,37.48,37.85
Random Forest,50.18,52.48,50.18,48.77
Naive Bayes,48.24,59.24,48.24,44.39
SVM,52.29,64.91,52.29,51.57
KNN,46.38,46.46,46.38,45.79
AdaBoost,42.06,53.6,42.06,40.58
