In [1]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import joblib

In [None]:
df = pd.read_csv("../Dataset/gpreviews_2.csv")

In [3]:
df.head()

Unnamed: 0,reviewId,text,score,sentiment,labels
0,7793317c-ab30-4e9b-abcd-6e3df0a5d440,Sony Liv. one of best experiences. I changed m...,1,Positive,"['user experience', 'features']"
1,c2617457-7693-45f0-9142-079b250ef22f,Horrible app honestly for such a rich company ...,0,Negative,"['user experience', 'cost', 'performance', 'fe..."
2,dc96ac81-1311-4018-acee-588b79659125,This app very good and useful. For watching Ko...,1,Positive,"['features', 'user experience']"
3,5445cb50-f16d-4a5a-8b03-66909aa33bb8,Suddenly signed out of my account for no reaso...,0,Negative,"['login problems', 'data loss', 'user experien..."
4,8d86a93f-538d-451c-9243-f364227e3976,I've found everything here to be of my liking,1,Positive,"['features', 'user experience']"


In [4]:
import ast
df["labels"] = df["labels"].apply(ast.literal_eval)

In [5]:
df = df[df['labels'].map(len) > 0]

In [6]:
texts = df["text"].tolist()
labels = df["labels"].tolist()

In [7]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

In [8]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english', min_df=5, max_df=0.8)
X = tfidf.fit_transform(texts)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

In [10]:
log_reg = LogisticRegression(solver = "saga")

xgboost = XGBClassifier(eval_metric='mlogloss', use_label_encoder = False)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight = "balanced")

voting = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('xgboost', xgboost),
    ('random_forest', random_forest)
], voting='soft', n_jobs=-1)

ensemble_model = OneVsRestClassifier(voting, n_jobs=-1)

In [11]:
ensemble_model.fit(X_train, y_train)

0,1,2
,estimator,VotingClassif...voting='soft')
,n_jobs,-1
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
y_pred = ensemble_model.predict(X_test)

In [13]:
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

Classification Report:
                  precision    recall  f1-score   support

             ads       0.95      0.86      0.90       799
            cost       0.85      0.47      0.60      2023
         crashes       0.81      0.46      0.59      1468
customer support       0.78      0.23      0.36       538
       data loss       0.76      0.26      0.39      1260
        features       0.78      0.78      0.78      4320
language support       0.91      0.24      0.38       349
  login problems       0.89      0.47      0.61       827
     performance       0.76      0.69      0.72      3360
 user experience       0.90      0.99      0.94      6805
  version issues       0.78      0.30      0.43      1327

       micro avg       0.84      0.70      0.76     23076
       macro avg       0.83      0.52      0.61     23076
    weighted avg       0.83      0.70      0.73     23076
     samples avg       0.84      0.76      0.76     23076



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
print("\nPer-label Accuracy:")
for i, label in enumerate(mlb.classes_):
    acc = accuracy_score(y_test[:, i], y_pred[:, i])
    print(f"{label}: {acc:.2f}")


Per-label Accuracy:
ads: 0.98
cost: 0.84
crashes: 0.88
customer support: 0.94
data loss: 0.87
features: 0.75
language support: 0.96
login problems: 0.94
performance: 0.77
user experience: 0.89
version issues: 0.86


In [15]:
predicted_labels = mlb.inverse_transform(y_pred)
predicted_labels[:10]

[('crashes', 'user experience'),
 ('features', 'user experience'),
 ('crashes', 'performance', 'user experience'),
 ('cost', 'user experience'),
 ('features',),
 ('crashes', 'features', 'performance', 'user experience', 'version issues'),
 ('features', 'user experience'),
 ('features', 'user experience'),
 ('crashes', 'performance', 'user experience'),
 ('user experience',)]

In [None]:
joblib.dump(ensemble_model, '../Models/ensemble_model.pkl')
joblib.dump(tfidf, '../Models/tfidf.pkl')
joblib.dump(mlb, '../Models/multilabel_binarizer.pkl')

['multilabel_binarizer.pkl']