In [351]:
import re
import pandas as pd
import nltk
import numpy as np

In [352]:
df = pd.read_csv('C:/Users/HP Notebook/Desktop/sentiment analysis/data/sentiment_sample.csv')

df.head(2)

Unnamed: 0,Waktu,Alamat Email,Nama Siswa,Tanggal,Hari,Program,Materi,Background Siswa,Kualitas Materi,Feedback Materi,Kualitas Mentor,Feedback Mentor,Sesi Curhat,Batch,Sentiment_Materi,Sentiment_Mentor
0,2024-01-22 15:03:06,nafidanuruhidayati@gmail.com,Nafida Nurhidayati,2024-01-22,Day 1,Python,Effective Data Visualization,Linier,4,sangat mudah dimengerti,4,"Penjelasan mudah dipahami, sangat membantu tem...",,Batch 1,Positive,Positive
1,2024-01-22 15:03:22,riskydevandra1453@gmail.com,Risky Devandra Hartana,2024-01-22,Day 1,Python,Effective Data Visualization,Linier,4,,4,,,Batch 1,Neutral,Neutral


In [353]:
df['Sentiment_Mentor'].value_counts()

Positive    347
Neutral     137
Negative     16
Name: Sentiment_Mentor, dtype: int64

# **Data Preprocessing**
---

**Data Cleaning**

In [354]:
def cleaning(data):
    """
    This function cleans the input DataFrame by handling missing values,
    removing duplicates, and converting the 'Feedback Mentor' column to lowercase.

    Parameters:
    -----------
    data: pd.DataFrame.
        Input data containing feedback and other columns.

    Returns:
    --------
    data_mentor: pd.DataFrame.
        Cleaned DataFrame containing relevant columns.
    """
    # Check data not given feedback
    nulls_before = data['Feedback Mentor'].isnull().sum()
    print('Null before nan change to neutral', nulls_before)

    # Replace NaN data with 'Neutral'
    columns_to_fill = ['Feedback Mentor']
    for column in columns_to_fill:
        data[column].fillna('Neutral', inplace=True)

    nulls_after = data['Feedback Mentor'].isnull().sum()
    print('Null before nan change to neutral', nulls_after)

    print('Validation Nan:', (data['Feedback Mentor'] == 'Neutral').sum())

    # Count duplicate rows
    print('Number of duplicate rows:', data.duplicated().sum())

    # Drop duplicate rows
    data = data.drop_duplicates(keep='first')

    # Lowercase 'Feedback Mentor' column
    data['Feedback Mentor'] = data['Feedback Mentor'].str.lower()

    return data

In [355]:
data_mentor = cleaning(data = df)

Null before nan change to neutral 46
Null before nan change to neutral 0
Validation Nan: 46
Number of duplicate rows: 0


In [356]:
# Select relevant columns
data_mentor = data_mentor[['Feedback Mentor', 'Sentiment_Mentor']]

In [357]:
# sanicheck
data_mentor.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
Feedback Mentor,"penjelasan mudah dipahami, sangat membantu tem...",neutral,neutral,menyenangkan,"baik dan asik, tapi mungkin saat penyampaian m...","sudah bagus, hanya perjelas istilah yang masih...",saran penyampaian materinya lebih pelan dan ko...,"mentornya asik, penjelasannya mudah dipahami, ...","kak, mungkin bisa pakai google colab untuk sel...",pemateri sangat asik menyampaikan isi materi h...,...,hari ini mentor menjelaskan sangat mudah dipahami,hebat,mantap,keren,makasih banyak kak,mentor memberikan studi kasus untuk dikerjakan...,bagus good,👌,..,nice
Sentiment_Mentor,Positive,Neutral,Neutral,Positive,Positive,Neutral,Negative,Negative,Neutral,Positive,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Neutral,Neutral,Positive


In [358]:
def remove_digits(text):
    """
    Remove digits from text

    Parameters:
    -----------
    text: str
        Input text

    Return:
    -------
    str
        Text after removing digits
    """
    if isinstance(text, str):
        return re.sub(r'\d', '', text)
    else:
        return text

In [359]:
# Applying the function to 'mentor feedback' columns
data_mentor['Feedback Mentor'] = data_mentor['Feedback Mentor'].apply(remove_digits)

In [360]:
# validation digits
if not data_mentor['Feedback Mentor'].astype(str).str.contains('\d').any():
    print("Columns'Feedback Mentor' do not contain digits.")
else:
    print("Column 'Feedback Mentor' has digit")

Columns'Feedback Mentor' do not contain digits.


In [361]:
def remove_punctuation(text):
    """
    Function for removing punctuation from text

    Parameters:
    -----------
    text: str
        Input text

    Return:
    -------
    str
        Text after removing punctuation
    """
    if isinstance(text, str):
        # Use regular expression to remove punctuation
        text_without_punctuation = re.sub(r'[^\w\s]', '', text)
        return text_without_punctuation
    else:
        return text

In [362]:
# Applying the function to 'Feedback Mentor'
data_mentor['Feedback Mentor'] = data_mentor['Feedback Mentor'].apply(remove_punctuation)

In [363]:
# validation punctuation
if not isinstance(data_mentor['Feedback Mentor'], str):
    print("Columns'Feedback Mentor' do not contain punctuation.")
else:
    print("Column 'Feedback Mentor' has punctuation")

Columns'Feedback Mentor' do not contain punctuation.


## **Split data**
---

In [364]:
from sklearn.model_selection import train_test_split

def split_data(data, target_column):
    """
    Function for splitting data into train, test, and validation sets.

    Parameters:
    -----------
    data: pd.DataFrame
        Input data for splitting.

    Returns:
    --------
    X_train : pd.DataFrame
        Predictor data for the training set.

    X_test : pd.DataFrame
        Predictor data for the testing set.

    X_valid : pd.DataFrame
        Predictor data for the validation set.

    y_train : pd.Series
        Target data for the training set.

    y_test : pd.Series
        Target data for the testing set.

    y_valid : pd.Series
        Target data for the validation set.
    """
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=[target_column]),
                                                        data[target_column],
                                                        test_size=0.2,
                                                        random_state=42)

    # Further split the test set into test and validation sets
    X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test,
                                                        test_size=0.5,
                                                        random_state=42)

    return X_train, X_test, X_valid, y_train, y_test, y_valid

In [365]:
# Call the function with your DataFrame
X_train_mentor, X_test_mentor, X_valid_mentor, y_train_mentor, y_test_mentor, y_valid_mentor = split_data(data = data_mentor, target_column = 'Sentiment_Mentor')

In [366]:
# sanicheck shape of feedback mentor
print('Shape X_train mentor', X_train_mentor.shape)
print('Shape X_valid mentor', X_test_mentor.shape)
print('Shape X_test mentor', X_valid_mentor.shape)

Shape X_train mentor (400, 1)
Shape X_valid mentor (50, 1)
Shape X_test mentor (50, 1)


Change data target to numeric

In [367]:
# change to array
y_train_mentor = np.array(y_train_mentor)
y_valid_mentor = np.array(y_valid_mentor)
y_test_mentor = np.array(y_test_mentor)

In [368]:
from sklearn.preprocessing import LabelEncoder

# initialize
ohe = LabelEncoder()

y_train_mentor = ohe.fit_transform(y_train_mentor)
y_valid_mentor = ohe.fit_transform(y_valid_mentor)
y_test_mentor = ohe.fit_transform(y_test_mentor)

In [369]:
y_train_mentor

array([2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 0, 1, 2, 1, 2, 2,
       2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1,
       0, 0, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2,
       2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2,
       2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 1, 1, 0, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1,
       2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2,
       2, 0, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,

In [370]:
y_valid_mentor

array([2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1,
       1, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1,
       2, 1, 2, 2, 2, 2])

In [371]:
y_test_mentor

array([1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 0,
       2, 2, 2, 2, 2, 1])

## **Vectorizing Text**
---

What step will we take?
1. TF-IDF
2. Bag of Words / Count Vectorizer

### **TF-IDF**
---

In [372]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

tfidf.fit(X_train_mentor['Feedback Mentor'])

In [373]:
train_tfidf_mentor = tfidf.transform(X_train_mentor['Feedback Mentor'])
train_tfidf_mentor

<400x290 sparse matrix of type '<class 'numpy.float64'>'
	with 1066 stored elements in Compressed Sparse Row format>

In [374]:
valid_tfidf_mentor = tfidf.transform(X_valid_mentor['Feedback Mentor'])
valid_tfidf_mentor

<50x290 sparse matrix of type '<class 'numpy.float64'>'
	with 161 stored elements in Compressed Sparse Row format>

In [375]:
test_tfidf_mentor = tfidf.transform(X_test_mentor['Feedback Mentor'])
test_tfidf_mentor

<50x290 sparse matrix of type '<class 'numpy.float64'>'
	with 101 stored elements in Compressed Sparse Row format>

### **Bag of Words / Count Vectorizer**
---

Count Vectorizing resulting `min 0` values, some words that are fequent could be have high value

In [376]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()

bow.fit(X_train_mentor['Feedback Mentor'])

In [377]:
train_bow_mentor = bow.transform(X_train_mentor['Feedback Mentor'])
train_bow_mentor

<400x290 sparse matrix of type '<class 'numpy.int64'>'
	with 1066 stored elements in Compressed Sparse Row format>

In [378]:
valid_bow_mentor = bow.transform(X_valid_mentor['Feedback Mentor'])
valid_bow_mentor

<50x290 sparse matrix of type '<class 'numpy.int64'>'
	with 161 stored elements in Compressed Sparse Row format>

In [379]:
test_bow_mentor = bow.transform(X_test_mentor['Feedback Mentor'])
test_bow_mentor

<50x290 sparse matrix of type '<class 'numpy.int64'>'
	with 101 stored elements in Compressed Sparse Row format>

# **Modeling**
---

In [380]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

**Baseline**

In [381]:
baseline_model = DummyClassifier(strategy="most_frequent")

baseline_model_cv = cross_val_score(estimator=baseline_model,
                                    X=X_train_mentor,
                                    y=y_train_mentor,
                                    cv=5).mean()

baseline_model_cv

0.7

In [382]:
class Modeling():
    """
    Class for prediction
    """
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def logit(self):
        """
        Function for create logistic regression model
        """
        logit = OneVsRestClassifier(LogisticRegression())
        logit.fit(self.X, self.y)
        y_pred = logit.predict(self.X)
        return y_pred

    def svm(self):
        """
        Function for create svm model
        """
        svm = OneVsRestClassifier(SVC())
        svm.fit(self.X, self.y)
        y_pred = svm.predict(self.X)
        return y_pred

    def random_forest(self):
        """
        Function for create random forest model
        """
        rf = OneVsRestClassifier(RandomForestClassifier(max_depth=5))
        rf.fit(self.X, self.y)
        y_pred = rf.predict(self.X)
        return y_pred

    def naive_bayes(self):
        """
        Function for create naive bayes model
        """
        nb = MultinomialNB()
        nb.fit(self.X, self.y)
        y_pred = nb.predict(self.X)
        return y_pred
    
    def adaboost(self):
        """
        Function for create Adaboost model
        """
        adaboost = AdaBoostClassifier()
        adaboost.fit(self.X, self.y)
        y_pred = adaboost.predict(self.X)
        return y_pred
    
    def knn(self):
        """
        Function for create K-Nearest Neighbors (KNN) model
        """
        knn = KNeighborsClassifier()
        knn.fit(self.X, self.y)
        y_pred = knn.predict(self.X)
        return y_pred
    
    def decision_tree(self):
        """
        Function for create Decision Tree model
        """
        dt = DecisionTreeClassifier()
        dt.fit(self.X, self.y)
        y_pred = dt.predict(self.X)
        return y_pred

In [383]:
def evaluate_model(y_true, y_pred, model_name):
    """
    Function to evaluate model using macro evaluation

    Parameters:
    ----------
    y_true : pd.Series
        Data target

    y_pred : array
        Data result after prediction

    Returns:
    --------
    metrics : pd.DataFrame
         DataFrame containing evaluation metrics
    """
    # dict metric
    metrics = {}
    metrics['Model'] = model_name
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['precision'] = precision_score(y_true, y_pred, average='macro')
    metrics['recall'] = recall_score(y_true, y_pred, average='macro')
    metrics['f1_score'] = f1_score(y_true, y_pred, average='macro')

    return pd.DataFrame(metrics, index=[0])

def display_metric(X, y):
    """
    Function to display evaluation metrics for logistic regression, SVM, Naive Bayes, Random Forest, AdaBoost, KNN, and Decision Tree models

    Parameters:
    ----------
    X : pd.DataFrame
        Data predictors

    y : pd.Series
        Data target

    Returns:
    --------
    combined_metrics : pd.DataFrame
        Combined DataFrame containing evaluation metrics for each model
    """
    # Initialize model
    model = Modeling(X, y)

    # Logit model
    y_pred_logit = model.logit()

    # SVM model
    y_pred_svm = model.svm()

    # Naive Bayes model
    y_pred_nb = model.naive_bayes()

    # Random forest model
    y_pred_forest = model.random_forest()

    # Adaboost model
    y_pred_ada = model.adaboost()
    
    # KNN model
    y_pred_knn = model.knn()
    
    # Decision Tree model
    y_pred_dt = model.decision_tree()

    # Evaluation
    metric_logit = evaluate_model(y, y_pred_logit, model_name='Logistic Regression')
    metric_svm = evaluate_model(y, y_pred_svm, model_name='SVM')
    metric_nb = evaluate_model(y, y_pred_nb, model_name='Naive Bayes')
    metric_forest = evaluate_model(y, y_pred_forest, model_name='Random Forest')
    metric_ada = evaluate_model(y, y_pred_ada, model_name='Adaboost')
    metric_knn = evaluate_model(y, y_pred_knn, model_name='K-Nearest Neighbors')
    metric_dt = evaluate_model(y, y_pred_dt, model_name='Decision Tree')

    # Concatenate metrics into a single DataFrame
    combined_metrics = pd.concat([metric_logit,
                                  metric_svm,
                                  metric_nb,
                                  metric_forest,
                                  metric_ada,
                                  metric_knn,
                                  metric_dt], ignore_index=True)

    return combined_metrics

step-by-step:
1. Training data
2. Validation
3. Re-train
4. Testing data

## **TF-IDF Model**
---

**Training Data**

In [384]:
# call for data train
eval_traintdidf_mentor = display_metric(X = train_tfidf_mentor,
                                       y = y_train_mentor)

eval_traintdidf_mentor

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Logistic Regression,0.8125,0.589697,0.472816,0.49042
1,SVM,0.975,0.984203,0.973154,0.978387
2,Naive Bayes,0.84,0.5816,0.506455,0.523193
3,Random Forest,0.715,0.903553,0.487907,0.491347
4,Adaboost,0.8125,0.929577,0.716152,0.75948
5,K-Nearest Neighbors,0.785,0.51248,0.568043,0.518891
6,Decision Tree,0.985,0.989041,0.985387,0.987182


**Validation Data**

In [385]:
# Define the Decision Tree model
model_decision_tree = DecisionTreeClassifier()

# Define parameter grid for Decision Tree
params_grid_decision_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Perform RandomizedSearchCV for Decision Tree
random_search_decision_tree = RandomizedSearchCV(
                                    model_decision_tree,
                                    param_distributions=params_grid_decision_tree,
                                    n_iter=10,
                                    cv=skf,
                                    scoring='accuracy',
                                    random_state=42
                                )

random_search_decision_tree.fit(valid_tfidf_mentor, y_valid_mentor)
best_params_decision_tree = random_search_decision_tree.best_params_
print("Best Parameters for Decision Tree:", best_params_decision_tree)



Best Parameters for Decision Tree: {'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 10, 'criterion': 'entropy'}


In [386]:
# validation
model_tree = OneVsRestClassifier(DecisionTreeClassifier(**best_params_decision_tree))
model_tree.fit(valid_tfidf_mentor, y_valid_mentor)

y_predtfidf_mentor_valid = model_tree.predict(valid_tfidf_mentor)

In [387]:
evaluate_model(y_valid_mentor, y_predtfidf_mentor_valid, model_name='Decision Tree')

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Decision Tree,0.78,0.588652,0.433333,0.443003


**Re-Train Data**

In [388]:
# validation
model_tree = OneVsRestClassifier(DecisionTreeClassifier(**best_params_decision_tree))
model_tree.fit(train_tfidf_mentor, y_train_mentor)

y_predtfidf_mentor_train = model_tree.predict(train_tfidf_mentor)

In [389]:
evaluate_model(y_train_mentor, y_predtfidf_mentor_train, model_name='Decision Tree')

Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Decision Tree,0.8125,0.929577,0.716152,0.75948


**Test Data**

In [390]:
# validation
model_tree = OneVsRestClassifier(DecisionTreeClassifier(**best_params_decision_tree))
model_tree.fit(test_tfidf_mentor, y_test_mentor)

y_predtfidf_mentor_test = model_tree.predict(test_tfidf_mentor)

In [391]:
evaluate_model(y_test_mentor, y_predtfidf_mentor_test, model_name='Decision Tree')

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Decision Tree,0.8,0.585366,0.5,0.509259


**Classification Report**

In [392]:
# Classification Report
target_names = ['Class 0', 'Class 1', 'Class 2']
report = classification_report(y_train_mentor,
                               y_predtfidf_mentor_train,
                               target_names=target_names)

print("Classification Report Train:\n", report)

Classification Report Train:
               precision    recall  f1-score   support

     Class 0       1.00      0.82      0.90        11
     Class 1       1.00      0.33      0.50       109
     Class 2       0.79      1.00      0.88       280

    accuracy                           0.81       400
   macro avg       0.93      0.72      0.76       400
weighted avg       0.85      0.81      0.78       400



In [393]:
# Classification Report
target_names = ['Class 0', 'Class 1', 'Class 2']
report = classification_report(y_test_mentor,
                               y_predtfidf_mentor_test,
                               target_names=target_names)

print("Classification Report Test:\n", report)

Classification Report Test:
               precision    recall  f1-score   support

     Class 0       0.00      0.00      0.00         1
     Class 1       1.00      0.50      0.67        18
     Class 2       0.76      1.00      0.86        31

    accuracy                           0.80        50
   macro avg       0.59      0.50      0.51        50
weighted avg       0.83      0.80      0.77        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## **Bag of Words / Count Vectorizer**
---

In [700]:
# call for data train
eval_bow_mentor = display_metric(X = train_bow_mentor,
                                 y = y_train_mentor)

eval_bow_mentor

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Logistic Regression,0.87,0.938931,0.735734,0.805463
1,SVM,0.965,0.977368,0.962789,0.969562
2,Naive Bayes,0.87,0.934025,0.846581,0.870622
3,Random Forest,0.715,0.903553,0.487907,0.491347
4,Adaboost,0.81,0.913226,0.658604,0.720353
5,K-Nearest Neighbors,0.8225,0.527101,0.580297,0.542254
6,Decision Tree,0.985,0.989041,0.985387,0.987182


**Validation Data**

In [701]:
# Define the Decision Tree model
# model_decision_tree = DecisionTreeClassifier()
from sklearn.ensemble import GradientBoostingClassifier
model_gradient_boosting = GradientBoostingClassifier()


# Define parameter grid for Decision Tree
params_grid_decision_tree = {
    # 'criterion': ['gini', 'entropy','friedman_mse'],
    'criterion': ['entropy','friedman_mse','square_error'],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8, 10],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'n_estimators': [100, 200, 300], 
}

# Perform RandomizedSearchCV for Decision Tree
random_search_decision_tree = RandomizedSearchCV(
                                    model_gradient_boosting,
                                    param_distributions=params_grid_decision_tree,
                                    n_iter = 20,
                                    cv = skf,
                                    scoring ='accuracy',
                                    random_state = 21
                                )

random_search_decision_tree.fit(valid_bow_mentor, y_valid_mentor)
best_params_decision_tree = random_search_decision_tree.best_params_
print("Best Parameters for Decision Tree:", best_params_decision_tree)





80 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP Notebook\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HP Notebook\anaconda3\Lib\site-packages\sklearn\ensemble\_gb.py", line 533, in fit
    raise ValueError(
ValueError: criterion='square_error' is not supported. Use criterion='friedman_mse' or 'squared_error' instead, as trees should use a squared error criterion in Gradient Boosting.

--------------------------------------------------------------------------------
45 fits failed with the f

Best Parameters for Decision Tree: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 10, 'criterion': 'friedman_mse'}




In [702]:
# validation
model_tree = OneVsRestClassifier(GradientBoostingClassifier(**best_params_decision_tree))
model_tree.fit(valid_bow_mentor, y_valid_mentor)

y_predbow_mentor_valid = model_tree.predict(valid_bow_mentor)



In [703]:
evaluate_model(y_valid_mentor, y_predbow_mentor_valid, model_name='Gradient Boosting')

Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Gradient Boosting,0.86,0.873932,0.764815,0.810159


**Re-Train Data**

In [704]:
# validation
model_tree = OneVsRestClassifier(GradientBoostingClassifier(**best_params_decision_tree))
model_tree.fit(train_bow_mentor, y_train_mentor)

y_predbow_mentor_train = model_tree.predict(train_bow_mentor)



In [705]:
evaluate_model(y_train_mentor, y_predbow_mentor_train, model_name='Gradient Boosting')

Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Gradient Boosting,0.935,0.937711,0.930597,0.931799


**Test Data**

In [706]:
# validation
model_tree = OneVsRestClassifier(GradientBoostingClassifier(**best_params_decision_tree))
model_tree.fit(test_bow_mentor, y_test_mentor)

y_predbow_mentor_test = model_tree.predict(test_bow_mentor)



In [707]:
evaluate_model(y_test_mentor, y_predbow_mentor_test, model_name='Gradient Boosting')

Unnamed: 0,Model,accuracy,precision,recall,f1_score
0,Gradient Boosting,0.84,0.931624,0.851852,0.866667


**Classification Report**

In [708]:
# Classification Report
target_names = ['Class 0', 'Class 1', 'Class 2']
report = classification_report(y_train_mentor,
                               y_predbow_mentor_train,
                               target_names=target_names)

print("Classification Report Train:\n", report)

Classification Report Train:
               precision    recall  f1-score   support

     Class 0       1.00      0.91      0.95        11
     Class 1       0.83      0.95      0.89       109
     Class 2       0.98      0.93      0.95       280

    accuracy                           0.94       400
   macro avg       0.94      0.93      0.93       400
weighted avg       0.94      0.94      0.94       400



In [709]:
# Classification Report
target_names = ['Class 0', 'Class 1', 'Class 2']
report = classification_report(y_test_mentor,
                               y_predbow_mentor_test,
                               target_names=target_names)

print("Classification Report Test:\n", report)

Classification Report Test:
               precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00         1
     Class 1       1.00      0.56      0.71        18
     Class 2       0.79      1.00      0.89        31

    accuracy                           0.84        50
   macro avg       0.93      0.85      0.87        50
weighted avg       0.87      0.84      0.83        50

