In [140]:
#import
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,ConfusionMatrixDisplay,RocCurveDisplay,classification_report,recall_score


## Load Dataset

In [142]:
#Load data
data = pd.read_csv('../data/processed_text.csv')
data.head()

Unnamed: 0,id,cuisine,ingredients,ingredient_text,ingredient_clean_text
0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom...",romaine lettuce black olives grape tomatoes ga...,romaine lettuce black olive grape tomato garli...
1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma...",plain flour ground pepper salt tomatoes ground...,plain flour ground pepper salt tomato ground b...
2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...",eggs pepper salt mayonaise cooking oil green c...,egg pepper salt mayonaise cook oil green chili...
3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']",water vegetable oil wheat salt,water vegetable oil wheat salt
4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay...",black pepper shallots cornflour cayenne pepper...,black pepper shallot cornflour cayenne pepper ...


## Features and Target

In [210]:

X = data['ingredient_clean_text']

cuisines = ['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
            'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
            'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
            'irish', 'korean', 'moroccan', 'russian']


cuisine_mapping = {cuisine: i for i, cuisine in enumerate(cuisines)}

print(cuisine_mapping)
y = data['cuisine'].map(cuisine_mapping)

{'greek': 0, 'southern_us': 1, 'filipino': 2, 'indian': 3, 'jamaican': 4, 'spanish': 5, 'italian': 6, 'mexican': 7, 'chinese': 8, 'british': 9, 'thai': 10, 'vietnamese': 11, 'cajun_creole': 12, 'brazilian': 13, 'french': 14, 'japanese': 15, 'irish': 16, 'korean': 17, 'moroccan': 18, 'russian': 19}


## Baseline model

In [206]:

y.value_counts(normalize=True)

cuisine
6     0.197063
7     0.161865
1     0.108614
3     0.075502
8     0.067205
14    0.066526
12    0.038870
10    0.038694
15    0.035777
0     0.029542
5     0.024865
17    0.020868
11    0.020742
18    0.020642
9     0.020214
2     0.018982
16    0.016770
4     0.013225
19    0.012294
13    0.011741
Name: proportion, dtype: float64

In [148]:
#Train-test Split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((29830,), (29830,), (9944,), (9944,))

## Logistic Regression with TF-IDF Vectorization

In [150]:
logr_tfidf_pipeline =Pipeline(
    [
        ('tf-idf',TfidfVectorizer()),
        ('logr',LogisticRegression(random_state=42,max_iter=1000))
    ]
    
)


logr_tfidf_pipeline.fit(X_train, y_train)

# Predict on the test data
predictions = logr_tfidf_pipeline.predict(X_test)

# Predict on the test data
predictions = logr_tfidf_pipeline.predict(X_test)

# Calculate accuracy on training and test set
logr_train_accuracy = logr_tfidf_pipeline.score(X_train, y_train)
logr_test_accuracy = logr_tfidf_pipeline.score(X_test, y_test)

# Calculate misclassification rates
logr_train_misclassification_rate = 1 - logr_train_accuracy
logr_test_misclassification_rate = 1 - logr_test_accuracy

print(f'Training Accuracy: {logr_train_accuracy:.2f}')
print(f'Testing Accuracy: {logr_test_accuracy:.2f}')
print(f'Training Misclassification Rate: {logr_train_misclassification_rate:.2f}')
print(f'Testing Misclassification Rate: {logr_test_misclassification_rate:.2f}')
# Calculate recall and F1 scores
logr_recall = recall_score(y_test, predictions, average='weighted')
logr_f1 = f1_score(y_test, predictions, average='weighted')

print(f'Recall (Weighted Average): {logr_recall:.2f}')
print(f'F1 Score (Weighted Average): {logr_f1:.2f}')

# Display classification report for detailed per-class metrics
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Training Accuracy: 0.82
Testing Accuracy: 0.78
Training Misclassification Rate: 0.18
Testing Misclassification Rate: 0.22
Recall (Weighted Average): 0.78
F1 Score (Weighted Average): 0.77

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.66      0.71       294
           1       0.69      0.82      0.75      1080
           2       0.72      0.54      0.62       189
           3       0.86      0.90      0.88       751
           4       0.90      0.66      0.76       131
           5       0.67      0.44      0.53       247
           6       0.78      0.90      0.84      1960
           7       0.90      0.92      0.91      1610
           8       0.78      0.86      0.82       668
           9       0.64      0.41      0.50       201
          10       0.78      0.78      0.78       385
          11       0.77      0.54      0.64       206
          12       0.76      0.70      0.73       386
          13       0.74      0.

## Logistic Regression with Count Vectorization

In [152]:
logr_cvec_pipeline =Pipeline(
    [
        ('cvec',CountVectorizer()),
        ('logr',LogisticRegression(random_state=42,max_iter=1000))
    ]
    
)


logr_cvec_pipeline.fit(X_train, y_train)

# Predict on the test data
predictions = logr_cvec_pipeline.predict(X_test)

# Calculate accuracy on training and test set
logr_train_accuracy = logr_cvec_pipeline.score(X_train, y_train)
logr_test_accuracy = logr_cvec_pipeline.score(X_test, y_test)

# Calculate misclassification rates
logr_train_misclassification_rate = 1 - logr_train_accuracy
logr_test_misclassification_rate = 1 - logr_test_accuracy

print(f'Training Accuracy: {logr_train_accuracy:.2f}')
print(f'Testing Accuracy: {logr_test_accuracy:.2f}')
print(f'Training Misclassification Rate: {logr_train_misclassification_rate:.2f}')
print(f'Testing Misclassification Rate: {logr_test_misclassification_rate:.2f}')
# Calculate recall and F1 scores
logr_recall = recall_score(y_test, predictions, average='weighted')
logr_f1 = f1_score(y_test, predictions, average='weighted')

print(f'Recall (Weighted Average): {logr_recall:.2f}')
print(f'F1 Score (Weighted Average): {logr_f1:.2f}')

# Display classification report for detailed per-class metrics
print("\nClassification Report:")
print(classification_report(y_test, predictions))


Training Accuracy: 0.86
Testing Accuracy: 0.78
Training Misclassification Rate: 0.14
Testing Misclassification Rate: 0.22
Recall (Weighted Average): 0.78
F1 Score (Weighted Average): 0.78

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.71      0.74       294
           1       0.72      0.80      0.76      1080
           2       0.66      0.59      0.62       189
           3       0.85      0.88      0.87       751
           4       0.82      0.69      0.75       131
           5       0.63      0.48      0.55       247
           6       0.81      0.88      0.84      1960
           7       0.90      0.92      0.91      1610
           8       0.81      0.83      0.82       668
           9       0.55      0.48      0.52       201
          10       0.77      0.78      0.77       385
          11       0.68      0.56      0.62       206
          12       0.75      0.68      0.71       386
          13       0.68      0.

## Grid Search Logistic Regression with Count Vectorization

In [162]:
##Model 
#Logistic Regression
log_pipeline = Pipeline(
    [
        ('cvec',CountVectorizer()),
        ('logr',LogisticRegression(random_state=42,max_iter=1000))
    ]
    
)
params_list ={

    'cvec__ngram_range': [(1, 1), (1, 2)],  # Unigrams and bigrams
    'cvec__max_df': [ 0.85, 1.0],      # Ignore very frequent words
    'cvec__min_df': [1, 2],              # Ignore very infrequent words
    'cvec__max_features': [None, 5000],  # Limit on the number of features

    'logr__C': [0.01, 0.1, 1.0],        # Regularization strength
    'logr__penalty': ['l2'],                # L2 regularization (Ridge)
    'logr__solver': ['lbfgs']  # Solvers suitable for small datase
}
gs_logr = GridSearchCV(log_pipeline,param_grid=params_list,n_jobs=-1)
gs_logr.fit(X_train,y_train)
print(f' Best parameters:{gs_logr.best_params_}')
print(f' Best score:{gs_logr.best_score_}')



 Best parameters:{'cvec__max_df': 0.85, 'cvec__max_features': None, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 1), 'logr__C': 1.0, 'logr__penalty': 'l2', 'logr__solver': 'lbfgs'}
 Best score:0.7745893395910157


In [164]:
# Predict on the test data
predictions = gs_logr.predict(X_test)

# Predict on the test data
predictions = gs_logr.predict(X_test)

# Calculate accuracy on training and test set
logr_train_accuracy = gs_logr.score(X_train, y_train)
logr_test_accuracy = gs_logr.score(X_test, y_test)

# Calculate misclassification rates
logr_train_misclassification_rate = 1 - logr_train_accuracy
logr_test_misclassification_rate = 1 - logr_test_accuracy

print(f'Training Accuracy: {logr_train_accuracy:.2f}')
print(f'Testing Accuracy: {logr_test_accuracy:.2f}')
print(f'Training Misclassification Rate: {logr_train_misclassification_rate:.2f}')
print(f'Testing Misclassification Rate: {logr_test_misclassification_rate:.2f}')
# Calculate recall and F1 scores
logr_recall = recall_score(y_test, predictions, average='weighted')
logr_f1 = f1_score(y_test, predictions, average='weighted')

print(f'Recall (Weighted Average): {logr_recall:.2f}')
print(f'F1 Score (Weighted Average): {logr_f1:.2f}')

# Display classification report for detailed per-class metrics
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Training Accuracy: 0.86
Testing Accuracy: 0.78
Training Misclassification Rate: 0.14
Testing Misclassification Rate: 0.22
Recall (Weighted Average): 0.78
F1 Score (Weighted Average): 0.78

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.70      0.73       294
           1       0.72      0.80      0.76      1080
           2       0.65      0.58      0.62       189
           3       0.85      0.88      0.86       751
           4       0.82      0.69      0.75       131
           5       0.63      0.48      0.55       247
           6       0.81      0.88      0.84      1960
           7       0.90      0.92      0.91      1610
           8       0.81      0.82      0.82       668
           9       0.56      0.48      0.51       201
          10       0.77      0.78      0.77       385
          11       0.68      0.57      0.62       206
          12       0.75      0.68      0.71       386
          13       0.69      0.

## XBBoost with TF-IDF Vectorization

In [154]:
#XG_Boost

xbg_pipeline =Pipeline(
    [
        ('tf-idf',TfidfVectorizer()),
        ('xgb',XGBClassifier(n_estimators = 100,
                   max_depth = 1))
    ]
    
)

xbg_pipeline.fit(X_train.values, y_train.values)

print(xbg_pipeline.score(X_train, y_train))
print(xbg_pipeline.score(X_test, y_test))

0.7366074421723098
0.7201327433628318


In [156]:
# Predict on the test data
predictions = xbg_pipeline.predict(X_test)

# Predict on the test data
predictions = xbg_pipeline.predict(X_test)

# Calculate accuracy on training and test set
logr_train_accuracy = xbg_pipeline.score(X_train, y_train)
logr_test_accuracy = xbg_pipeline.score(X_test, y_test)

# Calculate misclassification rates
logr_train_misclassification_rate = 1 - logr_train_accuracy
logr_test_misclassification_rate = 1 - logr_test_accuracy

print(f'Training Accuracy: {logr_train_accuracy:.2f}')
print(f'Testing Accuracy: {logr_test_accuracy:.2f}')
print(f'Training Misclassification Rate: {logr_train_misclassification_rate:.2f}')
print(f'Testing Misclassification Rate: {logr_test_misclassification_rate:.2f}')
# Calculate recall and F1 scores
logr_recall = recall_score(y_test, predictions, average='weighted')
logr_f1 = f1_score(y_test, predictions, average='weighted')

print(f'Recall (Weighted Average): {logr_recall:.2f}')
print(f'F1 Score (Weighted Average): {logr_f1:.2f}')

# Display classification report for detailed per-class metrics
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Training Accuracy: 0.74
Testing Accuracy: 0.72
Training Misclassification Rate: 0.26
Testing Misclassification Rate: 0.28
Recall (Weighted Average): 0.72
F1 Score (Weighted Average): 0.71

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.60      0.67       294
           1       0.57      0.76      0.65      1080
           2       0.67      0.35      0.46       189
           3       0.85      0.85      0.85       751
           4       0.84      0.62      0.71       131
           5       0.76      0.30      0.43       247
           6       0.66      0.88      0.75      1960
           7       0.87      0.89      0.88      1610
           8       0.75      0.82      0.78       668
           9       0.65      0.17      0.27       201
          10       0.75      0.69      0.72       385
          11       0.63      0.49      0.55       206
          12       0.80      0.68      0.73       386
          13       0.67      0.

## Grid Search XBBoost with TF-IDF Vectorization

In [166]:

# Define the parameter grid
param_grid = {
    'tf-idf__max_df': [0.5, 0.75],
    'tf-idf__ngram_range': [(1, 1), (1, 2)],  # unigrams or bigrams
    'xgb__n_estimators': [50, 100],
    'xgb__max_depth': [1, 3]
}


# Setup the grid search
grid_search = GridSearchCV(xbg_pipeline,param_grid= param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model and parameters
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Predict and evaluate the model
predictions = grid_search.predict(X_test)


Best parameters: {'tf-idf__max_df': 0.75, 'tf-idf__ngram_range': (1, 2), 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
Best cross-validation score: 0.76


In [172]:
# Predict on the test data
predictions = grid_search.predict(X_test)

# Predict on the test data
predictions = grid_search.predict(X_test)

# Calculate accuracy on training and test set
logr_train_accuracy = grid_search.score(X_train, y_train)
logr_test_accuracy = grid_search.score(X_test, y_test)

# Calculate misclassification rates
logr_train_misclassification_rate = 1 - logr_train_accuracy
logr_test_misclassification_rate = 1 - logr_test_accuracy

print(f'Training Accuracy: {logr_train_accuracy:.2f}')
print(f'Testing Accuracy: {logr_test_accuracy:.2f}')
print(f'Training Misclassification Rate: {logr_train_misclassification_rate:.2f}')
print(f'Testing Misclassification Rate: {logr_test_misclassification_rate:.2f}')
# Calculate recall and F1 scores
logr_recall = recall_score(y_test, predictions, average='weighted')
logr_f1 = f1_score(y_test, predictions, average='weighted')

print(f'Recall (Weighted Average): {logr_recall:.2f}')
print(f'F1 Score (Weighted Average): {logr_f1:.2f}')

# Display classification report for detailed per-class metrics
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Training Accuracy: 0.86
Testing Accuracy: 0.77
Training Misclassification Rate: 0.14
Testing Misclassification Rate: 0.23
Recall (Weighted Average): 0.77
F1 Score (Weighted Average): 0.76

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.68      0.73       294
           1       0.66      0.80      0.72      1080
           2       0.69      0.47      0.56       189
           3       0.86      0.90      0.88       751
           4       0.88      0.66      0.75       131
           5       0.72      0.43      0.54       247
           6       0.75      0.89      0.81      1960
           7       0.89      0.91      0.90      1610
           8       0.79      0.83      0.81       668
           9       0.62      0.37      0.46       201
          10       0.79      0.78      0.78       385
          11       0.74      0.60      0.66       206
          12       0.80      0.70      0.75       386
          13       0.79      0.

## Predictions on Test Data

In [187]:
##Predictions on test set
test = pd.read_csv('../data/processed_text_test.csv')
test.head()
X_test = test['ingredient_clean_text'] 
predictions = logr_cvec_pipeline.predict(X_test)

#  DataFrame with the required format
submission = pd.DataFrame({
    'id': test['id'],      
    'cuisine': predictions       
})

# Save the DataFrame to a CSV file
submission.to_csv('../data/submission.csv', index=False)



In [190]:
predictions = grid_search.predict(X_test)

#  DataFrame with the required format
submission = pd.DataFrame({
    'id': test['id'],      
    'cuisine': predictions       
})

# Save the DataFrame to a CSV file
submission.to_csv('../data/submission.csv', index=False)
