In [None]:
!pip install keras
!pip install scikeras[tensorflow]
!conda install pytorch torchvision torchaudio cudatoolkit 10.2 -c pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikeras[tensorflow]
  Downloading scikeras-0.10.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.10.0
/bin/bash: conda: command not found


In [None]:
# Imports
import pandas as pd
import numpy as np
# from flair.models import TextClassifier
# from flair.data import Sentence

# Importing of various classification tools that were tested
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier


# Used classification tools
from sklearn.model_selection import cross_val_score

# To save Sentiment Analysis
import gzip
import pickle

# Mount Google Drive so the dataset paths under /content/drive/MyDrive used by
# the cells below are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# The review table is stored as a gzip-compressed pickle (despite the .json
# suffix); the product table with the `awesomeness` labels is plain JSON.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with gzip.open('/content/drive/MyDrive/12/CDs_and_Vinyl/train/training_data_with_sentiments.json', 'rb') as fh:
    training_data = pickle.load(fh)

scores = pd.read_json('/content/drive/MyDrive/12/CDs_and_Vinyl/train/product_training.json')

In [None]:
# Keep an uncompressed pickle copy of the review table on Drive.
training_data.to_pickle('/content/drive/MyDrive/12/CDs_and_Vinyl/train/training_data_uncompressed.pkl')


In [None]:
def _parse_vote(vote):
    """Convert a raw vote (NaN/None, a '1,234'-style string, or a number) to an int."""
    if pd.isna(vote):
        return 0
    if isinstance(vote, str):
        return int(vote.replace(',', ''))
    return int(vote)


def _sentiment_stats(reviews, column, with_count=False):
    """Per-product mean and population std of `column`, optionally with the non-null count."""
    per_product = reviews.groupby('asin')[column].apply(list)
    stats = pd.DataFrame(index=per_product.index)
    stats[f'{column}_avg'] = per_product.apply(lambda values: np.mean(values))
    stats[f'{column}_std'] = per_product.apply(lambda values: np.std(values))
    if with_count:
        stats['number_of_reviews'] = per_product.apply(lambda values: pd.Series(values).count())
    return stats


def create_feature_vector(training_data, scores):
    """Build the model input table from review-level data.

    Parameters
    ----------
    training_data : pandas.DataFrame
        One row per review. Columns read: ``style``, ``reviewText``, ``vote``,
        ``reviewerID``, ``verified``, ``asin``, ``summary_sentiment``,
        ``text_sentiment`` and ``unixReviewTime``.
        The frame is modified in place: ``format`` and ``review_length`` are
        added and ``vote`` is rewritten as integers.
    scores : pandas.DataFrame
        Product table with an ``asin`` column. It is inner-merged last, so any
        other columns it carries (e.g. ``awesomeness``) end up in the result.

    Returns
    -------
    pandas.DataFrame
        ``asin``, the sentiment / vote / verification aggregates,
        ``avgReviewTime``, ``format``, ``verified`` (share of verified reviews
        per product) and the columns of ``scores``.

    Notes
    -----
    * ``avgReviewTime`` is NOT aggregated per product, so merging it yields one
      output row per review (the per-product aggregates are repeated on each).
      NOTE(review): confirm this is intended - rows of the same product can
      land in different cross-validation folds.
    * ``format`` codes are assigned in order of first appearance within this
      call, so the same format may get different codes on different datasets.
    * ``review_length`` and the per-reviewer ``verification_ratio`` are
      computed but are not part of the returned table.
    """
    # --- Per-review preprocessing (written onto the caller's frame) ---------
    # .get() avoids a KeyError for style dicts that have no "Format:" entry.
    training_data["format"] = training_data["style"].apply(
        lambda style: style.get("Format:") if isinstance(style, dict) else None
    )
    # Missing review text can be None or NaN (a float); neither has a length.
    training_data["review_length"] = training_data["reviewText"].apply(
        lambda text: len(text) if isinstance(text, str) else 0
    )
    training_data["vote"] = training_data["vote"].apply(_parse_vote)

    # --- Share of verified reviews per reviewer ------------------------------
    reviewers = training_data[['reviewerID', 'verified']].copy()
    reviewers['verification_ratio'] = reviewers['verified'].map(int)
    reviewers = reviewers.drop('verified', axis=1).groupby('reviewerID').mean()
    # From here on `training_data` is a new frame; the caller's is untouched.
    training_data = pd.merge(training_data, reviewers, on='reviewerID', how='left')

    # --- Per-product sentiment statistics ------------------------------------
    summary_stats = _sentiment_stats(training_data, 'summary_sentiment', with_count=True)
    text_stats = _sentiment_stats(training_data, 'text_sentiment')

    vote_weighted_stats = pd.DataFrame({
        'asin': training_data['asin'],
        'vote_weighted_summary_sentiment': training_data['summary_sentiment'] * training_data['vote'],
        'vote_weighted_text_sentiment': training_data['text_sentiment'] * training_data['vote'],
    }).groupby('asin').mean()

    verified_weighted_stats = pd.DataFrame({
        'asin': training_data['asin'],
        'verified_text_sentiment': training_data['verified'] * training_data['text_sentiment'],
        'verified_summary_sentiment': training_data['verified'] * training_data['summary_sentiment'],
        'verified_vote_weighted_text_sentiment':
            training_data['verified'] * training_data['text_sentiment'] * training_data['vote'],
        'verified_vote_weighted_summary_sentiment':
            training_data['verified'] * training_data['summary_sentiment'] * training_data['vote'],
    }).groupby('asin').mean()

    # --- Review time, z-scored over all reviews (kept per review, see Notes) --
    review_time = training_data['unixReviewTime']
    time_weighted_stats = pd.DataFrame({
        'asin': training_data['asin'],
        'avgReviewTime': (review_time - review_time.mean()) / review_time.std(),
    })

    # --- Integer-coded format of each product's first review ------------------
    # Work on an explicit copy so the assignment below does not write into a
    # slice of `training_data` (the source of the SettingWithCopyWarning).
    product_format = training_data[['asin', 'format']].copy()
    format_codes = {fmt: code for code, fmt in enumerate(product_format['format'].unique())}
    product_format['format'] = product_format['format'].map(format_codes)
    product_format = product_format.drop_duplicates(subset='asin')

    verified_review_ratio = training_data[['asin', 'verified']].groupby('asin').mean()

    # --- Assemble (merge order fixes the feature column order) ----------------
    aggregatedProductFeatures = (
        pd.merge(summary_stats, text_stats, on='asin')
        .merge(vote_weighted_stats, on='asin')
        .merge(verified_weighted_stats, on='asin')
        .merge(time_weighted_stats, on='asin')
        .merge(product_format, on='asin')
        .merge(verified_review_ratio, on='asin')
    )
    for std_column in ('text_sentiment_std', 'summary_sentiment_std'):
        aggregatedProductFeatures[std_column] = aggregatedProductFeatures[std_column].fillna(0)
    aggregatedProductFeatures = aggregatedProductFeatures.merge(scores, on='asin')
    return aggregatedProductFeatures

In [None]:
# Build the training matrix: every column except the product id and the label
# is used as a feature.
aggregatedProductFeatures = create_feature_vector(training_data, scores)
y = aggregatedProductFeatures['awesomeness']
X = aggregatedProductFeatures.drop(columns=['asin', 'awesomeness'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_format.format = product_format.format.map(formats)


In [None]:
# Same feature pipeline applied to the earlier sentiment annotations
# (reported as "Kaludi's" in the comparison cell).
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with gzip.open('/content/drive/MyDrive/12/CDs_and_Vinyl/train/data_with_SA.json', 'rb') as fh:
    old_data = pickle.load(fh)

old_features = create_feature_vector(old_data, scores)
old_y = old_features['awesomeness']
old_X = old_features.drop(columns=['asin', 'awesomeness'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_format.format = product_format.format.map(formats)


In [None]:
# Load the test-3 reviews (gzip-compressed pickle) and its product list, then
# run them through the same feature pipeline as the training data.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with gzip.open('/content/drive/MyDrive/12/CDs_and_Vinyl/test3/test3_with_sentiments.json', 'rb') as fh:
    test_3_data = pickle.load(fh)
product_test_3 = pd.read_json('/content/drive/MyDrive/12/CDs_and_Vinyl/test3/product_test.json')

features = create_feature_vector(test_3_data, product_test_3)
# Only the product id is removed; everything else is fed to the model.
test_3_X = features.drop(columns=['asin'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_format.format = product_format.format.map(formats)


In [None]:
# Save the test-3 review table as an uncompressed pickle (note: written under
# the train/ folder). create_feature_vector has already run on this frame, so
# it now carries the added `format` / `review_length` columns and integer votes.
test_3_data.to_pickle('/content/drive/MyDrive/12/CDs_and_Vinyl/train/test_3_data.pkl')

In [None]:
# COMPARING FLAIR AND KALUDI- SLIDE 3 & 7
# One bagged decision-tree ensemble, scored with 10-fold F1 on each feature set.
bg = BaggingClassifier(
    DecisionTreeClassifier(max_depth=9, max_features=0.7),
    n_estimators=100,
    max_samples=0.25,
    max_features=1.0,
)

# Features built from `training_data`.
f1_scores = cross_val_score(bg, X, y, scoring="f1", cv=10, n_jobs=-1, verbose=1)
print(f"Flair's: {np.mean(f1_scores)}")

# Features built from `old_data`.
f1_scores = cross_val_score(bg, old_X, old_y, scoring="f1", cv=10, n_jobs=-1, verbose=1)
print(f"Kaludi's: {np.mean(f1_scores)}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  2.3min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Flair's: 0.7479841291392126


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  2.3min remaining:  1.6min


Kaludi's: 0.6692727613234261


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.7min finished


In [None]:
# GRIDSEARCH OF KNN- SLIDE 4
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score

# Three neighbourhood sizes x two vote weightings = 6 candidates.
parameter_space = {
    'n_neighbors': [5, 100, 200],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier()
clf = GridSearchCV(
    knn,
    parameter_space,
    scoring=make_scorer(f1_score),
    cv=3,
    n_jobs=-1,
    verbose=3,
)
clf.fit(X, y)

# Best parameter set
print('Best parameters found:\n', clf.best_params_)

# One line per candidate: mean F1 over the folds and a two-standard-deviation band.
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for params, mean, std in zip(clf.cv_results_['params'], means, stds):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters found:
 {'n_neighbors': 200, 'weights': 'distance'}
0.618 (+/-0.032) for {'n_neighbors': 5, 'weights': 'uniform'}
0.617 (+/-0.032) for {'n_neighbors': 5, 'weights': 'distance'}
0.642 (+/-0.031) for {'n_neighbors': 100, 'weights': 'uniform'}
0.643 (+/-0.030) for {'n_neighbors': 100, 'weights': 'distance'}
0.651 (+/-0.033) for {'n_neighbors': 200, 'weights': 'uniform'}
0.653 (+/-0.031) for {'n_neighbors': 200, 'weights': 'distance'}


In [None]:
# RANDOMIZED SEARCH OF DECISION TREE- SLIDE 4
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, make_scorer

# Only n_iter=7 of the 2 * 3 * 4 * 4 = 96 combinations are sampled.
# NOTE(review): the last `max_features` entry is the int 1 (a single feature),
# not the float 1.0 (all features) - confirm which was meant.
param_distributions = {
    'max_depth': [5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [.1, .5, .7, 1],
}
dt = DecisionTreeClassifier()
f1_scorer = make_scorer(f1_score)
random_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_distributions,
    n_iter=7,
    scoring=f1_scorer,
    cv=3,
    random_state=42,
)
random_search.fit(X, y)

best_params = random_search.best_params_
best_score = random_search.best_score_
print(f"Best parameters: {best_params}")
print(f"Best F1 score: {best_score}")


Best parameters: {'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 0.7, 'max_depth': 10}
Best F1 score: 0.7224957226555073


In [None]:
# GRIDSEARCH WITH NEURAL NETS- SLIDE 4
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# 3 * 3 * 2 * 3 * 2 = 108 candidates, each fitted on 3 folds.
parameter_space = {
    'hidden_layer_sizes': [(32, 32, 32), (64, 64, 64), (128, 128, 128)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01, 0.1, 1],
    'learning_rate': ['constant', 'adaptive'],
}

mlp = MLPClassifier(max_iter=100)
# Rank candidates by F1, like every other search / cross-validation cell in
# this notebook. Without `scoring`, GridSearchCV uses the estimator's own
# score (accuracy for classifiers), so the chosen parameters were tuned for a
# different metric than the one they are later compared on.
clf = GridSearchCV(mlp, parameter_space, scoring="f1", n_jobs=-1, cv=3)
# The search only sees 80% of the rows; the held-out 20% (X_test / y_test) is
# not used in this cell. A fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf.fit(X_train, y_train)
print('Best parameters found:\n', clf.best_params_)

# One line per candidate: mean F1 over the folds and a two-standard-deviation band.
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters found:
 {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (128, 128, 128), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.601 (+/-0.075) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (32, 32, 32), 'learning_rate': 'constant', 'solver': 'sgd'}
0.762 (+/-0.013) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (32, 32, 32), 'learning_rate': 'constant', 'solver': 'adam'}
0.656 (+/-0.071) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (32, 32, 32), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.760 (+/-0.027) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (32, 32, 32), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.601 (+/-0.048) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (64, 64, 64), 'learning_rate': 'constant', 'solver': 'sgd'}
0.809 (+/-0.038) for {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (64, 64, 64), 'learning_rate': 'constant', 'solver': 'adam'}
0.6



In [None]:
# BASIC LOGISTIC REGRESSION WITH FLAIR- SLIDE 5
# 10-fold cross-validated F1 for a plain logistic regression.
clf = LogisticRegression(C=4.28)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Per-fold scores, then their average.
print(f1_scores)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


[0.73456299 0.73405119 0.73854831 0.73911818 0.77828765 0.73807748
 0.72556275 0.74015515 0.73662935 0.73439033]
0.7399383372123085


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.1min finished


In [None]:
# BASIC RANDOMFOREST WITH FLAIR- SLIDE 5
# 10-fold cross-validated F1 for a random forest.
clf = RandomForestClassifier(
    n_estimators=185,
    max_depth=9,
    max_features=0.7,
    max_samples=0.8,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Per-fold scores, then their average.
print(f1_scores)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  8.7min remaining:  5.8min


[0.72940984 0.73545922 0.73317762 0.76208795 0.79089447 0.76001864
 0.73992483 0.72680963 0.7430821  0.75059279]
0.7471457079498535


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 14.0min finished


In [None]:
# BASIC SVM WITH FLAIR- SLIDE 5
# 10-fold cross-validated F1 for an RBF-kernel SVM.
clf = SVC(kernel='rbf', C=0.1, gamma=1)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Per-fold scores, then their average.
print(f1_scores)
print(f1_scores.mean())
####################################################
# *Compute time took too long*


In [None]:
# BASIC KNN WITH FLAIR- SLIDE 5
from sklearn.neighbors import KNeighborsClassifier

# 10-fold cross-validated F1 for k-nearest neighbours with the settings the
# KNN grid search reported as best (200 neighbours, distance weighting).
clf = KNeighborsClassifier(n_neighbors=200, weights='distance')
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   56.9s remaining:   37.9s


0.6567748187596336


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


In [None]:
# BASIC NEURAL NETS WITH FLAIR- SLIDE 5
# Parameters copied from the "Best parameters found" output of the MLP grid search.
best_params = {
    'activation': 'tanh',
    'alpha': 0.01,
    'hidden_layer_sizes': (128, 128, 128),
    'learning_rate': 'adaptive',
    'solver': 'adam',
}
mlp_best = MLPClassifier(max_iter=100, **best_params)
f1_scores = cross_val_score(
    mlp_best, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

print("Mean cross-validation score: ", f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 28.0min remaining: 18.7min


Mean cross-validation score:  0.7038565712408165


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 38.6min finished


In [None]:
# BASIC DECISION TREE WITH FLAIR- SLIDE 5
# 10-fold cross-validated F1 for a single tree with the randomized-search parameters.
clf = DecisionTreeClassifier(
    max_depth=10,
    max_features=.7,
    min_samples_leaf=5,
    min_samples_split=10,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    6.3s remaining:    4.2s


0.721651415685223


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.8s finished


In [None]:
# LOGISTIC REGRESSION WITH FLAIR + BAGGING- SLIDE 6
# 100 logistic regressions, each fitted on a 25% sample of the rows.
clf = BaggingClassifier(
    LogisticRegression(C=4.28),
    n_estimators=100,
    max_samples=0.25,
    max_features=1.0,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Per-fold scores, then their average.
print(f1_scores)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 16.7min remaining: 11.2min


[0.73454121 0.73452029 0.73535395 0.7389692  0.7788288  0.75049516
 0.73746492 0.73667733 0.72786726 0.72994232]
0.7404660441287559


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 27.2min finished


In [None]:
# RANDOMFOREST WITH FLAIR + BAGGING- SLIDE 6
# 100 random forests (185 trees each), each fitted on a 25% sample of the rows.
clf = BaggingClassifier(
    RandomForestClassifier(
        n_estimators=185,
        max_depth=9,
        max_features=0.7,
        max_samples=0.8,
    ),
    n_estimators=100,
    max_samples=0.25,
    max_features=1.0,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Per-fold scores, then their average.
print(f1_scores)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 158.1min remaining: 105.4min


[0.80309096 0.71913954 0.69558421 0.7174121  0.71685454 0.70859308
 0.69712593 0.70072274 0.68984812 0.68916298]
0.7137534203787574


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 256.0min finished


In [None]:
# KNN WITH FLAIR + BAGGING- SLIDE 6
from sklearn.neighbors import KNeighborsClassifier

# 100 k-NN models (k=200, default uniform weights), each on a 25% sample of the rows.
clf = BaggingClassifier(
    KNeighborsClassifier(n_neighbors=200),
    n_estimators=100,
    max_samples=0.25,
    max_features=1.0,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Per-fold scores, then their average.
print(f1_scores)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


[0.65339982 0.67143514 0.66473    0.67150934 0.72308234 0.64835635
 0.66162112 0.65729028 0.66185767 0.6505248 ]
0.6663806850891638


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 63.3min finished


In [None]:
# NEURAL NETS WITH FLAIR + BAGGING- SLIDE 6
# Parameters copied from the "Best parameters found" output of the MLP grid search.
best_params = {
    'activation': 'tanh',
    'alpha': 0.01,
    'hidden_layer_sizes': (128, 128, 128),
    'learning_rate': 'adaptive',
    'solver': 'adam',
}
clf = BaggingClassifier(
    MLPClassifier(max_iter=100, **best_params),
    n_estimators=100,
    max_samples=0.25,
    max_features=1.0,
)
# Perform cross-validation with 10 folds
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

print("Mean cross-validation score: ", f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
# DECISION TREE WITH FLAIR + BAGGING- SLIDE 6
# 100 tuned decision trees, each fitted on a 25% sample of the rows.
clf = BaggingClassifier(
    DecisionTreeClassifier(
        max_depth=10,
        max_features=.7,
        min_samples_leaf=5,
        min_samples_split=10,
    ),
    n_estimators=100,
    max_samples=0.25,
    max_features=1.0,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  2.5min remaining:  1.7min


0.7529036132822367


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.0min finished


In [None]:
# LOGISTIC REGRESSION WITH FLAIR + BOOSTING- SLIDE 7
# The base learner is passed positionally, as in the other boosting cells.
# The `base_estimator=` keyword was renamed to `estimator=` in newer
# scikit-learn releases and later removed, so spelling it out ties this cell
# to old versions; the first positional argument works on both.
clf = AdaBoostClassifier(
    LogisticRegression(C=4.28),
    n_estimators=50,
    learning_rate=1.0,
    algorithm='SAMME.R',
    random_state=None,
)
f1_scores = cross_val_score(clf, X, y, cv=10, scoring="f1", n_jobs=-1, verbose=1)

# Per-fold scores, then their average.
print(f1_scores)
print(np.mean(f1_scores))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  2.2min remaining:  1.4min


[0.72595252 0.73128639 0.73229809 0.73380815 0.77445968 0.73740273
 0.72254964 0.73630778 0.7270296  0.71018757]
0.7331282166853124


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.2min finished


In [None]:
# RANDOM FOREST WITH FLAIR + BOOSTING- SLIDE 7
# AdaBoost over 50 rounds, each round fitting a 185-tree random forest.
clf = AdaBoostClassifier(
    RandomForestClassifier(
        n_estimators=185,
        max_depth=9,
        max_features=0.7,
        max_samples=0.8,
    ),
    n_estimators=50,
    learning_rate=1.0,
    algorithm='SAMME.R',
    random_state=None,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Per-fold scores, then their average.
print(f1_scores)
print(f1_scores.mean())

In [None]:
# NEURAL NETS WITH FLAIR + BOOSTING- SLIDE 7
# NOTE(review): AdaBoost re-weights samples between rounds, so the base
# estimator's fit() has to accept `sample_weight` - confirm MLPClassifier does
# in the installed scikit-learn version (this cell has no recorded output).
best_params = {
    'activation': 'tanh',
    'alpha': 0.01,
    'hidden_layer_sizes': (128, 128, 128),
    'learning_rate': 'adaptive',
    'solver': 'adam',
}
clf = AdaBoostClassifier(
    MLPClassifier(max_iter=100, **best_params),
    n_estimators=50,
    learning_rate=1.0,
    algorithm='SAMME.R',
    random_state=None,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

print("Mean cross-validation score: ", f1_scores.mean())

In [None]:
# DECISION TREE WITH FLAIR + BOOSTING- SLIDE 7
# AdaBoost over 50 rounds of the tuned decision tree.
clf = AdaBoostClassifier(
    DecisionTreeClassifier(
        max_depth=10,
        max_features=.7,
        min_samples_leaf=5,
        min_samples_split=10,
    ),
    n_estimators=50,
    learning_rate=1.0,
    algorithm='SAMME.R',
    random_state=None,
)
f1_scores = cross_val_score(
    clf, X, y,
    scoring="f1",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
print(f1_scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  4.4min remaining:  2.9min


0.684876835462896


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.9min finished


In [None]:
# Fitting the Best Performing Model
# Bagged decision trees had the highest mean F1 among the cells above, so that
# configuration is refitted on all training rows.
best_clf = BaggingClassifier(
    DecisionTreeClassifier(
        max_depth=10,
        max_features=.7,
        min_samples_leaf=5,
        min_samples_split=10,
    ),
    n_estimators=100,
    max_samples=0.25,
    max_features=1.0,
)
best_clf.fit(X, y)

# Persist the fitted ensemble. The file holds a binary pickle even though its
# name ends in .json.
with open('/content/drive/MyDrive/12/CDs_and_Vinyl/train/best_clf.json', 'wb') as model_file:
    pickle.dump(best_clf, model_file)

In [None]:
# Reload the fitted ensemble saved by the previous cell (a pickle, despite the
# .json name). NOTE: pickle.load can execute arbitrary code - only open files
# you trust.
with open('/content/drive/MyDrive/12/CDs_and_Vinyl/train/best_clf.json', 'rb') as model_file:
    best_clf = pickle.load(model_file)

In [None]:
# Predict a label for every row of the test-3 feature table and store it next
# to the features so it can be grouped by product afterwards.
features['review_predictions'] = best_clf.predict(test_3_X)

In [None]:
# Collapse the row-level predictions to one label per product: average them
# per asin and round the average to the nearest whole number.
predictions = features.groupby('asin')['review_predictions'].mean().round()

# Turn the Series into an (asin, awesomeness) table with an integer label.
predictions_df = predictions.rename('awesomeness').reset_index()
predictions_df = predictions_df.astype({'awesomeness': int})

# Write one JSON record per product.
predictions_df.to_json('/content/drive/MyDrive/12/CDs_and_Vinyl/train/predictions.json', orient='records')