In [1]:
import pandas as nd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
df = pd.read_csv('Books.csv')

In [4]:
df['PoD'] = df['PoD'].map({'Yes': 1, 'No': 0})

In [5]:
df['Subjects'] = df['Subjects'].str.split(',')

In [6]:
X = df.drop('PoD', axis=1)
y = df['PoD']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
categorical_features = ['Publisher', 'Format', 'International Edition', 'Ex-Library', 'Bought at']
numerical_features = ['Year']

In [9]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])


In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

In [11]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [12]:
clf.fit(X_train, y_train)

In [13]:
y_pred = clf.predict(X_test)

In [14]:
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         2

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

Accuracy: 1.0


In [19]:
feature_importance = clf.named_steps['classifier'].feature_importances_
feature_names = (preprocessor.named_transformers_['num'].get_feature_names_out().tolist() +
                 preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out().tolist())

feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\nTop 10 most important features:")
print(feature_importance_df.head(10))


Top 10 most important features:
              feature  importance
0                Year    0.304249
17        x0_Springer    0.140495
34      x4_Webuybooks    0.116265
20       x1_Paperback    0.043171
2        x0_Cambridge    0.041199
19       x1_Hardcover    0.040841
28     x4_Blackwell’s    0.039698
15  x0_Self-published    0.037767
13       x0_Princeton    0.034952
32           x4_Local    0.022476


In [20]:
mlb = MultiLabelBinarizer()
subjects_encoded = pd.DataFrame(mlb.fit_transform(df['Subjects']),
                                columns=mlb.classes_,
                                index=df.index)

In [21]:
print("\nTop 10 most common subjects:")
print(subjects_encoded.sum().sort_values(ascending=False).head(10))


Top 10 most common subjects:
Phil         27
M            18
 EMP         12
 Classics    11
 Prob         4
Econ          4
 AlgGeo       3
Pol           3
 NT           2
 Logic        2
dtype: int64
