In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

# Import Dataset

In [None]:
# Kaggle input path of the "Amazon Top 50 Bestselling Books 2009-2019" CSV.
csv_path = '/kaggle/input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
data = pd.read_csv(csv_path)
# Summary statistics of the loaded frame, shown as the cell output.
data.describe()

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

# Checking for Null Values

In [None]:
# Number of missing values in each column.
data.isna().sum()

# Exploratory Data Analysis (EDA)

**Distribution of Classes**

In [None]:
# Number of bestseller entries per genre.
# The column must be passed as `x=`: seaborn's plot variables are keyword-only,
# so the positional form `countplot('Genre', ...)` fails on current releases.
sns.countplot(x='Genre', data=data)

**Correlation Matrix**

In [None]:
# Pairwise correlation of the numeric columns only.
# The frame also holds string columns (Name, Author, Genre); since pandas 2.0
# `DataFrame.corr()` no longer drops them silently and raises instead, so the
# numeric subset is selected explicitly (this works on older pandas as well).
corr = data.select_dtypes(include='number').corr()
corr

In [None]:
# Colour-coded view of the correlation matrix computed above.
sns.heatmap(data=corr)

**Scatter Plot of Price and Reviews (For Fiction and Non-Fiction Books)**

In [None]:
# One Reviews-vs-Price scatter plot per genre (regression line disabled).
for genre in ('Fiction', 'Non Fiction'):
    genre_books = data.loc[data['Genre'] == genre]
    sns.lmplot(data=genre_books, x='Reviews', y='Price', fit_reg=False)
    plt.title('{} Books'.format(genre))
    plt.xlabel('Reviews')
    plt.ylabel('Price')
    plt.show()

**Average Rating of BestSeller Books (For Fiction and Non-Fiction Books)**

In [None]:
# Mean user rating per year, one row for Fiction and one for Non Fiction.
# The years are taken from the Fiction subset, so the table covers exactly
# the years in which at least one Fiction bestseller exists.
years = sorted(data.loc[data['Genre'] == 'Fiction', 'Year'].unique().tolist())

# (genre value as stored in the data, label written to the summary table)
genre_labels = [('Fiction', 'Fiction'), ('Non Fiction', 'Non_Fiction')]

# Collect plain records and build the frame once: `DataFrame.append` was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
rating_records = []
for year in years:
    books_in_year = data[data['Year'] == year]
    for genre, label in genre_labels:
        genre_mask = books_in_year['Genre'] == genre
        rating_records.append({
            'Year': year,
            # Mean of an empty selection is NaN, i.e. "no books that year".
            'Rating': books_in_year.loc[genre_mask, 'User Rating'].mean(),
            'Genre': label,
        })

mean_rating = pd.DataFrame(rating_records, columns=['Year', 'Rating', 'Genre'])
mean_rating

In [None]:
# Grouped bars: average rating per year, split by genre.
sns.catplot(
    data=mean_rating,
    kind='bar',
    x='Year',
    y='Rating',
    hue='Genre',
)
plt.xticks(rotation=90)
plt.title('Average Rating Per Year (Genre Wise)')


**Top 10 Authors with Books of Maximum Price**

In [None]:
# Ten authors with the highest mean book price.
# A single groupby replaces the per-author filtering loop and the row-by-row
# `.loc` growth, which was quadratic and left 'Mean Price' with object dtype.
# `sort=False` keeps authors in order of first appearance, so ties at equal
# mean price are resolved deterministically by `nlargest` (first one wins).
author_price = (
    data.groupby('Author', sort=False)['Price']
    .mean()
    .nlargest(10)
    .rename('Mean Price')
    .reset_index()
)
author_price

In [None]:
# Bar chart of the mean price for the authors selected above.
sns.barplot(data=author_price, x='Author', y='Mean Price')
plt.xticks(rotation=90)
plt.title('Authors with Books of Maximum Price')

**Histogram of Reviews and Price**

In [None]:
# Distribution of each numeric feature: density histogram with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; this `histplot` call
# is the documented equivalent of distplot's default hist + KDE output
# (density scale, KDE extended three bandwidths past the data range).
for feature in ['Reviews', 'Price']:
    sns.histplot(data[feature], kde=True, stat='density', kde_kws={'cut': 3})
    plt.title('Histogram of {}'.format(feature))
    plt.show()

**Top 10 Authors with Max BestSeller Books**

In [None]:
# The ten authors appearing most often in the bestseller list.
top_authors = data['Author'].value_counts().head(10)
sns.barplot(x=top_authors.index, y=top_authors.values, data=data)
plt.xticks(rotation=90)
plt.title('Top 10 Authors with Max BestSeller Books')
plt.show()

# Genre Prediction

In [None]:
# Prediction target: the genre label of each book.
y = data['Genre']

In [None]:
# Remaining feature columns: the two free-text columns and the target are
# dropped here; the title text is vectorised separately further down.
X = data.drop(['Name', 'Author', 'Genre'], axis=1)

In [None]:
def clean(x):
    x = re.sub('[^A-z\s]','',x)
    return x.lower()

In [None]:
import re
# Normalised book titles; `clean` is applied to every entry of the Name column.
text = data['Name'].apply(clean)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF encoding of the cleaned titles, capped at 1500 terms; terms present
# in more than 60% of the titles are ignored.
tfidfconverter = TfidfVectorizer(
    max_features=1500,
    max_df=0.6,
    min_df=1,
)
tfidf_sparse = tfidfconverter.fit_transform(text)
# Dense array so it can be joined with the numeric columns.
text_num = tfidf_sparse.toarray()

In [None]:
# Full feature matrix: TF-IDF title columns followed by the numeric columns of X.
x = np.concatenate([text_num, X.to_numpy()], axis=1)
print('Shape of input features {}'.format(x.shape))

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column. Note the scaler is fitted on the full
# matrix, i.e. before the train/test split performed further down.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

In [None]:
from sklearn import preprocessing 
# Turn the genre strings into integer class ids.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# `random_state` pins the shuffle: without it every run produced a different
# split, so the metrics reported below could not be reproduced or compared.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Initializing Models

In [None]:
from sklearn.svm import SVC
# Support vector classifier with gamma='auto' and unit regularisation strength.
svc_clf = SVC(gamma='auto', C=1.0)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library defaults. Besides being evaluated on its
# own, this instance is also passed as the meta-classifier of the stacking model.
lr_clf = LogisticRegression()

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 200 boosting rounds.
# `random_state` is fixed so that repeated runs give the same fitted model.
ab_clf = AdaBoostClassifier(n_estimators=200, learning_rate=0.1, random_state=42)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 200 trees.
# `random_state` is fixed so that repeated runs give the same fitted model.
gb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with default settings.
# `random_state` is fixed because split selection breaks ties randomly, which
# otherwise makes the fitted tree (and its scores) vary between runs.
dt_clf = DecisionTreeClassifier(random_state=42)

In [None]:
from sklearn.linear_model import RidgeClassifier
# Ridge classifier with class weights balanced against label frequency.
rg_clf = RidgeClassifier(class_weight='balanced', alpha=1.0)


In [None]:
from sklearn.linear_model import RidgeClassifierCV
# Ridge classifier with built-in 10-fold cross-validation.
rgcv_clf = RidgeClassifierCV(cv=10)
# Not fitted here: fitting happens later, together with all other models.

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings.
gnb_clf = GaussianNB()
# Not fitted here: fitting happens later, together with all other models.

In [None]:
from catboost import CatBoostClassifier
# CatBoost classifier with logging silenced.
# `random_seed` is fixed so that repeated runs give the same fitted model.
# NOTE(review): iterations=2 with learning_rate=0.001 leaves the model barely
# trained - confirm this is intentional (e.g. a speed placeholder).
cb_clf = CatBoostClassifier(learning_rate=0.001, iterations=2, verbose=False, random_seed=42)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def metrics(ytest,ypred):
    """Score predictions against the true labels.

    Parameters
    ----------
    ytest : array-like
        True class labels.
    ypred : array-like
        Predicted class labels.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` as computed by ``accuracy_score`` and ``f1_score``.
    """
    return accuracy_score(ytest, ypred), f1_score(ytest, ypred)

# Get Result of all Models

In [None]:
from sklearn.model_selection import cross_val_score
# All candidate classifiers, in the order they appear in the results table.
models = [
    svc_clf,
    lr_clf,
    ab_clf,
    gb_clf,
    dt_clf,
    rg_clf,
    rgcv_clf,
    gnb_clf,
    cb_clf,
]
# Results table, filled one row per model by `get_model_result`.
result = pd.DataFrame([], columns=['Model', 'Accuracy', 'F1 Score', 'CV_Accuracy'])


In [None]:
def get_model_result(model, model_id, cv_unit):
    """Fit ``model``, score it, and store the scores in the global ``result`` table.

    The model is fitted on ``xtrain``/``ytrain`` and scored on
    ``xtest``/``ytest``; cross-validated accuracy is computed on the training
    split only.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``.
    model_id : int
        Row label under which the scores are written to ``result``.
    cv_unit : int
        Value forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    None
        The row ``[str(model), accuracy, f1, mean CV accuracy]`` is written
        to ``result`` as a side effect.
    """
    model.fit(xtrain, ytrain)
    test_accuracy, test_f1 = metrics(ytest, model.predict(xtest))
    fold_scores = cross_val_score(model, xtrain, ytrain, scoring='accuracy', cv=cv_unit)
    result.loc[model_id] = [str(model), test_accuracy, test_f1, fold_scores.mean()]
    

In [None]:
# Evaluate every candidate with 10-fold cross-validation; the list position
# doubles as the row label in the results table.
for row_id, estimator in enumerate(models):
    get_model_result(estimator, row_id, cv_unit=10)

result

# StackingCV Classifier

In [None]:
from mlxtend.classifier import StackingCVClassifier
# Stack all base models; a logistic regression combines their out-of-fold
# predictions together with the original features
# (use_features_in_secondary=True).
# `random_state` is fixed because the stacker shuffles the rows before
# building its 10 folds by default, which made the result vary between runs.
stack_gen = StackingCVClassifier(classifiers=models,
                                 meta_classifier=lr_clf,
                                 use_features_in_secondary=True,
                                 cv=10,
                                 random_state=42)
stack_gen.fit(np.array(xtrain), np.array(ytrain))
ypred = stack_gen.predict(np.array(xtest))



In [None]:
# Hold-out scores of the stacked model.
accuracy, f1_score_value = metrics(ytest, ypred)
summary_line = 'StackingCV Accuracy {} and F1 Score {}'.format(accuracy, f1_score_value)
print(summary_line)