In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [3]:
df = pd.read_csv('best-selling-manga.csv')
df.head()

Unnamed: 0,Manga series,Author(s),Publisher,Demographic,No. of collected volumes,Serialized,Approximate sales in million(s),Average sales per volume in million(s)
0,One Piece,Eiichiro Oda,Shueisha,Shōnen,104,1997–present,516.6,4.97
1,Golgo 13,"Takao Saito, Saito Production",Shogakukan,Seinen,207,1968–present,300.0,1.45
2,Case Closed / Detective Conan,Gosho Aoyama,Shogakukan,Shōnen,102,1994–present,270.0,2.65
3,Dragon Ball,Akira Toriyama,Shueisha,Shōnen,42,1984–1995,260.0,6.19
4,Doraemon,Fujiko F. Fujio,Shogakukan,Children,45,1969–1996,250.0,4.71


In [4]:
df.isna().sum()

Manga series                              0
Author(s)                                 0
Publisher                                 0
Demographic                               0
No. of collected volumes                  0
Serialized                                0
Approximate sales in million(s)           0
Average sales per volume in million(s)    0
dtype: int64

In [5]:
df_main = df.copy()
df = df_main.iloc[:,[3,4,6,7]]
df.shape

(187, 4)

In [6]:
df.head()

Unnamed: 0,Demographic,No. of collected volumes,Approximate sales in million(s),Average sales per volume in million(s)
0,Shōnen,104,516.6,4.97
1,Seinen,207,300.0,1.45
2,Shōnen,102,270.0,2.65
3,Shōnen,42,260.0,6.19
4,Children,45,250.0,4.71


In [7]:
df['Demographic'].value_counts()

Shōnen                101
Seinen                 50
Shōjo                  21
Shōnen/Seinen           6
Children                4
Josei                   2
—                       1
Shōjo/Josei             1
Shōnen/shōjo/Josei      1
Name: Demographic, dtype: int64

In [8]:
df['Demographic'].nunique()

9

In [9]:
label = LabelEncoder()
label_values = label.fit_transform(df['Demographic'])

In [10]:
len(label_values)

187

In [11]:
label_values

array([5, 2, 5, 5, 0, 5, 5, 5, 5, 2, 5, 6, 5, 5, 5, 5, 6, 5, 2, 6, 8, 5,
       5, 5, 2, 5, 5, 5, 6, 5, 5, 5, 5, 3, 5, 5, 5, 2, 2, 5, 2, 2, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 3, 5, 5, 5, 2, 2, 5, 3, 2, 5,
       5, 5, 5, 2, 2, 3, 5, 5, 5, 1, 5, 5, 2, 2, 5, 3, 2, 5, 5, 2, 6, 3,
       3, 5, 2, 5, 5, 2, 6, 5, 5, 3, 5, 2, 2, 2, 5, 5, 5, 3, 5, 2, 2, 2,
       3, 1, 0, 5, 5, 4, 3, 5, 5, 2, 2, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 2,
       5, 5, 2, 3, 2, 2, 7, 5, 5, 5, 2, 2, 5, 5, 0, 3, 2, 3, 5, 2, 2, 3,
       2, 5, 5, 2, 5, 0, 5, 2, 5, 2, 5, 2, 5, 5, 3, 2, 5, 5, 5, 2, 5, 2,
       2, 5, 2, 5, 3, 2, 3, 3, 3, 2, 2])

In [12]:
df['Demographic'] = label_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Demographic'] = label_values


In [13]:
df.head()

Unnamed: 0,Demographic,No. of collected volumes,Approximate sales in million(s),Average sales per volume in million(s)
0,5,104,516.6,4.97
1,2,207,300.0,1.45
2,5,102,270.0,2.65
3,5,42,260.0,6.19
4,0,45,250.0,4.71


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 4 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Demographic                             187 non-null    int32  
 1   No. of collected volumes                187 non-null    int64  
 2   Approximate sales in million(s)         187 non-null    float64
 3   Average sales per volume in million(s)  187 non-null    float64
dtypes: float64(2), int32(1), int64(1)
memory usage: 5.2 KB


In [16]:
X = df.drop('Approximate sales in million(s)',axis = 1)
y = df['Approximate sales in million(s)']

X.shape

(187, 3)

In [17]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=100)
x_train.shape

(149, 3)

In [18]:
x_test.shape

(38, 3)

In [19]:
linear_model = LinearRegression()
linear_model.fit(x_train,y_train)
y_pred = linear_model.predict(x_test)

r2_score(y_test,y_pred)

0.8082029557745214

In [20]:
random_model = RandomForestRegressor()
random_model.fit(x_train,y_train)
y_pred = random_model.predict(x_test)

r2_score(y_test,y_pred)

0.5753493552326294

In [21]:
decision_model = DecisionTreeRegressor()
decision_model.fit(x_train,y_train)
y_pred = decision_model.predict(x_test)

r2_score(y_test,y_pred)

0.6315606680368562

In [22]:
support_model = SVR()
support_model.fit(x_train,y_train)
y_pred = support_model.predict(x_test)

support_model.score(x_test,y_test)

-0.007227516741646234

In [23]:
model = LinearRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

In [24]:
mean_absolute_error(y_test,y_pred)

14.18589812530285

In [25]:
df.describe()

Unnamed: 0,Demographic,No. of collected volumes,Approximate sales in million(s),Average sales per volume in million(s)
count,187.0,187.0,187.0,187.0
mean,3.877005,46.048128,50.764332,1.37738
std,1.555723,34.678288,57.178028,1.240013
min,0.0,5.0,20.0,0.19
25%,2.0,23.5,24.5,0.67
50%,5.0,34.0,31.0,1.03
75%,5.0,56.5,50.5,1.65
max,8.0,207.0,516.6,10.0


In [26]:
df.corr()

Unnamed: 0,Demographic,No. of collected volumes,Approximate sales in million(s),Average sales per volume in million(s)
Demographic,1.0,-0.056593,0.135103,0.175298
No. of collected volumes,-0.056593,1.0,0.385699,-0.300739
Approximate sales in million(s),0.135103,0.385699,1.0,0.52341
Average sales per volume in million(s),0.175298,-0.300739,0.52341,1.0


In [27]:
model.score(x_test,y_test)

0.8082029557745214

In [28]:
r2_score(y_test,y_pred)

0.8082029557745214

In [29]:
#Therefore, Linear Regression is the best model for the given dataset