# Best Manga

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [3]:
bestM_df = pd.read_csv(...\best-selling-manga.csv")

## Data Frame Exploration

### Shape, head and Data types

In [5]:
print("Shape of the data frame",bestM_df.shape)
bestM_df.head()

Shape of the data frame (187, 8)


Unnamed: 0,Manga series,Author(s),Publisher,Demographic,No. of collected volumes,Serialized,Approximate sales in million(s),Average sales per volume in million(s)
0,One Piece,Eiichiro Oda,Shueisha,Shōnen,104,1997–present,516.6,4.97
1,Golgo 13,"Takao Saito, Saito Production",Shogakukan,Seinen,207,1968–present,300.0,1.45
2,Case Closed / Detective Conan,Gosho Aoyama,Shogakukan,Shōnen,102,1994–present,270.0,2.65
3,Dragon Ball,Akira Toriyama,Shueisha,Shōnen,42,1984–1995,260.0,6.19
4,Doraemon,Fujiko F. Fujio,Shogakukan,Children,45,1969–1996,250.0,4.71


In [6]:
bestM_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Manga series                            187 non-null    object 
 1   Author(s)                               187 non-null    object 
 2   Publisher                               187 non-null    object 
 3   Demographic                             187 non-null    object 
 4   No. of collected volumes                187 non-null    int64  
 5   Serialized                              187 non-null    object 
 6   Approximate sales in million(s)         187 non-null    float64
 7   Average sales per volume in million(s)  187 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 11.8+ KB


In [24]:
bestM_df.isna().sum()

Manga series                              0
Author(s)                                 0
Publisher                                 0
Demographic                               0
No. of collected volumes                  0
Serialized                                0
Approximate sales in million(s)           0
Average sales per volume in million(s)    0
dtype: int64

### Memory Usage

In [8]:
bestM_df.memory_usage(deep=True).sum()

80626

## Change Data Types

### float64 to float16

In [11]:
bestM_df['Average sales per volume in million(s)'] = bestM_df['Average sales per volume in million(s)'].astype('float16')
bestM_df['Approximate sales in million(s)'] = bestM_df['Approximate sales in million(s)'].astype('float16')

### int64 change

In [13]:
bestM_df['No. of collected volumes'].max()

207

Since the max value is 207 we can change it to int16

In [14]:
bestM_df['No. of collected volumes'] = bestM_df['No. of collected volumes'].astype('int16')

In [19]:
print('The max value',bestM_df['No. of collected volumes'].max(),'The min age',bestM_df['No. of collected volumes'].min())


The max value 207 The min age 5


### Memory usage check

In [15]:
bestM_df.memory_usage(deep=True).sum()

77260

Not significant but still it is a good practice

## Add New Columns

Adding few columns for better Exploration of the data

### Year start and Year End Columns

In [52]:
# bestM_df[Serialized_YearStart] = bestM_df['Serialized'].str.extract(r'(\d{4})') 
bestM_df['YearStart'] = bestM_df['Serialized'].str.extract(r'(\d{4})')
bestM_df['YearEnd'] = bestM_df['Serialized'].str.extract(r'(\d{4}(?=\s|$)|present)')

# Find and values as present and change to 2023
bestM_df.loc[bestM_df['YearEnd'] == 'present', 'YearEnd'] = 2023

# Change to int for Year start and year end
bestM_df[['YearStart','YearEnd']] = bestM_df[['YearStart','YearEnd']].astype('int16')



### Number of Authors

In [58]:
bestM_df['Author Count'] = (bestM_df['Author(s)'].str.count(',')+1).astype('int8')

### Final Memory Use Check and Data Frame Check

In [64]:
print("Memory Usage",bestM_df.memory_usage(deep=True).sum())
bestM_df.head()

Memory Usage 78195


Unnamed: 0,Manga series,Author(s),Publisher,Demographic,No. of collected volumes,Serialized,Approximate sales in million(s),Average sales per volume in million(s),YearStart,YearEnd,Author Count
0,One Piece,Eiichiro Oda,Shueisha,Shōnen,104,1997–present,4.96875,4.96875,1997,2023,1
1,Golgo 13,"Takao Saito, Saito Production",Shogakukan,Seinen,207,1968–present,1.450195,1.450195,1968,2023,2
2,Case Closed / Detective Conan,Gosho Aoyama,Shogakukan,Shōnen,102,1994–present,2.650391,2.650391,1994,2023,1
3,Dragon Ball,Akira Toriyama,Shueisha,Shōnen,42,1984–1995,6.191406,6.191406,1984,1995,1
4,Doraemon,Fujiko F. Fujio,Shogakukan,Children,45,1969–1996,4.710938,4.710938,1969,1996,1


## EDA
### Demographics Questions  
#### Number of unique Demographics or Genre

In [69]:
print("Count of unique demographics", bestM_df['Demographic'].nunique())

(
    bestM_df['Demographic']
    .unique()
)

Count of unique demographics 9


array(['Shōnen', 'Seinen', 'Children', 'Shōnen/Seinen', '—', 'Shōjo',
       'Josei', 'Shōjo/Josei', 'Shōnen/shōjo/Josei'], dtype=object)

#### Mange Series per demography

In [72]:
(
    bestM_df
    .groupby('Demographic')
    ['Manga series']
    .count()
)

Demographic
Children                4
Josei                   2
Seinen                 50
Shōjo                  21
Shōjo/Josei             1
Shōnen                101
Shōnen/Seinen           6
Shōnen/shōjo/Josei      1
—                       1
Name: Manga series, dtype: int64