In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
import collections

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
# Load the anime CSV from the Kaggle input directory and preview the first 10 rows.
df = pd.read_csv("/kaggle/input/anime-recommendations-database/anime.csv")
df.head(10)

Dropping the "anime_id" column (it is not needed for this analysis) and getting some information about the DataFrame:

In [None]:
# Remove the identifier column, which is not used anywhere in this analysis.
# errors = "ignore" keeps the cell safe to re-run: without it, a second
# execution raises KeyError because "anime_id" has already been dropped.
df = df.drop(columns = ["anime_id"], errors = "ignore")
# Summary of column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Discard rows that have no rating, label every remaining missing value as
# "Unknown", and renumber the rows from zero.
df = (
    df.dropna(subset = ["rating"])
      .fillna("Unknown")
      .reset_index(drop = True)
)
df.tail(10)

In [None]:
# Re-check column dtypes and non-null counts after the cleaning step.
df.info()

As we can see, the number of episodes is not stored as an integer type. This happens because some Animes are not finished yet (their number of episodes is "Unknown"), so let's split the data into two DataFrames: one with all Animes and one with only the completed ones.

In [None]:
# df_all keeps every title; df_completed keeps only the titles whose episode
# count is not the "Unknown" placeholder, re-indexed from zero.
df_all = df.copy()
df_completed = df.loc[df["episodes"] != "Unknown"].reset_index(drop = True)
print("Number of all Animes:", len(df_all))
print("Number of completed Animes:", len(df_completed))

Then, convert the "episodes" column of the completed Animes to the int64 data type.

In [None]:
# With the "Unknown" rows removed, the episode counts can be stored as integers.
df_completed = df_completed.astype({"episodes": "int64"})
df_completed.dtypes

In [None]:
# Summary statistics for all titles, transposed so each row is one column of df_all.
df_all.describe().T

In [None]:
# Same summary for the completed titles only (here "episodes" is an integer column).
df_completed.describe().T

In [None]:
# Bar chart with the number of titles in each anime type.
ax = sns.countplot(data = df_all, x = "type")
ax.set_title("Number of each type of Anime")
ax.set_xlabel("Type")
ax.set_ylabel("Number")
plt.show()

In [None]:
# Share of each anime type as a pie chart, with the largest slice pulled out.
df_pie = df_all["type"].value_counts()

# Build one explode offset per slice instead of hard-coding six of them, so the
# chart still renders when the number of distinct types is not exactly six
# (plt.pie raises ValueError when len(explode) != len(x)).
# value_counts() sorts in descending order, so position 0 is the most frequent type.
explode = [0.05 if position == 0 else 0 for position in range(len(df_pie))]

plt.figure(figsize = (7, 7))
plt.pie(labels = df_pie.index, x = df_pie.values, autopct = "%0.2f%%", explode = explode)
plt.show()

Most common genres among all types of Animes:

In [None]:
# Count how often each genre label occurs across all titles.
# Each "genre" cell is split on "," and every piece is stripped of surrounding
# whitespace; empty pieces are skipped.
# df_all["genre"] is only read here, never modified, so the column stays clean
# for later cells and the cell is safe to re-run.
genre = [
    label.strip()
    for genres in df_all["genre"]
    for label in genres.split(",")
    if label.strip()
]

counter = collections.Counter(genre)
# most_common() orders by count, highest first; ties keep first-seen order.
genre_dict = dict(counter.most_common())
df_genre = pd.Series(data = genre_dict)
df_genre

In [None]:
# Horizontal bar chart of the 10 titles with the highest "members" count.
plt.figure(figsize = (8,6))
df_members = df_all.sort_values(by = "members", ascending = False)
ax = sns.barplot(data = df_members.head(10), y = "name", x = "members")
ax.set_title("Most popular Animes", size = 12)
ax.set_xlabel("Members")
ax.set_ylabel("")
plt.show()

In [None]:
# Horizontal bar chart of the 10 completed titles with the most episodes.
plt.figure(figsize = (8,6))
df_episodes = df_completed.sort_values(by = "episodes", ascending = False)
ax = sns.barplot(data = df_episodes.head(10), x = "episodes", y = "name")
ax.set_title("Animes with more episodes (Completed Animes)", size = 12)
ax.set_xlabel("Episodes")
ax.set_ylabel("")
plt.show()

In [None]:
# Titles ranked by rating (highest first), excluding those with fewer than
# 1000 members, with the genre, episodes and members columns removed.
df_rating = (
    df_all.sort_values(by = "rating", ascending = False)
          .loc[lambda ranked: ~(ranked["members"] < 1000)]
          .drop(columns = ["genre", "episodes", "members"])
          .reset_index(drop = True)
)
df_rating.head(10)

In [None]:
# Distribution of ratings for each anime type, drawn as violin plots.
plt.figure(figsize = (9,7))
ax = sns.violinplot(data = df_all, x = "type", y = "rating")
ax.set_title("Relation between Rating and Type", size = 12)
ax.set_xlabel("Type")
ax.set_ylabel("Rating")
plt.show()

Now let's look at the most famous type of Anime: TV.

In [None]:
# Completed titles whose type is "TV", re-indexed from zero.
df_completed_tv = df_completed[df_completed["type"].eq("TV")].reset_index(drop = True)
df_completed_tv.head(10)

In [None]:
# Summary statistics for the completed TV titles, one row per column.
df_completed_tv.describe().T

Computing the correlation between the numeric columns, to see how each one relates to the others.

In [None]:
# Heatmap of the pairwise correlation between the numeric columns of the
# completed TV titles.
# Only numeric columns are passed to corr(): the frame also holds text columns
# (name, genre, type), and pandas >= 2.0 raises on them instead of silently
# skipping them. select_dtypes works the same way on older pandas versions.
plt.figure(figsize = (8,7))
sns.heatmap(data = df_completed_tv.select_dtypes(include = "number").corr(), annot = True)
plt.show()

With this, it's possible to see a strong correlation:

MORE MEMBERS = BETTER RATING (which makes sense, because if an anime has many members, it is possible to say that many people liked it).


In [None]:
# Members versus rating for completed TV titles rated 5.5 or higher.
# Both columns are rounded to one decimal place so that titles sharing a
# rounded rating land on the same x value.
plt.figure(figsize = (14, 6))
well_rated = df_completed_tv.loc[df_completed_tv["rating"] >= 5.5, ["members", "rating"]]
ax = sns.lineplot(data = well_rated.mul(10).round().div(10),
                  x = "rating", y = "members")
ax.set_title("Relation between Members and Rating", size = 12)
ax.set_xlabel("Rating")
ax.set_ylabel("Members")
plt.show()