 # 1. Importing Libraries

In [None]:
import os

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from matplotlib import cm


# 2.1 Data Preprocessing

In [None]:
# Directory holding the three CSV files; defined once so the notebook can be
# pointed at another copy of the dataset by editing a single line.
DATA_DIR = '../input/anime-recommendation-database-2020'

# anime_info  : one row per anime (name, score, genres, ...)
# anime_list  : user list entries (merged later on anime_id, aggregated on rating)
# anime_synop : synopsis text per anime (column 'sypnopsis')
anime_info = pd.read_csv(f'{DATA_DIR}/anime.csv')
anime_list = pd.read_csv(f'{DATA_DIR}/animelist.csv')
anime_synop = pd.read_csv(f'{DATA_DIR}/anime_with_synopsis.csv')

In [None]:
# (rows, columns) of each loaded table, in load order.
tuple(frame.shape for frame in (anime_info, anime_list, anime_synop))

### 2.1.1  Exploring anime_info

In [None]:
# Transposed preview: the first five records become columns so that every
# field of the wide table stays visible.
anime_info.head(5).transpose()

In [None]:
# Column names, non-null counts and dtypes of the anime metadata table.
anime_info.info()

In [None]:
# Summary statistics as reported by DataFrame.describe() for anime_info.
anime_info.describe()

### 2.1.2   Exploring anime_list

In [None]:
# First five rows of the user list table.
anime_list.head()

In [None]:
# Column names and dtypes of the user list table.
anime_list.info()

In [None]:
# Summary statistics as reported by DataFrame.describe() for anime_list.
anime_list.describe()

### 2.1.3   Exploring anime_synop

In [None]:
# First five rows of the synopsis table.
anime_synop.head()

# 2.2 Checking for NAN and Duplicate values

> **Function for printing null_values and related info**

In [None]:

def description(data):
    """Summarise every column of ``data`` in a single table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with these fields:

        * ``dtypes`` - dtype of the column
        * ``counts`` - number of non-null values
        * ``distincts`` - number of distinct values (NaN counts as a value)
        * ``nulls`` - number of null values
        * ``missing_percent`` - nulls as a percentage of the row count
          (NaN when ``data`` has no rows)
        * ``uniques`` - array of the distinct values
    """
    no_rows = data.shape[0]

    # Built column by column into an object Series. DataFrame.apply would
    # collapse the arrays into a 2-D frame whenever every column happens to
    # have the same number of distinct values, which breaks the table below.
    uniques = pd.Series(
        [column.unique() for _, column in data.items()],
        index=data.columns,
        dtype=object,
    )
    nulls = data.isna().sum()

    cols = {
        'dtypes': data.dtypes,
        'counts': data.count(),
        'distincts': uniques.map(len),
        'nulls': nulls,
        'missing_percent': (nulls / no_rows) * 100,
        'uniques': uniques,
    }
    return pd.DataFrame(data=cols)

**1. Anime_info**

In [None]:
# Profile anime_info, turn the column names into a regular 'index' column,
# then list the columns with the highest share of missing data first.
details_tr = description(anime_info).reset_index(level=[0])
details_tr.sort_values(by='missing_percent', ascending=False)

In [None]:
# Number of rows in anime_info that fully duplicate an earlier row.
anime_info.duplicated().sum()

**2. Anime_list**

In [None]:
# Same per-column profile for the user list table, worst columns first.
details_tr = description(anime_list).reset_index(level=[0])
details_tr.sort_values(by='missing_percent', ascending=False)

**3. Anime_synop**

In [None]:
# Same per-column profile for the synopsis table, worst columns first.
details_tr = description(anime_synop).reset_index(level=[0])
details_tr.sort_values(by='missing_percent', ascending=False)

In [None]:
# Number of rows in anime_synop that fully duplicate an earlier row.
anime_synop.duplicated().sum()

#### As we can see from the above exploration, only anime_synop has NaN values. We will handle them in the next step.

* As we can see from the above exploration, only the `sypnopsis` column has NaN values, so we will set those entries to 'Unknown'.

# 2.3 Removing NAN values

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, is
# deprecated in recent pandas and leaves the frame untouched under Copy-on-Write.
anime_synop['sypnopsis'] = anime_synop['sypnopsis'].fillna('Unknown')

# 'Unknown' scores are mapped to 0 so the column can be cast to float.
anime_info['Score'] = anime_info['Score'].replace('Unknown', 0).astype(float)

In [None]:
# Use 'anime_id' as the key name of the anime table.
id_rename = {'MAL_ID': 'anime_id'}
anime_info.rename(columns=id_rename, inplace=True)

# Remove these three columns from the synopsis table.
synopsis_extras = ['MAL_ID', 'Score', 'Genres']
anime_synop.drop(columns=synopsis_extras, inplace=True)

In [None]:
# Heatmap of the null mask of anime_info: one cell per (row, column) pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(anime_info.isnull(), ax=ax)
ax.set_title("Missing values in anime?", fontsize=15)
plt.show()

# 3 EDA and Visualization

* #### In order to build a recommendation engine, we first have to understand our dataset

In [None]:
# Columns used throughout the exploratory analysis.
edacol = ['anime_id', 'Name', 'English name', 'Score', 'Genres', 'Type', 'Aired',
          'Premiered', 'Rating', 'Source', 'Episodes', 'Dropped']

# set_index returns a new, independent frame. Calling it with inplace=True on
# the column selection mutated a flagged copy and made later column
# assignments on `eda` emit SettingWithCopyWarning.
eda = anime_info[edacol].set_index('anime_id')


* #### Top 5 animes based on Score/Rating

In [None]:
# Order the anime from highest to lowest Score and show the first five.
by_score_desc = anime_info.sort_values(by='Score', ascending=False)
by_score_desc.head(5)

In [None]:

# Pie chart of how many anime fall into each 'Type' value.
type_counts = eda['Type'].value_counts()
labels = type_counts.index
values = type_counts.values
colors = ["ff9f1c","ffbf69","ffffff","cbf3f0","2ec4b6"]

type_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[type_pie], layout=go.Layout(height=600, width=800))

# Slice text shows the raw count; the percentage is available on hover.
slice_marker = dict(colors=colors, line=dict(color='#000000', width=2))
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=slice_marker)

centered_title = dict(text="Medium of Streaming", y=0.9, x=0.5,
                      xanchor='center', yanchor='top')
fig.update_layout(title=centered_title)

fig.show()

#### Insights:

* 28.4% of the anime aired on TV, followed by 22.2% released as OVA
* 17.3% of the anime were released as movies, which is more than Special (12.6%) and ONA (10.9%)

In [None]:
# Pie chart of how many anime come from each 'Source' value.
source_counts = eda['Source'].value_counts()
labels = source_counts.index
values = source_counts.values
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

source_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[source_pie], layout=go.Layout(height=600, width=800))

# Slice text shows the raw count; the percentage is available on hover.
slice_marker = dict(colors=colors, line=dict(color='#000000', width=2))
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=slice_marker)

centered_title = dict(text="Sources of Engagement", y=0.9, x=0.5,
                      xanchor='center', yanchor='top')
fig.update_layout(title=centered_title)

fig.show()

#### Insights: The above pie chart gives a count of sources of engagement of anime



* #### Top 10 anime by popularity for each source (e.g. Original, Manga, etc.)

In [None]:
# Rows are sorted by Popularity before grouping, so the first ten rows of each
# group are the ten anime with the lowest Popularity value in that Source.
sour = anime_info.sort_values(by='Popularity').groupby('Source')[['Name', 'Popularity']]

for source in eda['Source'].value_counts().index:
    # set_index returns a new frame, so the group handed out by groupby is
    # never modified in place (avoids SettingWithCopyWarning).
    bar = sour.get_group(source).set_index('Name')
    ax = bar.head(10).plot(kind='barh', legend=False, color='lightseagreen')
    # Pass a plain string: a one-element list was rendered literally,
    # e.g. "['Manga']".
    ax.set_xlabel(str(source))
    ax.grid(True)

plt.show()

* #### Top 10 anime based on popularity in different Types

In [None]:
# Rows are sorted by Popularity before grouping, so the first ten rows of each
# group are the ten anime with the lowest Popularity value in that Type.
typ = anime_info.sort_values(by='Popularity').groupby('Type')[['Name', 'Popularity']]

for anime_type in eda['Type'].value_counts().index:
    # set_index returns a new frame, so the group handed out by groupby is
    # never modified in place (avoids SettingWithCopyWarning).
    bar = typ.get_group(anime_type).set_index('Name')
    ax = bar.head(10).plot(kind='barh', legend=False, color='cornflowerblue')
    # Pass a plain string: a one-element list was rendered literally,
    # e.g. "['TV']".
    ax.set_xlabel(str(anime_type))
    ax.grid(True)

plt.show()


* #### We keep only the first 18,522,589 rows of `anime_list` (intended to cover the ratings of roughly the first 50,000 users — TODO confirm), as processing the full dataset is too memory-consuming

In [None]:
# Number of leading anime_list rows kept for the rest of the analysis.
N_RATING_ROWS = 18522589

df = anime_list.iloc[:N_RATING_ROWS, :]
df.info()

In [None]:
# Inner join of the anime metadata with the rating subset on the shared key.
anime_fulldata = anime_info.merge(df, on='anime_id')

In [None]:
# Count the non-null 'rating' entries for every anime name ...
ratings_per_name = anime_fulldata.groupby(by=['Name'])['rating'].count()
# ... and return to a flat two-column frame (Name, rating).
anime_ratingCount = ratings_per_name.reset_index().loc[:, ['Name', 'rating']]

* #### Here are the top anime based on the number of ratings submitted by users

In [None]:
# Ten anime with the highest rating count, largest first.
rating_columns = ['Name', 'rating']
top10_animerating = (
    anime_ratingCount[rating_columns]
    .sort_values(by='rating', ascending=False)
    .head(10)
)

ax = sns.barplot(data=top10_animerating, x="Name", y="rating", palette="YlOrBr")
ax.set_title('Top 10 Anime based on rating counts', fontsize=22)
# Long anime names: tilt the tick labels and right-align them under the bars.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=11, rotation=40, ha="right")
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('User Rating count', fontsize=20)

* #### Here are the anime that were dropped by the most users

In [None]:
# Ten anime with the highest 'Dropped' count.
drop = eda[['Name', 'Dropped']].sort_values(by='Dropped', ascending=False).head(10)

plt.figure(figsize=(10, 4), dpi=100)
# Plot straight from the frame's columns. De-duplicating x and y
# independently with .unique() misaligned names and counts (or produced
# vectors of different length) whenever two of the ten rows shared a name or
# a count.
sns.barplot(x='Name', y='Dropped', data=drop)
plt.title('Top 10 most dropped anime')
plt.xticks(rotation=40, ha='right')
plt.show()

* #### Here we have categorized the anime by Rating and displayed their counts

In [None]:
# Pie chart of how many anime fall into each 'Rating' value.
rating_counts = eda['Rating'].value_counts()
labels = rating_counts.index
values = rating_counts.values
colors = ["007f5f","2b9348","55a630","80b918","aacc00","bfd200","d4d700","dddf00","eeef20","ffff3f"]

rating_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[rating_pie], layout=go.Layout(height=600, width=800))

# Slice text shows the raw count; the percentage is available on hover.
slice_marker = dict(colors=colors, line=dict(color='#000000', width=2))
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=slice_marker)

centered_title = dict(text="Rating based Anime", y=0.9, x=0.5,
                      xanchor='center', yanchor='top')
fig.update_layout(title=centered_title)

fig.show()

* #### Here we have listed the top 10 anime by number of episodes

In [None]:
# 'Unknown' episode counts become 0 so the column can be cast to int.
# assign() builds a new frame rather than writing into a possibly flagged
# slice of anime_info (avoids SettingWithCopyWarning).
eda = eda.assign(
    Episodes=eda['Episodes'].replace('Unknown', 0).astype(str).astype(int)
)

# Ten anime with the most episodes.
epi = eda[['Name', 'Episodes']].sort_values(by='Episodes', ascending=False).head(10)

plt.figure(figsize=(10, 4), dpi=100)
# Plot straight from the frame's columns. De-duplicating x and y
# independently with .unique() misaligned names and counts (or produced
# vectors of different length) whenever two of the ten rows shared a name or
# an episode count.
sns.barplot(x='Name', y='Episodes', data=epi)
plt.title('Top 10 anime by number of episodes')
plt.xticks(rotation=40, ha='right')
plt.show()

In [None]:
# Placeholder studios are recoded as 0, as before.
anime_info['Studios'] = anime_info['Studios'].replace('Unknown', 0)

# Count anime per studio, leaving out the placeholder explicitly instead of
# assuming it is the most frequent value and dropping the first row.
studio_counts = anime_info.loc[anime_info['Studios'] != 0, 'Studios'].value_counts()

# Ten most frequent studios as a flat frame. The columns are named explicitly
# ('index' = studio, 'Studios' = number of anime) because the names produced
# by value_counts().reset_index() differ between pandas versions. The earlier
# rename() call is removed: its result was never assigned, so it had no effect.
stu = studio_counts.head(10).rename_axis('index').reset_index(name='Studios')

*  #### Here we have categorized the anime by production studio and displayed their counts

In [None]:
# Donut chart of the number of anime produced by each studio in `stu`.
# Columns are read by position (0 = studio, 1 = count) because the names
# produced by value_counts().reset_index() differ between pandas versions.
# Counts are taken as-is: de-duplicating them with .unique() dropped tied
# studios and shifted every following value onto the wrong label.
labels = stu.iloc[:, 0].values
values = stu.iloc[:, 1].values
colors = ['f72585','b5179e','7209b7','560bad','480ca8','3a0ca3','3f37c9','4361ee','4895ef','4cc9f0']
fig = go.Figure(data=[go.Pie(labels=labels,
                             values=values)],layout=go.Layout(height=600, width=800))
# Slice text shows the raw count; the percentage is available on hover.
fig.update_traces(hole = .4,hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))

fig.update_layout(
    title={
        'text': "Anime Production by Studio",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

fig.show()

* #### Here we have displayed the Genre Cloud of Animes

In [None]:
# nonull_anime=anime_fulldata.copy()
# nonull_anime.dropna(inplace=True)
# from collections import defaultdict

# all_genres = defaultdict(int)

# for genres in nonull_anime['Genres']:
#     for genre in genres.split(','):
#         all_genres[genre.strip()] += 1
        
# from wordcloud import WordCloud

# genres_cloud = WordCloud(width=800, height=400, background_color='white', colormap='gnuplot').generate_from_frequencies(all_genres)
# plt.imshow(genres_cloud, interpolation='bilinear')
# plt.axis('off')

#### As the above code consumes too much memory, we have uploaded an image of the result obtained by running it in a separate notebook

In [None]:
# Display the pre-rendered genre word cloud. `mpimg` (matplotlib.image) is
# imported with the other libraries at the top of the notebook.
img = mpimg.imread('../input/wordcloud-output/__results___29_1.png')
plt.figure(figsize=(15, 10))
plt.imshow(img)
# Pixel coordinates carry no meaning for a picture, so hide the axes.
plt.axis('off')
plt.show()


#### We can see that Comedy is the most common genre in our dataset, followed by Action, Romance and Drama