<h3 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Table Of Contents</h3>


* [1. Import Libraries and Data Loading](#1)
* [2. Data Preprocessing](#2)
* [3. Feature Engineering](#3)
* [4. Exploratory Data Analysis](#4)
    * [4.1 Time Based Analysis](#4.1)
    * [4.2 Synopsis Based Analysis](#4.2)
* [5. Generating Synopses](#5)
    * [5.1 Training a LSTM Network](#5.1)
    * [5.2 Generated Synopses](#5.2)
    



<a id="1"></a>

<h3 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Libraries And Utilities</h3>


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string
import nltk
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
pyo.init_notebook_mode()
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('vader_lexicon')
from sklearn.cluster import KMeans
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from wordcloud import WordCloud,STOPWORDS
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import random
from keras.layers import Dense,LSTM,Input,Dropout,SimpleRNN
from keras import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from tqdm.notebook import tqdm
import gc
plt.rc('figure',figsize=(17,13))
sns.set_context('paper',font_scale=2)

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
ANIME_CSV_PATH = '/kaggle/input/top-10000-anime-movies-ovas-and-tvshows/Anime_Top10000.csv'

# Load the raw table and preview its first three rows.
anime_df = pd.read_csv(ANIME_CSV_PATH)
anime_df.head(3)

<a id="2"></a>

<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Data Preprocessing</h3>


In [None]:

# Normalise the two free-text columns. Missing values are replaced by an empty
# string first: re.sub() and the sentiment analyzer below only accept str and
# would raise a TypeError on NaN.
for text_col in ['Anime_Name', 'Synopsis']:
    cleaned = anime_df[text_col].fillna('').astype(str)
    # Remove all the special characters (every non-word character becomes a space)
    cleaned = cleaned.apply(lambda x: re.sub(r'\W', ' ', x))
    # Substituting multiple spaces with single space
    cleaned = cleaned.apply(lambda x: re.sub(r'\s+', ' ', x))
    # Converting to Lowercase
    anime_df[text_col] = cleaned.str.lower()

# Synopsis Sentiment Analysis: polarity_scores() returns a dict from which the
# 'pos', 'neu' and 'neg' entries are read below.
sid = SIA()
sentiment_scores = anime_df['Synopsis'].apply(
    lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+', x)))
)

# Small constant added to every score (presumably to keep the values strictly
# positive for later analysis -- TODO confirm the intent).
SENTIMENT_EPSILON = 10 ** -6
for column_name, score_key in [('Positive Sentiment', 'pos'),
                               ('Neutral Sentiment', 'neu'),
                               ('Negative Sentiment', 'neg')]:
    anime_df[column_name] = sentiment_scores.apply(
        lambda scores, key=score_key: scores[key] + SENTIMENT_EPSILON
    )


<a id="3"></a>

<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Feature Engineering</h3>


In [None]:
def _join_matches(pattern, text):
    """Concatenate every match of `pattern` found in `text` into one string."""
    return ''.join(re.findall(pattern, text))


def _end_year_label(air_years):
    """Return the stripped text after the first '-' or 'Still Airing' when it is 4 characters or shorter."""
    after_dash = air_years.split('-')[1]
    return after_dash.strip() if len(after_dash) > 4 else 'Still Airing'


# Only TV-Shows: keep the rows whose episode descriptor contains 'TV'
is_tv_show = anime_df.Anime_Episodes.str.contains('TV')
tv_df = anime_df[is_tv_show].copy()

# Extract Number Of Episodes: all digits of the descriptor glued together,
# with an empty result mapped to NaN
episode_digits = tv_df.Anime_Episodes.apply(lambda x: _join_matches(r'[0-9]*', x))
tv_df['#_Episodes'] = episode_digits.replace('', np.nan)
# Known counts are written back as integers into the existing column
# (the column itself likely stays object dtype -- verify before numeric use)
has_episode_count = tv_df['#_Episodes'].notna()
tv_df.loc[has_episode_count, '#_Episodes'] = tv_df[has_episode_count]['#_Episodes'].astype(np.int32)

# Extract Air Years: keep only digits, spaces and dashes
tv_df['Air_Years'] = tv_df.Anime_Air_Years.apply(lambda x: _join_matches(r'[0-9 -]*', x))

# Start Year: the part before the first dash, as int32
tv_df['Start_Year'] = tv_df.Air_Years.apply(lambda x: x.split('-')[0].strip()).astype(np.int32)

# End Year: the part after the first dash, or the label 'Still Airing'
tv_df['End_Year'] = tv_df.Air_Years.apply(_end_year_label)
has_end_year = ~tv_df['End_Year'].str.contains('Still')
tv_df.loc[has_end_year, 'End_Year'] = tv_df[has_end_year]['End_Year'].astype(np.int32)

tv_df

<a id="4"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Exploratory Data Analysis</h3>


In [None]:
# Histogram of the TV-show ratings, normalised to probabilities, with a KDE overlay.
# The former `palette` argument is dropped: seaborn only applies a palette
# together with a `hue` variable, so it never affected this figure.
plt.title('Distribution of Anime Ratings')
sns.histplot(tv_df.Anime_Rating, kde=True, stat='probability')
plt.show()

**Observation**: Looking at the distribution of ratings in our sample of anime TV shows, we see that the distribution is approximately normal and centered around a particular mean. An interesting point to note is the multimodality: there are two modes around the mean, which may indicate two underlying groups. In the probabilistic inference section we will further explore the two modes.

In [None]:
# Columns compared in the correlation matrices.
CORR_COLUMNS = ['Anime_Rating', 'Start_Year', 'End_Year', '#_Episodes',
                'Positive Sentiment', 'Neutral Sentiment', 'Negative Sentiment']

# Blue -> red diverging scale. Stops are listed in ascending order (0 -> 1):
# plotly treats a scale whose stops are not non-decreasing as invalid and
# falls back to its default colors.
colorscale = [[0.0               , "rgb(49,54,149)"],
              [0.1111111111111111, "rgb(69,117,180)"],
              [0.2222222222222222, "rgb(116,173,209)"],
              [0.3333333333333333, "rgb(171,217,233)"],
              [0.4444444444444444, "rgb(224,243,248)"],
              [0.5555555555555556, "rgb(254,224,144)"],
              [0.6666666666666666, "rgb(253,174,97)"],
              [0.7777777777777778, "rgb(244,109,67)"],
              [0.8888888888888888, "rgb(215,48,39)"],
              [1.0               , "rgb(165,0,38)"]]

# Force every column to a numeric dtype before correlating. Non-numeric entries
# (e.g. the 'Still Airing' label in End_Year) become NaN and are ignored
# pairwise by corr(); without this, object columns are silently dropped by
# older pandas versions and raise an error in newer ones.
corr_input = tv_df[CORR_COLUMNS].apply(pd.to_numeric, errors='coerce')

fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=('Pearson Correlation', 'Spearman Correlation'))

for row, method in enumerate(['pearson', 'spearman'], start=1):
    corr = corr_input.corr(method)
    fig.add_trace(
        go.Heatmap(x=corr.columns, y=corr.index, z=corr.values, name=method,
                   # Fixed range so both panels share the single color bar
                   zmin=-1, zmax=1,
                   showscale=(row == 2), xgap=1, ygap=1, colorscale=colorscale),
        row=row, col=1
    )

fig.update_layout(height=700, width=900, title_text="Correlation Between Numeric Features")
fig.show()

**Observation**: Unfortunately, we see no significant correlations between our numeric features under either the Pearson or the Spearman correlation metric.

<a id="4.1"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Time Based Analysis</h3>


In [None]:
# Number of TV shows per first-air year, ordered chronologically (computed once).
start_year_counts = tv_df.Start_Year.value_counts().sort_index()

# Scale the counts to [0, 1] before mapping them to colors: a matplotlib
# colormap interprets integer input as direct lookup-table indices, so raw
# counts would saturate at the top color instead of spanning the map.
bar_colors = cm.coolwarm(start_year_counts.values / start_year_counts.values.max())

plt.title('Distribution of Anime First Air Years')
ax = sns.barplot(x=start_year_counts.index, y=start_year_counts.values, palette=bar_colors)
ax.set(xlabel='Start Year', ylabel='Number of Anime')
plt.xticks(rotation=-45, fontsize=11)
plt.show()

**Observation**: Skimming the plot above, we can immediately observe that 2013-2019 were the years in which the largest number of the top-rated anime in our sample first aired.
Older anime may be rated lower in comparison to this interval, but this may be because newer anime are written and animated to match current viewer tastes and trends, whereas in the past the opinions and tastes of western countries had less influence on anime writers, and western countries did not have a large industry of anime writers.


In [None]:
# Number of TV shows per episode count, ordered by episode count (computed once).
episode_counts = tv_df['#_Episodes'].value_counts().sort_index()

# Scale the counts to [0, 1] before mapping them to colors: a matplotlib
# colormap interprets integer input as direct lookup-table indices, so raw
# counts would saturate at the top color instead of spanning the map.
bar_colors = cm.coolwarm(episode_counts.values / episode_counts.values.max())

plt.figure(figsize=(25,6))
# The title previously repeated the one of the first-air-year plot.
plt.title('Distribution of Number of Episodes')
ax = sns.barplot(x=episode_counts.index,
                 y=episode_counts.values,
                 palette=bar_colors)
ax.set(xlabel='Number of Episodes', ylabel='Number of Anime')
plt.xticks(rotation=90,fontsize=10)
plt.margins(x=0)

plt.show()

**Observation**: In the plot above, we observe the distribution of the number of episodes in the anime TV show sample.
Interestingly, 12, 13, 24, 25 and 26 are the most common numbers of episodes for an anime TV show.

In [None]:
# Window (in years) of the rolling statistics drawn around the yearly mean.
ROLLING_WINDOW = 5

# Mean rating per start year. Only the rating column is aggregated: averaging
# the whole frame fails on its text columns in recent pandas versions.
year_mean_df = tv_df.groupby(by='Start_Year')[['Anime_Rating']].mean()

RA = year_mean_df.Anime_Rating.rolling(ROLLING_WINDOW).mean()
RS = year_mean_df.Anime_Rating.rolling(ROLLING_WINDOW).std()

fig = go.Figure()
trace = go.Scatter(x=year_mean_df.index,y=year_mean_df.Anime_Rating,mode='lines+markers',name='Average Rating',line=dict(color='firebrick', width=4))
running_average = go.Scatter(x=RA.index,y=RA.values,mode='lines',name='Running Average',line_color='blue')
running_average_postd = go.Scatter(x=RA.index,y=RA.values+RS.values,mode='lines',name='Running Average + 1 SD',line_color='green',line_dash='dot')
# fill='tonexty' shades the band between this trace and the previously added one (+1 SD)
running_average_mostd = go.Scatter(x=RA.index,y=RA.values-RS.values,mode='lines',name='Running Average - 1 SD',line_color='green',line_dash='dot', fill='tonexty')

# Order matters: the -1 SD trace must directly follow the +1 SD trace for the fill.
fig.add_trace(trace)
fig.add_trace(running_average)
fig.add_trace(running_average_postd)
fig.add_trace(running_average_mostd)

# Bold tags are now properly closed.
fig.update_layout(title='<b>Start Year Mean Anime Rating</b>',xaxis_title='<b>Year</b>',yaxis_title='<b>Average Rating</b>')
fig.update_layout(hovermode="x unified")

fig.show()

**Observation**: When looking at the average anime rating throughout our data frame's timeline, we see a rising average rating trend each year.
Another interesting observation is that the deviation in rating becomes narrower with time, meaning the population that rates the anime shows in our data becomes more consistent.



In [None]:
# Window (in years) of the rolling statistics drawn around the yearly mean.
ROLLING_WINDOW = 5

# End_Year mixes integer years with the 'Still Airing' label. Only real years
# belong on a time axis and in a rolling window, so the label is coerced to
# NaN and groupby() then leaves those rows out.
end_year_numeric = pd.to_numeric(tv_df['End_Year'], errors='coerce')

# Mean rating per end year. Only the rating column is aggregated: averaging
# the whole frame fails on its text columns in recent pandas versions.
year_mean_df = tv_df.groupby(end_year_numeric)[['Anime_Rating']].mean()
year_mean_df.index = year_mean_df.index.astype(int)

RA = year_mean_df.Anime_Rating.rolling(ROLLING_WINDOW).mean()
RS = year_mean_df.Anime_Rating.rolling(ROLLING_WINDOW).std()

fig = go.Figure()
trace = go.Scatter(x=year_mean_df.index,y=year_mean_df.Anime_Rating,mode='lines+markers',name='Average Rating',line=dict(color='firebrick', width=4))
running_average = go.Scatter(x=RA.index,y=RA.values,mode='lines',name='Running Average',line_color='blue')
running_average_postd = go.Scatter(x=RA.index,y=RA.values+RS.values,mode='lines',name='Running Average + 1 SD',line_color='green',line_dash='dot')
# fill='tonexty' shades the band between this trace and the previously added one (+1 SD)
running_average_mostd = go.Scatter(x=RA.index,y=RA.values-RS.values,mode='lines',name='Running Average - 1 SD',line_color='green',line_dash='dot', fill='tonexty')

# Order matters: the -1 SD trace must directly follow the +1 SD trace for the fill.
fig.add_trace(trace)
fig.add_trace(running_average)
fig.add_trace(running_average_postd)
fig.add_trace(running_average_mostd)

# Bold tags are now properly closed.
fig.update_layout(title='<b>End Year Mean Anime Rating</b>',xaxis_title='<b>Year</b>',yaxis_title='<b>Average Rating</b>')
fig.update_layout(hovermode="x unified")

fig.show()

**Observation**: Similarly to the average rating by start year, we see a climbing trend in the average rating; however, compared with the start year, the end year shows a much lower deviation over time, i.e., a roughly constant variance with a non-constant mean.


In [None]:
# Sentiment columns shown in the figure, one subplot row each.
SENTIMENT_COLUMNS = ['Positive Sentiment', 'Negative Sentiment']

# Per-start-year statistics of the sentiment scores. Only the sentiment columns
# are aggregated: aggregating the whole frame fails on its text columns in
# recent pandas versions.
sentiment_by_year = tv_df.groupby(by='Start_Year')[SENTIMENT_COLUMNS]
b_date_mean = sentiment_by_year.mean().reset_index()
b_date_std = sentiment_by_year.std().reset_index()

first_year = b_date_mean['Start_Year'].values[0]
last_year = b_date_mean['Start_Year'].values[-1]

fig = make_subplots(rows=2, cols=1,shared_xaxes=True,subplot_titles=('Yearly Average Positive Sentiment',  'Yearly Average Negative Sentiment'))

for row, column in enumerate(SENTIMENT_COLUMNS, start=1):
    # Yearly mean of this sentiment
    fig.add_trace(
        go.Scatter(x=b_date_mean['Start_Year'], y=b_date_mean[column], name=column + ' Mean'),
        row=row, col=1
    )

    # Horizontal reference line at the mean of the yearly means
    overall_mean = b_date_mean[column].mean()
    fig.add_shape(type="line",
        x0=first_year, y0=overall_mean, x1=last_year, y1=overall_mean,
        line=dict(
            color="Red",
            width=2,
            dash="dashdot",
        ),
        name='Mean',
        row=row, col=1
    )

fig['layout']['xaxis2']['title'] = 'Start_Year'
fig.update_layout(hovermode="x unified")
fig.update_layout(height=700, width=900, title_text="Sentiment Average Change With Time")
fig.show()

In [None]:
# Sentiment columns shown in the figure, one subplot row each.
SENTIMENT_COLUMNS = ['Positive Sentiment', 'Negative Sentiment']

# Per-start-year standard deviation of the sentiment scores. Only the sentiment
# columns are aggregated: aggregating the whole frame fails on its text columns
# in recent pandas versions.
b_date_std = tv_df.groupby(by='Start_Year')[SENTIMENT_COLUMNS].std().reset_index()

first_year = b_date_std['Start_Year'].values[0]
last_year = b_date_std['Start_Year'].values[-1]

fig = make_subplots(rows=2, cols=1,shared_xaxes=True,subplot_titles=('Yearly Deviation in Positive Sentiment',  'Yearly Deviation in Negative Sentiment'))

for row, column in enumerate(SENTIMENT_COLUMNS, start=1):
    # Yearly standard deviation of this sentiment
    fig.add_trace(
        go.Scatter(x=b_date_std['Start_Year'], y=b_date_std[column], name=column + ' SD'),
        row=row, col=1
    )

    # Horizontal reference line at the mean of the yearly standard deviations
    mean_of_sd = b_date_std[column].mean()
    fig.add_shape(type="line",
        x0=first_year, y0=mean_of_sd, x1=last_year, y1=mean_of_sd,
        line=dict(
            color="Red",
            width=2,
            dash="dashdot",
        ),
        name='Mean',
        row=row, col=1
    )

fig['layout']['xaxis2']['title'] = 'Start_Year'
fig.update_layout(hovermode="x unified")
# The title previously said "Average" although this figure shows deviations.
fig.update_layout(height=700, width=900, title_text="Sentiment Deviation Change With Time")
fig.show()

<a id="4.2"></a>

<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Synopsis Based Analysis</h3>


In [None]:
# (column, line color) for each sentiment curve; None keeps seaborn's default color.
SENTIMENT_CURVES = [('Negative Sentiment', None),
                    ('Neutral Sentiment', 'orange'),
                    ('Positive Sentiment', 'tab:red')]

# Top panel: density of each sentiment score; bottom panel: the cumulative curves.
# The titles previously referred to "Lyrics" although the data are anime synopses.
fig, (density_ax, cdf_ax) = plt.subplots(2, 1)

density_ax.set_title('Distribution Of Sentiments Across Synopses', fontweight='bold')
cdf_ax.set_title('CDF Of Sentiments Across Synopses', fontweight='bold')

for column, color in SENTIMENT_CURVES:
    sns.kdeplot(tv_df[column], label=column, color=color, lw=2, ax=density_ax)
    sns.kdeplot(tv_df[column], cumulative=True, label=column, color=color, lw=2, ax=cdf_ax)

density_ax.legend()
cdf_ax.set_xlabel('Sentiment Value')
cdf_ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Number of equal-width rating bins; one word cloud is drawn per bin.
N_RATING_BINS = 10

# Bin the ratings once. `rating_range` holds the interval of every show and its
# categories list all bins, including bins that contain no show.
rating_range = pd.cut(tv_df.Anime_Rating, N_RATING_BINS)
Rating_Based = pd.DataFrame({
    'Anime_Rating': rating_range.cat.codes,   # bin number 0..N-1 (-1 for a missing rating)
    'Range': rating_range,
    'Syn': tv_df.Synopsis,
})

# Bin labels are read from the categories, so an empty bin cannot cause an IndexError.
bins = [str(interval) for interval in rating_range.cat.categories]

# Extra stop words on top of wordcloud's defaults.
STP = list(STOPWORDS)
STP += ['written','mal','s','by','rewrite']

figure,axs = plt.subplots(2,5)

# Walk the 2x5 grid row by row, from the highest rating bin down to the lowest.
for ax, sample in zip(axs.ravel(), range(N_RATING_BINS - 1, -1, -1)):
    ax.set_title('Rating Range +'+bins[sample],fontsize=13)
    ax.axis('off')
    bin_text = ' '.join(Rating_Based.loc[Rating_Based.Anime_Rating == sample, 'Syn'])
    try:
        wc = WordCloud(background_color='white',width=200,height=200,stopwords=STP).generate(bin_text)
    except ValueError:
        # WordCloud raises ValueError when no word is left to draw (empty bin or
        # only stop words); leave this panel blank and carry on with the rest.
        continue
    ax.imshow(wc)
plt.tight_layout()
plt.show()

In [None]:
NUMBER_OF_COMPONENTS = 500

CVZ = CountVectorizer()
# Fixed seed: the decomposition is randomised, so without it the components
# (and everything derived from them) change between runs.
SVD = TruncatedSVD(NUMBER_OF_COMPONENTS, random_state=0)

# Drop stop words and single-character tokens from every synopsis.
text_data = tv_df.Synopsis.copy()
text_data = text_data.apply(lambda x: ' '.join([word for word in x.split() if word not in STOPWORDS and len(word) > 1]).strip())

stemmer= PorterStemmer()
lemmatizer=WordNetLemmatizer()

# Stem every token, then lemmatize the stemmed tokens.
text_data = text_data.apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))
text_data = text_data.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

# Bag-of-words counts (documents x vocabulary), then the reduced representation.
C_vector = CVZ.fit_transform(text_data)
pc_matrix = SVD.fit_transform(C_vector)

evr = SVD.explained_variance_ratio_
total_var = evr.sum() * 100
cumsum_evr = np.cumsum(evr)

fig = go.Figure(
    data=[
        go.Bar(name="individual explained variance", y=evr),
        go.Scatter(name="cumulative explained variance", y=cumsum_evr),
    ],
    layout={
        "xaxis": {"title": "Principal components"},
        "yaxis": {"title": "Explained variance ratio"},
    },
)
# The title now says "Components": the variance is explained by SVD components,
# each a weighted combination of words, not by individual words.
fig.update_layout(title='{:.2f}% of the Anime Synopsis Variance Can Be Explained Using {} Components'.format(total_var,NUMBER_OF_COMPONENTS))
fig.show()

In [None]:

# Vocabulary of the fitted vectorizer, fetched once. get_feature_names() was
# removed from recent scikit-learn releases in favour of get_feature_names_out();
# calling it inside the comprehension also rebuilt the full list for every word.
if hasattr(CVZ, 'get_feature_names_out'):
    feature_names = np.asarray(CVZ.get_feature_names_out())
else:
    feature_names = np.asarray(CVZ.get_feature_names())

# Words ordered by decreasing weight in the first SVD component; keep as many
# rows as there are components so the table lines up with `evr`.
top_n = len(evr)
top_word_idx = SVD.components_[0].argsort()[::-1][:top_n]

# Number of synopses containing each vocabulary word, taken from the count
# matrix. This counts whole tokens; the previous substring search also counted
# a synopsis when the word was only part of a longer word.
document_frequency = np.asarray((C_vector > 0).sum(axis=0)).ravel()

worddf = pd.DataFrame({'Word': feature_names[top_word_idx]})
# NOTE(review): `evr` holds the explained variance of each SVD *component*, not
# of each word; row i pairs the i-th ranked word with the i-th component.
# Kept as in the original table -- confirm whether this pairing is intended.
worddf['Explained Variance'] =  np.round(evr*100,2)
worddf['Explained Variance'] = worddf['Explained Variance'].apply(lambda x:str(x)+'%')
worddf['Appeared_On_X_Tweets'] = document_frequency[top_word_idx]

fig = go.Figure()
fig.add_trace(
    go.Table(
        header=dict(
            values=['<b>Word<b>',"<b>Accountable X% of Variance<b>",'<b>Appeared On X Synopsis<b>'],
            font=dict(size=19,family="Lato"),
            align="center"
        ),
        cells=dict(
            values=[worddf[k].tolist() for k in ['Word',"Explained Variance",'Appeared_On_X_Tweets']],
            align = "center")
    ),

)
fig.show()

<a id="5"></a>
<h3 style="background-color:orange;font-family:newtimeroman;font-size:250%;text-align:center;border-radius: 15px 50px;">Generating Synopses</h3>


<a id="5.1"></a>

<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Training a LSTM Network</h3>


In [None]:

gc.collect()

# Training corpus: the first third of the TV-show synopses.
synopsis = tv_df.Synopsis[:tv_df.shape[0]//3]

# Flat token stream of the corpus, built once. split() without an argument
# discards the empty strings that split(' ') produces wherever a synopsis
# starts or ends with a space; those empty tokens would otherwise become a
# vocabulary entry of their own.
FTD = ' '.join(synopsis).split()

# Unique tokens in order of first appearance.
vocab = list(dict.fromkeys(FTD))
vocab_size = len(vocab)

# Token <-> integer id lookups.
word_index = {word: i for i, word in enumerate(vocab)}
index_word = {i: word for word, i in word_index.items()}

In [None]:
stride = 10   # step between the starts of consecutive windows
T = 25        # number of tokens in every input window

# S[i] is a window of T consecutive tokens, C[i] the token that follows it.
window_starts = range(0, len(FTD) - T, stride)
S = [FTD[i: i + T] for i in window_starts]
C = [FTD[i + T] for i in window_starts]

# One-hot encodings: X is (windows, T, vocab_size), Y is (windows, vocab_size).
X = np.zeros((len(S), T, vocab_size), dtype='bool')
Y = np.zeros((len(S), vocab_size), dtype='bool')
for i, seq in tqdm(enumerate(S), total=len(S)):
    for t, word in enumerate(seq):
        X[i, t, word_index[word]] = 1
    # The target is set once per window (it used to be re-assigned T times
    # inside the inner loop).
    Y[i, word_index[C[i]]] = 1

In [None]:
# Next-word model: a one-hot window of T tokens goes through a 128-unit LSTM,
# dropout, and a softmax layer with one output per vocabulary entry.
lstm_nn = Sequential([
    Input((T,vocab_size)),
    LSTM(128),
    Dropout(0.2),
    Dense(vocab_size,activation='softmax'),
])

lstm_nn.compile(loss='categorical_crossentropy',optimizer='rmsprop')
lstm_nn.summary()

In [None]:
# Train without per-epoch console output; the returned object keeps the loss curve.
history = lstm_nn.fit(
    x=X,
    y=Y,
    batch_size=128,
    epochs=500,
    verbose=False,
)

In [None]:
# Training loss recorded after every epoch, drawn as a dotted line.
loss_per_epoch = history.history['loss']
plt.plot(loss_per_epoch, '.-')
plt.ylabel('loss', fontsize=14)
plt.show()

In [None]:
N_SYNOPSES = 10          # how many synopses to generate
N_GENERATED_WORDS = 100  # words appended to each seed window

generated = []
for itr in tqdm(range(N_SYNOPSES)):
    # Pick a random training window as the seed. randint's upper bound is
    # exclusive, so len(X) makes every window (including the last) eligible.
    start = np.random.randint(0, len(X))
    input_buffer = X[start].astype('float32')
    generated_text = S[start].copy()

    for _ in range(N_GENERATED_WORDS):
        # verbose=0 avoids one progress bar per predicted word
        yhat = lstm_nn.predict(input_buffer[None,:], verbose=0)[0]

        # Sample the next word from the predicted distribution. The softmax
        # output is float32 and may not sum to 1 within the tolerance that
        # np.random.choice enforces, so it is renormalised in float64 first.
        probabilities = np.asarray(yhat, dtype='float64')
        probabilities /= probabilities.sum()
        ix = np.random.choice(vocab_size, p=probabilities)

        generated_text.append(index_word[ix])

        # Slide the window: drop the oldest word, append the sampled one (one-hot).
        next_row = np.zeros((1,vocab_size), dtype=input_buffer.dtype)
        next_row[0,ix] = 1
        input_buffer = np.concatenate([input_buffer[1:,:], next_row])

    generated.append(' '.join(generated_text))

<a id="5.2"></a>

<h3 style="background-color:orange;font-family:newtimeroman;font-size:200%;text-align:center;border-radius: 15px 50px;">Generated Synopses</h3>


In [None]:
# Print every generated synopsis under a numbered heading, followed by blank lines.
for example_number, synopsis_text in enumerate(generated):
    print('Generated Synopsis Example #', example_number)
    print(synopsis_text, '\n\n')