In [None]:

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
#Modules for EDA
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
plt.style.use('fivethirtyeight')

#Modules for ML(Recommendation)
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

In [None]:
# Load the full drama table from the Kaggle input directory and show its (rows, columns) shape.
df = pd.read_csv('../input/top-100-korean-drama-mydramalist/top100_kdrama.csv')
df.shape

In [None]:
df.info()

In [None]:
# Read only the 'Synopsis' column from the same CSV so the text is kept in its own frame.
synopsis = pd.read_csv('../input/top-100-korean-drama-mydramalist/top100_kdrama.csv',usecols=['Synopsis'])
synopsis.head()

In [None]:
df.head()

# **Name as separate DataFrame**

In [None]:
# Keep the titles in their own frame. .copy() makes it independent of `df`,
# so the later in-place title edit is applied to this frame itself and does
# not trigger pandas' chained-assignment / SettingWithCopy handling.
kdrama_names = df[['Name']].copy()
kdrama_names.head()

# **Features used for Recommendation**

In [None]:
# Columns that feed the similarity search.
cols_for_recommend = ['Year of release', 'Number of Episode', 'Network', 'Duration', 'Content Rating', 'Rating']
# .copy() gives an independent frame: the cells below assign into its columns,
# which on a bare column selection raises SettingWithCopyWarning.
df = df[cols_for_recommend].copy()
df.head()

# **Feature Engineering**

# **Removing Duplicate values in Network column**

In [None]:
# Reduce each comma-separated network list to a single network: strip the
# spaces, split on commas and keep the first network listed.
# Taking the first listed entry is deterministic; picking element 0 of a set
# is not, because string set ordering can differ between interpreter sessions.
networks = (
    df['Network']
    .str.replace(' ', '', regex=False)
    .str.split(',')
    .str[0]
    .tolist()
)
networks[:5]

In [None]:
# Overwrite the column with the cleaned single-network values, then list the distinct networks.
df['Network'] = networks
df['Network'].unique()

# **Network and Total KDramas**

In [None]:
# Number of dramas per network, computed once and reused for the chart and the table.
network_counts = df['Network'].value_counts()

fig, ax = plt.subplots(figsize=(7, 7))
network_counts.plot(kind='barh', ax=ax)

# Flip the axis so the most frequent network is drawn at the top.
ax.invert_yaxis()
ax.set_title("Networks of Kdramas.")
ax.set_xlabel('Frequency')
plt.show()

network_counts

# **Grouping the OCN and Viki networks into "Others" because of their low counts**

In [None]:
# Fold the rarely occurring networks into a single 'Others' bucket.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df['Network'] mutates an intermediate object, which pandas may treat as a
# temporary copy and leave `df` unchanged.
df['Network'] = df['Network'].replace(['OCN', 'Viki'], 'Others')

In [None]:
# Number of dramas per network, computed once and reused for the chart and the table.
network_counts = df['Network'].value_counts()

fig, ax = plt.subplots(figsize=(7, 7))
network_counts.plot(kind='barh', ax=ax)

# Flip the axis so the most frequent network is drawn at the top.
ax.invert_yaxis()
ax.set_title("Networks of Kdramas.")
ax.set_xlabel('Frequency')
ax.set_ylabel('Network')
plt.show()

network_counts

# **Duration in Minutes**

In [None]:
# Turn the duration text into total minutes.
# Assumes values look like "52 min." or "1 hr. 10 min." — TODO confirm against the CSV.
# Hours and minutes are extracted separately: merely deleting the letters would
# glue the two numbers together ("1 hr. 10 min." -> "110" instead of 70).
duration_hours = pd.to_numeric(df['Duration'].str.extract(r'(?i)(\d+)\s*hr', expand=False))
duration_minutes = pd.to_numeric(df['Duration'].str.extract(r'(?i)(\d+)\s*min', expand=False))
total_minutes = duration_hours.fillna(0) * 60 + duration_minutes.fillna(0)

# Only rows where an hour or minute part was recognised get the parsed value.
recognised = duration_hours.notna() | duration_minutes.notna()
parsed_text = total_minutes.where(recognised).map(lambda minutes: str(int(minutes)), na_action='ignore')

# Anything else falls back to plain letter stripping (raw string: '\D' is not a valid str escape).
fallback_text = df['Duration'].str.replace(r'[A-Za-z]\D+', '', regex=True)

# Stored as text; the numeric conversion happens in the following cell.
df['Duration'] = parsed_text.where(recognised, fallback_text)
df['Duration'].head()

In [None]:
# Remove any remaining spaces from the duration text, then convert the column to a numeric dtype.
duration_text = df['Duration'].str.replace(' ', '', regex=True)
df['Duration'] = pd.to_numeric(duration_text)
df['Duration'].head()

In [None]:
# Histogram of the Duration column.
fig, ax = plt.subplots(figsize=(7, 7))
sns.histplot(data=df['Duration'], ax=ax)
ax.set_title('Duration in minutes.')
plt.show()

# **Content Rating**

In [None]:
# Share of dramas per content rating, labelled with percentages.
content_rating_counts = df['Content Rating'].value_counts()

fig, ax = plt.subplots(figsize=(7, 7))
content_rating_counts.plot(kind='pie', autopct='%.2f%%', ax=ax)
ax.set_title("Content Rating")
plt.show()

In [None]:
df['Content Rating'].value_counts()

# **Rating and Content Rating**

In [None]:
# Rating histogram, with one colour per content rating.
rating_by_content = df[['Rating', 'Content Rating']]
sns.histplot(data=rating_by_content, x='Rating', hue='Content Rating')
plt.show()

In [None]:
df[['Rating']].describe()

# **One Hot Encoding**

In [None]:
df.head()

In [None]:
# Categorical columns to turn into 0/1 indicator columns.
cols_to_encode = ['Network','Content Rating']
# Keep an indicator for every level (drop_first=False). These columns feed a
# distance-based neighbour search: if one level is dropped it becomes the
# all-zeros vector and sits at distance 1 from every other level, while any two
# kept levels are at distance 2 — an arbitrary bias towards the dropped level.
dummies = pd.get_dummies(df[cols_to_encode], drop_first=False)
dummies.head()

In [None]:
# Remove the raw categorical columns listed in cols_to_encode from df.
df = df.drop(columns=cols_to_encode)
df.head()

# **Feature Scaling**

In [None]:
# Fit a min-max scaler on every column of df and keep the transformed values
# (fit_transform returns an array, which is written back into df further below).
scale = MinMaxScaler()
scalled = scale.fit_transform(df)

In [None]:
# Copy each column of the scaled array back into the matching DataFrame column.
for position, column in enumerate(df.columns):
    df[column] = scalled[:, position]

In [None]:
df.head()

In [None]:
# Place the columns of df and the one-hot encoded columns side by side (column-wise concat).
new_df = pd.concat([df, dummies],axis=1)
new_df.shape

In [None]:
new_df.head()

In [None]:
# Rename the title stored at row label 23 to 'kingdom'
# (presumably to keep titles unique for the name-based lookups below — TODO confirm).
# A single .loc[row, column] call is used because the chained form
# kdrama_names['Name'].loc[23] = ... may write to a temporary copy.
kdrama_names.loc[23, 'Name'] = 'kingdom'

In [None]:
# Label the feature rows and the synopsis rows with the drama titles so both can be looked up by name.
drama_titles = list(kdrama_names['Name'])
new_df.index = drama_titles
synopsis.index = drama_titles

In [None]:
new_df.head()

In [None]:
def getRecommendation_dramas_for(drama_name,no_of_recommend=5,get_similarity_rate=False):
    """Recommend the dramas whose feature rows in `new_df` are nearest to a given drama.

    Neighbours are found with a Manhattan-distance nearest-neighbour search over
    `new_df` (one row per drama, indexed by title).

    Parameters
    ----------
    drama_name : pandas.Series or str
        Either a one-element Series holding the title (as produced by
        ``kdrama_names.loc[i]``) or the title itself.
    no_of_recommend : int, default 5
        Number of dramas to recommend. Fewer are returned when `new_df` does
        not contain that many other rows.
    get_similarity_rate : bool, default False
        When False, return only the list of recommended titles. When True,
        return a DataFrame with the columns 'Recommended Drama', 'Similarity'
        (cosine similarity to the query row) and 'Synopsis'.

    Returns
    -------
    list or pandas.DataFrame
        Titles ordered by increasing Manhattan distance, or the DataFrame
        described above ordered from most to least similar.

    Raises
    ------
    KeyError
        If the title is not present in ``new_df.index``.
    """
    # Accept both the one-element Series and a bare title.
    if isinstance(drama_name, pd.Series):
        query_title = drama_name.iloc[0]
    else:
        query_title = drama_name

    # Row position of the query inside new_df (first match if a title repeats).
    matches = np.flatnonzero(new_df.index == query_title)
    if matches.size == 0:
        raise KeyError(query_title)
    query_position = int(matches[0])
    query_vector = new_df.iloc[[query_position]]

    # One extra neighbour is requested because the query row is part of the
    # fitted data; the request is capped at the number of available rows.
    n_neighbors = min(no_of_recommend + 1, len(new_df))
    kn = NearestNeighbors(n_neighbors=n_neighbors,metric='manhattan')
    kn.fit(new_df)

    _, indices = kn.kneighbors(query_vector)

    # Remove the query by position instead of assuming it is the first hit:
    # a drama with identical features can tie with it at distance zero.
    neighbor_positions = [int(pos) for pos in indices.flatten() if pos != query_position]
    neighbor_positions = neighbor_positions[:no_of_recommend]

    print(f'Similar K-Dramas for "{query_title}":')
    nearest_dramas = [new_df.index[pos] for pos in neighbor_positions]
    if not get_similarity_rate:
        return nearest_dramas

    if neighbor_positions:
        sim_rates = cosine_similarity(query_vector, new_df.iloc[neighbor_positions]).flatten()
        # `synopsis` carries the same title index as new_df, so the same row
        # positions select the matching texts (first column holds the synopsis).
        synopsis_ = synopsis.iloc[neighbor_positions, 0].tolist()
    else:
        sim_rates = []
        synopsis_ = []

    recommended_dramas = pd.DataFrame({'Recommended Drama':nearest_dramas,'Similarity':sim_rates,'Synopsis':synopsis_})
    # sort_values returns a new frame, so the result has to be kept.
    # Most similar first; the stable sort keeps distance order among ties, and
    # the index is reset so labels 0..n-1 follow the displayed order.
    recommended_dramas = (
        recommended_dramas
        .sort_values(by='Similarity', ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )
    return recommended_dramas

# **Predicting Drama Recommendation**

In [None]:
rd1 = kdrama_names.loc[0]
rd1

In [None]:
getRecommendation_dramas_for(rd1,no_of_recommend=5)

In [None]:
rd2 = kdrama_names.loc[10]
rd2

In [None]:
getRecommendation_dramas_for(rd2,get_similarity_rate=True)

In [None]:
rd3 = kdrama_names.loc[1]
rd3

In [None]:
getRecommendation_dramas_for(rd3,get_similarity_rate=True)

In [None]:
def print_similiar_drama_Synopsis(recommended_df, max_dramas=5):
    """Print each recommended drama's title followed by its synopsis.

    Parameters
    ----------
    recommended_df : pandas.DataFrame
        Frame with the columns 'Recommended Drama' and 'Synopsis'.
    max_dramas : int, default 5
        Upper bound on the number of rows printed. Fewer are printed when the
        frame has fewer rows, instead of failing on a missing row.

    Returns
    -------
    None
        The text is written to standard output.
    """
    # Rows are taken by position, so the frame's index labels do not matter.
    shown = recommended_df.head(max_dramas)
    for title, synopsis_text in zip(shown['Recommended Drama'], shown['Synopsis']):
        print(title)
        print(synopsis_text)
        print('\n')

In [None]:
rd4 = kdrama_names.loc[8]
rd4

In [None]:
rdf4 = getRecommendation_dramas_for(rd4,no_of_recommend=10,get_similarity_rate=True)
print_similiar_drama_Synopsis(rdf4)

In [None]:
rd5 = kdrama_names.loc[99]
rd5

In [None]:
getRecommendation_dramas_for(rd5,no_of_recommend=5,get_similarity_rate=True)

### **I have also built an Android App Recommendation and an Amazon Book Recommendation notebook**
### **If you're interested, click on the links below:**
1. **<a href="https://www.kaggle.com/nandalald/android-app-recommendation/">Android App Recommendation</a>**
2. **<a href="https://www.kaggle.com/nandalald/amazon-books-eda-recommendation">Amazon Book Recommendation</a>**