# Pre-processing Data

Run the cell below to import necessary modules. 

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive

In [None]:
prefix = '/content/drive'
from google.colab import drive
drive.mount(prefix, force_remount=True)

Mounted at /content/drive


In [None]:
df_ratings = pd.read_csv('/content/drive/My Drive/movie/ratings.csv') # Movie datasets
movies_list = pd.read_csv('/content/drive/My Drive/movie/movies.csv')
df_links = pd.read_csv('/content/drive/My Drive/movie/links.csv')
user_watched = pd.read_csv('/content/drive/My Drive/movie/ratings.csv')

In [None]:
movie_ratings = df_ratings.groupby('movieId')['rating'].mean().reset_index()
rating_movie_merge = movie_ratings.merge(movies_list, left_on = 'movieId', right_on = 'movieId')
cleaned_movie = rating_movie_merge.merge(df_links, left_on = 'movieId', right_on = 'movieId')
cleaned_movie = cleaned_movie.drop(columns = ['tmdbId', 'genres'])
cleaned_movie = cleaned_movie.rename(columns = {'rating': 'average_rating', 'movieId': 'id'})
cleaned_movie

Unnamed: 0,id,average_rating,title,imdbId
0,1,3.872470,Toy Story (1995),114709
1,2,3.401869,Jumanji (1995),113497
2,3,3.161017,Grumpier Old Men (1995),113228
3,4,2.384615,Waiting to Exhale (1995),114885
4,5,3.267857,Father of the Bride Part II (1995),113041
...,...,...,...,...
9061,161944,5.000000,The Last Brickmaker in America (2001),255313
9062,162376,4.500000,Stranger Things,4574334
9063,162542,5.000000,Rustom (2016),5165344
9064,162672,3.000000,Mohenjo Daro (2016),3859980


In [None]:
genre_df = movies_list.drop(columns = ['title'])
genre_df['genres'] = genre_df['genres'].apply(lambda x : x.split('|'))
genre_df = genre_df.genres.apply(pd.Series).merge(genre_df, right_index = True, left_index = True).drop(["genres"], axis = 1).melt(id_vars = ['movieId'], value_name = "genres").drop('variable', axis = 1).dropna()
genre_df = genre_df.sort_values(by = 'movieId')
genre_df = genre_df.rename(columns = {'movieId': 'id'})
genre_df['unique_id'] = genre_df['id'].astype(str) + "" + genre_df['genres']

switcher_movie = {
    'Western': 'Action',
    'Film-Noir': 'Historical',
    'War': 'Action',
    'Animation': '(no genres listed)',
    'IMAX': '(no genres listed)',
    'Crime': 'Mystery',
    'Documentary': 'Historical'
}

def helper_movie(x):
  if (switcher_movie.get(x) == None):
    return x
  else:
    return switcher_movie.get(x)

genre_df['genres'] = genre_df['genres'].apply(lambda x: helper_movie(x))
genre_df = genre_df.drop_duplicates()
genre_df

Unnamed: 0,id,genres,unique_id
0,1,Adventure,1Adventure
18250,1,Children,1Children
27375,1,Comedy,1Comedy
36500,1,Fantasy,1Fantasy
9125,1,(no genres listed),1Animation
...,...,...,...
36496,163056,Sci-Fi,163056Sci-Fi
18246,163056,Adventure,163056Adventure
9122,163949,Historical,163949Documentary
9123,164977,Comedy,164977Comedy


In [None]:
users_df = pd.read_csv('/content/drive/My Drive/movie/users.csv', sep = '\t')
users_df = users_df[['user_id', 'gender', 'age', 'occ_desc', 'age_desc', 'zipcode']]
users_df = users_df.rename(columns = {'occ_desc': 'occupation', 'age_desc': 'age', 'user_id': 'username'})
users_df['username'] = 'user' + users_df['username'].astype(str)
users_df['password'] = 'youaresosmartifyoufiguredoutthispassword'
users_df

Unnamed: 0,username,gender,age,occupation,age.1,zipcode,password
0,user1,F,1,K-12 student,Under 18,48067,youaresosmartifyoufiguredoutthispassword
1,user2,M,56,self-employed,56+,70072,youaresosmartifyoufiguredoutthispassword
2,user3,M,25,scientist,25-34,55117,youaresosmartifyoufiguredoutthispassword
3,user4,M,45,executive/managerial,45-49,02460,youaresosmartifyoufiguredoutthispassword
4,user5,M,25,writer,25-34,55455,youaresosmartifyoufiguredoutthispassword
...,...,...,...,...,...,...,...
6035,user6036,F,25,scientist,25-34,32603,youaresosmartifyoufiguredoutthispassword
6036,user6037,F,45,academic/educator,45-49,76006,youaresosmartifyoufiguredoutthispassword
6037,user6038,F,56,academic/educator,56+,14706,youaresosmartifyoufiguredoutthispassword
6038,user6039,F,45,other or not specified,45-49,01060,youaresosmartifyoufiguredoutthispassword


In [None]:
user_watched_df = user_watched[['userId', 'movieId']]
user_watched_df = user_watched_df.rename(columns = {'userId': 'username', 'movieId': 'id'})
user_watched_df['unique_id'] = user_watched_df['username'].astype(str) + "%" + user_watched_df['id'].astype(str)
user_watched_df

Unnamed: 0,username,id,unique_id
0,1,31,1%31
1,1,1029,1%1029
2,1,1061,1%1061
3,1,1129,1%1129
4,1,1172,1%1172
...,...,...,...
99999,671,6268,671%6268
100000,671,6269,671%6269
100001,671,6365,671%6365
100002,671,6385,671%6385


In [None]:
anime_description = pd.read_csv('/content/drive/My Drive/archive/anime.csv')
anime_list = pd.read_csv('/content/drive/My Drive/archive/anime_with_synopsis.csv')

In [None]:
description_dropped = anime_description.drop(columns = ['Genres', 'Score', 'Name'])
anime_df = anime_list.merge(description_dropped, left_on = 'MAL_ID', right_on = 'MAL_ID')
anime_df = anime_df[['MAL_ID', 'Name', 'Score', 'sypnopsis', 'Episodes', 'Type']]
anime_df = anime_df.rename(columns = {'MAL_ID': 'id', 'Name': 'title', 'Score': 'avg_rating', 'Episodes': 'num_episode'})
anime_df

Unnamed: 0,id,title,avg_rating,sypnopsis,num_episode,Type
0,1,Cowboy Bebop,8.78,"In the year 2071, humanity has colonized sever...",26,TV
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"other day, another bounty—such is the life of ...",1,Movie
2,6,Trigun,8.24,"Vash the Stampede is the man with a $$60,000,0...",26,TV
3,7,Witch Hunter Robin,7.27,ches are individuals with special powers like ...,26,TV
4,8,Bouken Ou Beet,6.98,It is the dark century and the people are suff...,52,TV
...,...,...,...,...,...,...
16209,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,No synopsis information has been added to this...,Unknown,ONA
16210,48483,Mieruko-chan,Unknown,ko is a typical high school student whose life...,Unknown,TV
16211,48488,Higurashi no Naku Koro ni Sotsu,Unknown,Sequel to Higurashi no Naku Koro ni Gou .,Unknown,TV
16212,48491,Yama no Susume: Next Summit,Unknown,New Yama no Susume anime.,Unknown,TV


In [None]:
anime_genre = anime_list[['MAL_ID', 'Genres']]
anime_genre['Genres'] = anime_genre['Genres'].apply(lambda x : x.split(', '))
anime_genre = anime_genre.Genres.apply(pd.Series).merge(anime_genre, right_index = True, left_index = True).drop(["Genres"], axis = 1).melt(id_vars = ['MAL_ID'], value_name = "Genres").drop('variable', axis = 1).dropna()
anime_genre = anime_genre.sort_values(by = 'MAL_ID')
anime_genre = anime_genre.rename(columns = {'MAL_ID': 'id'})
anime_genre

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_genre['Genres'] = anime_genre['Genres'].apply(lambda x : x.split(', '))


Unnamed: 0,id,Genres
0,1,Action
32428,1,Comedy
48642,1,Drama
64856,1,Sci-Fi
81070,1,Space
...,...,...
32426,48491,Slice of Life
48640,48491,Comedy
16212,48491,Adventure
16213,48492,Action


In [None]:
switcher = {
    'Space':
      'Sci-Fi',
    'Shounen':
      'Action',
    'Police':
      'Mystery',
    'Magic':
      'Fantasy',
    'Supernatural':
      'Fantasy',
    'Sports':
      'Action',
    'Josei':
      'Romance',
    'Slice of Life':
      'Romance',
    'Cars':
      'Action',
    'Seinen':
      'Children',
    'Psychological':
      'Thriller',
    'Super Power':
      'Action',
    'Martial Arts':
      'Action',
    'School':
      'Children',
    'Ecchi':
      'Romance',
    'Vampire':
      'Fantasy',
    'Military':
      'Action',
    'Dementia':
      'Thriller',
    'Mecha':
      'Action',
    'Demons':
      'Fantasy',
    'Samurai':
      'Historical',
    'Game':
      'Fantasy',
    'Shoujo':
      'Drama',
    'Harem':
      'Romance',
    'Music':
      'Musical',
    'Shoujo Ai':
      'Drama',
    'Shounen Ai':
      'Action',
    'Kids':
      'Children',
    'Parody':
      'Comedy',
    'Yaoi':
      'Romance',
    'Unknown':
      '(no genres listed)'
}

def helper(x):
  if (switcher.get(x) == None):
    return x
  else:
    return switcher.get(x)

In [None]:
anime_genre['Genres'] = anime_genre['Genres'].apply(lambda x: helper(x))
anime_genre = anime_genre.drop_duplicates()
anime_genre = anime_genre.rename(columns = {'Genres': 'genre'}).reset_index(drop = True)
anime_genre['unique_id'] = anime_genre['id'].astype(str) + "%" + anime_genre['genre']

In [None]:
genre_df.genres.unique()

array(['Adventure', 'Children', 'Comedy', 'Fantasy', '(no genres listed)',
       'Romance', 'Drama', 'Thriller', 'Mystery', 'Action', 'Horror',
       'Sci-Fi', 'Historical', 'Musical'], dtype=object)

In [None]:
needed = [genre_df, anime_genre, anime_df, user_watched_df, users_df, cleaned_movie]
names = ['genre_df', 'anime_genre', 'anime_df', 'user_watched_df', 'users_df', 'cleaned_movie']
filepath = '/content/drive/My Drive/550_csvs/'
for i in range(6):
  df = needed[i]
  df.to_csv(filepath + names[i] + '.csv')