# This file will look at cleaning the dataframe containing the reviews information 

In [11]:
#Import any required dependencies 
import pandas as pd 
import numpy as np 

In [12]:
#Load the raw Metacritic user comments csv into a dataframe and preview the first rows
reviews_csv_path = "../data/metacritic_game_user_comments.csv"
review_df = pd.read_csv(reviews_csv_path)
review_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus
1,1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin
2,2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody
3,3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman
4,4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA


### The original file is almost 1GB in size. It is especially important to remove unnecessary information from this dataframe, as the timeframe for this project will not allow us to analyse all the reviews for the platforms. We can start this process by first deleting irrelevant columns 

In [15]:
#Drop the columns that are not needed for the analysis:
#"Username" and "Unnamed: 0" (presumably a leftover saved index - TODO confirm)
#drop() returns a new dataframe, so review_df itself is left untouched
unwanted_columns = ["Username", "Unnamed: 0"]
step_one = review_df.drop(columns=unwanted_columns)
step_one.head()

Unnamed: 0,Title,Platform,Userscore,Comment
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it..."
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...


### The next step is to shorten the dataframe so that only the platforms that are important to us are extracted 

### Unlike the other dataframes, we will create 5 new dataframes from the original. One dataframe for each platform

In [58]:
#Split step_one into 5 dataframes, one per Platform of interest
#Each dataframe keeps every column and only the rows whose "Platform" matches exactly
platform_column = step_one["Platform"]
PC = step_one[platform_column == "PC"]
PSV = step_one[platform_column == "PlayStationVita"]
WiiU = step_one[platform_column == "WiiU"]
PS4 = step_one[platform_column == "PlayStation4"]
XB1 = step_one[platform_column == "XboxOne"]

In [89]:
#Select 50 random reviews for each platform using "df.sample()"
#Each sample is drawn from step_one rather than from the platform variable itself,
#so re-running this cell always yields the same rows in the same order
#(re-sampling an already-sampled dataframe would reshuffle its 50 rows on every re-run)
SAMPLE_SIZE = 50   #number of reviews kept per platform
RANDOM_STATE = 1   #fixed seed so the same reviews are selected on every run

def sample_platform(platform):
    """Return SAMPLE_SIZE randomly selected reviews for the given platform.

    platform: exact value to match in the "Platform" column of step_one.
    Returns a new dataframe; step_one is not modified.
    Raises ValueError (from df.sample) if the platform has fewer than
    SAMPLE_SIZE reviews.
    """
    platform_reviews = step_one.loc[step_one["Platform"] == platform, :]
    return platform_reviews.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE)

PC = sample_platform("PC")
XB1 = sample_platform("XboxOne")
PS4 = sample_platform("PlayStation4")
PSV = sample_platform("PlayStationVita")
WiiU = sample_platform("WiiU")

### random_state=1 fixes the random seed so that the same 50 reviews are selected every time the notebook is run. Without it, a different random sample would be drawn on each run, making analysis for the AI/ML model difficult and unreliable 

### We can now export this data as a csv to add to a SQL database

In [90]:
#Write each platform's sampled reviews to its own csv in the reviews folder of the cleaned data folder
#index=False keeps the dataframe index out of the exported files
csv_exports = {
    "../cleaned_data/reviews/pc.csv": PC,
    "../cleaned_data/reviews/psv.csv": PSV,
    "../cleaned_data/reviews/ps4.csv": PS4,
    "../cleaned_data/reviews/wiiu.csv": WiiU,
    "../cleaned_data/reviews/xb1.csv": XB1,
}
for csv_path, platform_reviews in csv_exports.items():
    platform_reviews.to_csv(csv_path, index=False)