# This notebook cleans the dataframe containing general information about video games

In [21]:
#Import any required dependencies 
import pandas as pd 

In [22]:
#Read the raw Metacritic game-info CSV into a dataframe and preview its first five rows
raw_csv_path = "../data/metacritic_game_info.csv"
info_df = pd.read_csv(raw_csv_path)
info_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player
3,3,SoulCalibur,1999,Namco,Action;Fighting;3D,Dreamcast,98,8.6,1-2
4,4,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,Xbox360,98,7.9,1 Player


### We have a lot of irrelevant columns in this dataframe. We only really need the "Year", "Platform", "Metascore", and "Avg_Userscore" columns.

In [23]:
#Drop all unnecessary columns, then preview the first five rows of what is left
unused_columns = ['Unnamed: 0', 'Publisher', 'Genre', 'No_Players', 'Title']
step_one = info_df.drop(columns=unused_columns)
step_one.head()

Unnamed: 0,Year,Platform,Metascore,Avg_Userscore
0,1998,Nintendo64,99,9.1
1,2000,PlayStation,98,7.4
2,2008,PlayStation3,98,7.5
3,1999,Dreamcast,98,8.6
4,2008,Xbox360,98,7.9


### A problem we noticed in "exploration.ipynb" is that all columns in this dataset are stored as "object" values. The "Year", "Metascore" and "Avg_Userscore" columns should be int/float values. This is the next problem we must resolve.

### There must be a reason those columns are stored as "object". Let's take a look at the unique values in the columns to see if there are any inconsistencies.

In [24]:
#List the distinct values held in the "Year" column so any non-numeric entries stand out
year_column = step_one["Year"]
years_unique = year_column.unique()
years_unique

array(['1998', '2000', '2008', '1999', '2007', '2010', '2014', '2013',
       '2017', '2001', '2002', '2004', '2015', '1997', '2009', '2005',
       '2011', '2006', '2003', '2018', '1996', '2012', '2016',
       'not specified', '1995'], dtype=object)

We have rows with the label "not specified" in the "Year" column. This is not a numerical year, so these rows are not relevant to the study and can be removed.

In [25]:
#Find the index labels of the rows whose year is "not specified" and drop those rows
year_is_unspecified = step_one["Year"] == "not specified"
unspecified_year_labels = step_one.loc[year_is_unspecified].index
step_two = step_one.drop(index=unspecified_year_labels)

In [26]:
#Every remaining "Year" value is a numeric string, so the column can be cast to float64
step_two["Year"] = step_two["Year"].astype("float64")

In [27]:
#Run the same inspection on the "Metascore" column
metascore_unique = step_two["Metascore"].unique()

#No non-numeric values were found in this column, so no rows need to be dropped;
#cast the column straight to float64
step_two["Metascore"] = step_two["Metascore"].astype("float64")

In [36]:
#Repeat for the avg_userscore column
userscore_unique = step_two["Avg_Userscore"].unique()

#The column contains non-numeric placeholders ("not specified" and "tbd").
#Instead of matching each placeholder by its exact spelling, coerce the column to
#numbers: any value that cannot be parsed becomes NaN, so a differently spelled or
#newly introduced placeholder can no longer make the float conversion raise.
userscore_numeric = pd.to_numeric(step_two["Avg_Userscore"], errors="coerce")
has_userscore = userscore_numeric.notna()

#Keep only the rows with a usable score and store the column as float64.
#NOTE: rows whose score is missing or unparseable are removed without a message;
#compare len(step_two) before and after this cell if the number of dropped rows matters.
step_two = (
    step_two
    .loc[has_userscore]
    .assign(Avg_Userscore=userscore_numeric.loc[has_userscore].astype(float))
)

### We can now remove all the years that are not needed for this project

In [38]:
#Keep only the rows released in the years covered by the project (2014 to 2017)
study_years = [2014,2015,2016,2017]
released_in_study_years = step_two['Year'].isin(study_years)
step_three = step_two.loc[released_in_study_years, :]
step_three

Unnamed: 0,Year,Platform,Metascore,Avg_Userscore
7,2014.0,XboxOne,97.0,7.8
11,2017.0,Switch,97.0,8.5
14,2014.0,PlayStation4,97.0,8.3
17,2017.0,Switch,97.0,8.9
21,2015.0,PC,96.0,7.7
...,...,...,...,...
4987,2017.0,XboxOne,77.0,7.1
4988,2016.0,XboxOne,77.0,7.6
4989,2016.0,3DS,77.0,7.8
4993,2017.0,XboxOne,77.0,6.5


### We can now keep only the rows where the platform is one of "XboxOne", "PlayStation4", "WiiU", "PC" or "PlayStationVita"

In [39]:
#Keep only the rows whose "Platform" is one of the platforms relevant to the project
study_platforms = ["XboxOne", "PlayStation4", "WiiU", "PC", "PlayStationVita"]
on_study_platform = step_three['Platform'].isin(study_platforms)
step_four = step_three.loc[on_study_platform, :]
step_four

Unnamed: 0,Year,Platform,Metascore,Avg_Userscore
7,2014.0,XboxOne,97.0,7.8
14,2014.0,PlayStation4,97.0,8.3
21,2015.0,PC,96.0,7.7
22,2017.0,WiiU,96.0,8.2
55,2014.0,PlayStation4,95.0,9.1
...,...,...,...,...
4984,2017.0,XboxOne,77.0,5.8
4987,2017.0,XboxOne,77.0,7.1
4988,2016.0,XboxOne,77.0,7.6
4993,2017.0,XboxOne,77.0,6.5


### Finally, check for any NaN values in the rows and, if any are present, drop those rows from the dataframe entirely. Once this has been done, reset the index values and export the data to the "cleaned_data" folder.

In [40]:
#Build a boolean mask of missing cells and report whether any cell in the dataframe is NaN
missing_cells = step_four.isna()
missing_cells.to_numpy().any()

False

In [41]:
#No NaN values were found, so renumber the rows from 0 and discard the old index labels
step_five = (
    step_four
    .reset_index(drop=True)
)
step_five

Unnamed: 0,Year,Platform,Metascore,Avg_Userscore
0,2014.0,XboxOne,97.0,7.8
1,2014.0,PlayStation4,97.0,8.3
2,2015.0,PC,96.0,7.7
3,2017.0,WiiU,96.0,8.2
4,2014.0,PlayStation4,95.0,9.1
...,...,...,...,...
1032,2017.0,XboxOne,77.0,5.8
1033,2017.0,XboxOne,77.0,7.1
1034,2016.0,XboxOne,77.0,7.6
1035,2017.0,XboxOne,77.0,6.5


In [42]:
#Write the cleaned dataframe to the "cleaned_data" folder as a csv, leaving out the index column
cleaned_csv_path = "../cleaned_data/info.csv"
step_five.to_csv(cleaned_csv_path, index=False)