In [2]:
from pandas import read_csv

DATA_PATH = "./vgsales.csv"

def read_csv_data(data_path):
    if data_path is None or len(data_path) == 0:
        print("La ruta del archivo está vacía")
        return None
    
    try:
        df = read_csv(data_path)
        return df
    except FileNotFoundError:
        print(f"No se pudo encontrar el archivo en la ruta: {data_path}")
        return None
    except Exception as e:
        print(f"Error al leer el archivo: {e}")
        return None

def clean_data(df):
    if df is None or df.empty:
        print("El dataframe está vacío o es None")
        return None
    
    df.columns = df.columns.str.lower().str.replace(" ", "_")
    df = df.dropna()
    df = df.drop_duplicates()
    return df

In [None]:
print(f"Intentando leer archivo desde: {DATA_PATH}")
df = read_csv_data(DATA_PATH)

if df is None:
    print("No se pudo leer el archivo")
    exit()
df = clean_data(df)
if df is None:
    print("Error al limpiar los datos")
    exit()

In [8]:
# mostrar las primera 10 filas del dataframe
df.head(10)

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


In [5]:
# mostrar las ultima 5 filas del dataframe
df.tail(5)

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.0,0.0,0.0,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.0,0.0,0.0,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.0,0.0,0.0,0.0,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.0,0.01,0.0,0.0,0.01
16597,16600,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.0,0.0,0.0,0.01


In [6]:
# utilizar metodo df.info()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16291 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rank          16291 non-null  int64  
 1   name          16291 non-null  object 
 2   platform      16291 non-null  object 
 3   year          16291 non-null  float64
 4   genre         16291 non-null  object 
 5   publisher     16291 non-null  object 
 6   na_sales      16291 non-null  float64
 7   eu_sales      16291 non-null  float64
 8   jp_sales      16291 non-null  float64
 9   other_sales   16291 non-null  float64
 10  global_sales  16291 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.5+ MB


In [7]:
# utilizar metodo df.describe()
df.describe()

Unnamed: 0,rank,year,na_sales,eu_sales,jp_sales,other_sales,global_sales
count,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0
mean,8290.190228,2006.405561,0.265647,0.147731,0.078833,0.048426,0.54091
std,4792.65445,5.832412,0.822432,0.509303,0.311879,0.190083,1.567345
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4132.5,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8292.0,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12439.5,2010.0,0.24,0.11,0.04,0.04,0.48
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [9]:
# mostrar los tipos de datos de cada columna
df.dtypes

rank              int64
name             object
platform         object
year            float64
genre            object
publisher        object
na_sales        float64
eu_sales        float64
jp_sales        float64
other_sales     float64
global_sales    float64
dtype: object

In [11]:
# contar los valores únicos en la columna Genre utilizando el método value_counts()
df['genre'].value_counts()

genre
Action          3251
Sports          2304
Misc            1686
Role-Playing    1470
Shooter         1282
Adventure       1274
Racing          1225
Platform         875
Simulation       848
Fighting         836
Strategy         670
Puzzle           570
Name: count, dtype: int64

In [12]:
# mostrar los valores unicos de la columna 'Platform'
df['platform'].unique()

array(['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC',
       'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16',
       '3DO', 'GG', 'PCFX'], dtype=object)

# Filtrado

In [24]:
# Filtra el DataFrame para mostrar solo las filas donde las ventas en América del Norte (NA_Sales) sean mayores a 1 millón.
df[df['na_sales'] > 1]

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
1905,1907,Joust,2600,1982.0,Platform,Atari,1.01,0.06,0.00,0.01,1.08
1917,1919,NCAA Football 14,X360,2013.0,Sports,Electronic Arts,1.01,0.00,0.00,0.06,1.07
1953,1955,NFL Blitz,N64,1998.0,Sports,Midway Games,1.02,0.04,0.00,0.01,1.06
1954,1956,NFL Quarterback Club 98,N64,1997.0,Sports,Acclaim Entertainment,1.01,0.05,0.00,0.01,1.06


In [25]:
# Filtra el DataFrame para mostrar solo las filas donde las ventas en Japón (JP_Sales) sean menores a 0.1 millón.
df[df['jp_sales'] < 1]

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31
15,16,Kinect Adventures!,X360,2010.0,Misc,Microsoft Game Studios,14.97,4.94,0.24,1.67,21.82
16,17,Grand Theft Auto V,PS3,2013.0,Action,Take-Two Interactive,7.01,9.27,0.97,4.14,21.40
17,18,Grand Theft Auto: San Andreas,PS2,2004.0,Action,Take-Two Interactive,9.43,0.40,0.41,10.57,20.81
23,24,Grand Theft Auto V,X360,2013.0,Action,Take-Two Interactive,9.63,5.31,0.06,1.38,16.38
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [26]:
# Utilizando el método query(), filtra el DataFrame para mostrar las filas donde el género sea Action y las ventas globales (Global_Sales) sean mayores a 2 millones.
df.query('genre == "Action" and global_sales > 2')

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
16,17,Grand Theft Auto V,PS3,2013.0,Action,Take-Two Interactive,7.01,9.27,0.97,4.14,21.40
17,18,Grand Theft Auto: San Andreas,PS2,2004.0,Action,Take-Two Interactive,9.43,0.40,0.41,10.57,20.81
23,24,Grand Theft Auto V,X360,2013.0,Action,Take-Two Interactive,9.63,5.31,0.06,1.38,16.38
24,25,Grand Theft Auto: Vice City,PS2,2002.0,Action,Take-Two Interactive,8.41,5.49,0.47,1.78,16.15
38,39,Grand Theft Auto III,PS2,2001.0,Action,Take-Two Interactive,6.99,4.51,0.30,1.30,13.10
...,...,...,...,...,...,...,...,...,...,...,...
815,817,Tom Clancy's Splinter Cell: Conviction,X360,2010.0,Action,Ubisoft,1.20,0.63,0.04,0.19,2.06
817,819,The Incredibles,GBA,2004.0,Action,THQ,1.15,0.77,0.04,0.10,2.06
833,835,Peter Jackson's King Kong: The Official Game o...,PS2,2005.0,Action,Ubisoft,0.71,1.02,0.00,0.31,2.04
835,837,Need for Speed: The Run,PS3,2011.0,Action,Electronic Arts,0.58,1.06,0.03,0.37,2.04


# Slicing de datos

In [27]:
# Selecciona y muestra solo las columnas Name y Global_Sales del DataFrame.
df[['name', 'global_sales']]

Unnamed: 0,name,global_sales
0,Wii Sports,82.74
1,Super Mario Bros.,40.24
2,Mario Kart Wii,35.82
3,Wii Sports Resort,33.00
4,Pokemon Red/Pokemon Blue,31.37
...,...,...
16593,Woody Woodpecker in Crazy Castle 5,0.01
16594,Men in Black II: Alien Escape,0.01
16595,SCORE International Baja 1000: The Official Game,0.01
16596,Know How 2,0.01


In [28]:
# Utilizando loc[], selecciona y muestra las filas de la 5 a la 10 (inclusive) y las columnas Name y Genre.
df.loc[5:10, ['name', 'genre']]

Unnamed: 0,name,genre
5,Tetris,Puzzle
6,New Super Mario Bros.,Platform
7,Wii Play,Misc
8,New Super Mario Bros. Wii,Platform
9,Duck Hunt,Shooter
10,Nintendogs,Simulation


In [29]:
# Utilizando iloc[], selecciona y muestra las primeras 5 filas y las primeras 3 columnas del DataFrame.
df.iloc[:5, :3]

Unnamed: 0,rank,name,platform
0,1,Wii Sports,Wii
1,2,Super Mario Bros.,NES
2,3,Mario Kart Wii,Wii
3,4,Wii Sports Resort,Wii
4,5,Pokemon Red/Pokemon Blue,GB
